llama-stack 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159)
  1. llama_stack/cli/stack/_list_deps.py +11 -7
  2. llama_stack/cli/stack/run.py +3 -25
  3. llama_stack/core/access_control/datatypes.py +78 -0
  4. llama_stack/core/configure.py +2 -2
  5. llama_stack/{distributions/meta-reference-gpu → core/connectors}/__init__.py +3 -1
  6. llama_stack/core/connectors/connectors.py +162 -0
  7. llama_stack/core/conversations/conversations.py +61 -58
  8. llama_stack/core/datatypes.py +54 -8
  9. llama_stack/core/library_client.py +60 -13
  10. llama_stack/core/prompts/prompts.py +43 -42
  11. llama_stack/core/routers/datasets.py +20 -17
  12. llama_stack/core/routers/eval_scoring.py +143 -53
  13. llama_stack/core/routers/inference.py +20 -9
  14. llama_stack/core/routers/safety.py +30 -42
  15. llama_stack/core/routers/vector_io.py +15 -7
  16. llama_stack/core/routing_tables/models.py +42 -3
  17. llama_stack/core/routing_tables/scoring_functions.py +19 -19
  18. llama_stack/core/routing_tables/shields.py +20 -17
  19. llama_stack/core/routing_tables/vector_stores.py +8 -5
  20. llama_stack/core/server/auth.py +192 -17
  21. llama_stack/core/server/fastapi_router_registry.py +40 -5
  22. llama_stack/core/server/server.py +24 -5
  23. llama_stack/core/stack.py +54 -10
  24. llama_stack/core/storage/datatypes.py +9 -0
  25. llama_stack/core/store/registry.py +1 -1
  26. llama_stack/core/utils/exec.py +2 -2
  27. llama_stack/core/utils/type_inspection.py +16 -2
  28. llama_stack/distributions/dell/config.yaml +4 -1
  29. llama_stack/distributions/dell/run-with-safety.yaml +4 -1
  30. llama_stack/distributions/nvidia/config.yaml +4 -1
  31. llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
  32. llama_stack/distributions/oci/config.yaml +4 -1
  33. llama_stack/distributions/open-benchmark/config.yaml +9 -1
  34. llama_stack/distributions/postgres-demo/config.yaml +1 -1
  35. llama_stack/distributions/starter/build.yaml +62 -0
  36. llama_stack/distributions/starter/config.yaml +22 -3
  37. llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
  38. llama_stack/distributions/starter/starter.py +13 -1
  39. llama_stack/distributions/starter-gpu/build.yaml +62 -0
  40. llama_stack/distributions/starter-gpu/config.yaml +22 -3
  41. llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
  42. llama_stack/distributions/template.py +10 -2
  43. llama_stack/distributions/watsonx/config.yaml +4 -1
  44. llama_stack/log.py +1 -0
  45. llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
  46. llama_stack/providers/inline/agents/meta_reference/agents.py +58 -61
  47. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +53 -51
  48. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +99 -22
  49. llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
  50. llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
  51. llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
  52. llama_stack/providers/inline/batches/reference/batches.py +2 -1
  53. llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
  54. llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
  55. llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
  56. llama_stack/providers/inline/post_training/torchtune/common/utils.py +5 -9
  57. llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
  58. llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
  59. llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
  60. llama_stack/providers/inline/safety/llama_guard/llama_guard.py +20 -24
  61. llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
  62. llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
  63. llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
  64. llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
  65. llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
  66. llama_stack/providers/registry/agents.py +1 -0
  67. llama_stack/providers/registry/inference.py +1 -9
  68. llama_stack/providers/registry/vector_io.py +136 -16
  69. llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
  70. llama_stack/providers/remote/files/s3/config.py +5 -3
  71. llama_stack/providers/remote/files/s3/files.py +2 -2
  72. llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
  73. llama_stack/providers/remote/inference/openai/openai.py +2 -0
  74. llama_stack/providers/remote/inference/together/together.py +4 -0
  75. llama_stack/providers/remote/inference/vertexai/config.py +3 -3
  76. llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
  77. llama_stack/providers/remote/inference/vllm/config.py +37 -18
  78. llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
  79. llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
  80. llama_stack/providers/remote/post_training/nvidia/models.py +3 -11
  81. llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
  82. llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
  83. llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
  84. llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
  85. llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
  86. llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
  87. llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
  88. llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
  89. llama_stack/providers/remote/vector_io/oci/config.py +41 -0
  90. llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
  91. llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
  92. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
  93. llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
  94. llama_stack/providers/utils/bedrock/client.py +3 -3
  95. llama_stack/providers/utils/bedrock/config.py +7 -7
  96. llama_stack/providers/utils/inference/__init__.py +0 -25
  97. llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
  98. llama_stack/providers/utils/inference/http_client.py +239 -0
  99. llama_stack/providers/utils/inference/litellm_openai_mixin.py +6 -0
  100. llama_stack/providers/utils/inference/model_registry.py +148 -2
  101. llama_stack/providers/utils/inference/openai_compat.py +1 -158
  102. llama_stack/providers/utils/inference/openai_mixin.py +42 -2
  103. llama_stack/providers/utils/inference/prompt_adapter.py +0 -209
  104. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
  105. llama_stack/providers/utils/memory/vector_store.py +46 -19
  106. llama_stack/providers/utils/responses/responses_store.py +7 -7
  107. llama_stack/providers/utils/safety.py +114 -0
  108. llama_stack/providers/utils/tools/mcp.py +44 -3
  109. llama_stack/testing/api_recorder.py +9 -3
  110. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/METADATA +14 -2
  111. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/RECORD +115 -148
  112. llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
  113. llama_stack/distributions/meta-reference-gpu/doc_template.md +0 -119
  114. llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
  115. llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
  116. llama_stack/models/llama/hadamard_utils.py +0 -88
  117. llama_stack/models/llama/llama3/args.py +0 -74
  118. llama_stack/models/llama/llama3/dog.jpg +0 -0
  119. llama_stack/models/llama/llama3/generation.py +0 -378
  120. llama_stack/models/llama/llama3/model.py +0 -304
  121. llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
  122. llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
  123. llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
  124. llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
  125. llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
  126. llama_stack/models/llama/llama3/pasta.jpeg +0 -0
  127. llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
  128. llama_stack/models/llama/llama3/quantization/loader.py +0 -316
  129. llama_stack/models/llama/llama3_1/__init__.py +0 -12
  130. llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
  131. llama_stack/models/llama/llama3_1/prompts.py +0 -258
  132. llama_stack/models/llama/llama3_2/__init__.py +0 -5
  133. llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
  134. llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
  135. llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
  136. llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
  137. llama_stack/models/llama/llama3_3/__init__.py +0 -5
  138. llama_stack/models/llama/llama3_3/prompts.py +0 -259
  139. llama_stack/models/llama/llama4/args.py +0 -107
  140. llama_stack/models/llama/llama4/ffn.py +0 -58
  141. llama_stack/models/llama/llama4/moe.py +0 -214
  142. llama_stack/models/llama/llama4/preprocess.py +0 -435
  143. llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
  144. llama_stack/models/llama/llama4/quantization/loader.py +0 -226
  145. llama_stack/models/llama/llama4/vision/__init__.py +0 -5
  146. llama_stack/models/llama/llama4/vision/embedding.py +0 -210
  147. llama_stack/models/llama/llama4/vision/encoder.py +0 -412
  148. llama_stack/models/llama/quantize_impls.py +0 -316
  149. llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
  150. llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
  151. llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
  152. llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
  153. llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
  154. llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
  155. llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
  156. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/WHEEL +0 -0
  157. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/entry_points.txt +0 -0
  158. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/licenses/LICENSE +0 -0
  159. {llama_stack-0.4.4.dist-info → llama_stack-0.5.0.dist-info}/top_level.txt +0 -0
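Wheels are plain zip archives, so a file-level comparison like the listing above can be reproduced locally. A minimal sketch (the wheel filenames below are assumptions about your local paths; fetch the wheels first with `pip download llama_stack==0.4.4 --no-deps` and `pip download llama_stack==0.5.0 --no-deps`):

```python
import zipfile

# Hypothetical local paths to the two wheels being compared.
OLD = "llama_stack-0.4.4-py3-none-any.whl"
NEW = "llama_stack-0.5.0-py3-none-any.whl"

old_names = set(zipfile.ZipFile(OLD).namelist())
new_names = set(zipfile.ZipFile(NEW).namelist())

# Entries unique to one wheel match the added/removed files above;
# files present in both may still differ in content.
print("removed:", sorted(old_names - new_names))
print("added:  ", sorted(new_names - old_names))
```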
llama_stack/distributions/meta-reference-gpu/config.yaml
@@ -1,140 +0,0 @@
- version: 2
- image_name: meta-reference-gpu
- apis:
- - agents
- - datasetio
- - eval
- - inference
- - safety
- - scoring
- - tool_runtime
- - vector_io
- providers:
-   inference:
-   - provider_id: meta-reference-inference
-     provider_type: inline::meta-reference
-     config:
-       model: ${env.INFERENCE_MODEL}
-       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:=null}
-       quantization:
-         type: ${env.QUANTIZATION_TYPE:=bf16}
-       model_parallel_size: ${env.MODEL_PARALLEL_SIZE:=0}
-       max_batch_size: ${env.MAX_BATCH_SIZE:=1}
-       max_seq_len: ${env.MAX_SEQ_LEN:=4096}
-   - provider_id: sentence-transformers
-     provider_type: inline::sentence-transformers
-   vector_io:
-   - provider_id: faiss
-     provider_type: inline::faiss
-     config:
-       persistence:
-         namespace: vector_io::faiss
-         backend: kv_default
-   safety:
-   - provider_id: llama-guard
-     provider_type: inline::llama-guard
-     config:
-       excluded_categories: []
-   agents:
-   - provider_id: meta-reference
-     provider_type: inline::meta-reference
-     config:
-       persistence:
-         agent_state:
-           namespace: agents
-           backend: kv_default
-         responses:
-           table_name: responses
-           backend: sql_default
-           max_write_queue_size: 10000
-           num_writers: 4
-   eval:
-   - provider_id: meta-reference
-     provider_type: inline::meta-reference
-     config:
-       kvstore:
-         namespace: eval
-         backend: kv_default
-   datasetio:
-   - provider_id: huggingface
-     provider_type: remote::huggingface
-     config:
-       kvstore:
-         namespace: datasetio::huggingface
-         backend: kv_default
-   - provider_id: localfs
-     provider_type: inline::localfs
-     config:
-       kvstore:
-         namespace: datasetio::localfs
-         backend: kv_default
-   scoring:
-   - provider_id: basic
-     provider_type: inline::basic
-   - provider_id: llm-as-judge
-     provider_type: inline::llm-as-judge
-   - provider_id: braintrust
-     provider_type: inline::braintrust
-     config:
-       openai_api_key: ${env.OPENAI_API_KEY:=}
-   tool_runtime:
-   - provider_id: brave-search
-     provider_type: remote::brave-search
-     config:
-       api_key: ${env.BRAVE_SEARCH_API_KEY:=}
-       max_results: 3
-   - provider_id: tavily-search
-     provider_type: remote::tavily-search
-     config:
-       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
-       max_results: 3
-   - provider_id: rag-runtime
-     provider_type: inline::rag-runtime
-   - provider_id: model-context-protocol
-     provider_type: remote::model-context-protocol
- storage:
-   backends:
-     kv_default:
-       type: kv_sqlite
-       db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/kvstore.db
-     sql_default:
-       type: sql_sqlite
-       db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/sql_store.db
-   stores:
-     metadata:
-       namespace: registry
-       backend: kv_default
-     inference:
-       table_name: inference_store
-       backend: sql_default
-       max_write_queue_size: 10000
-       num_writers: 4
-     conversations:
-       table_name: openai_conversations
-       backend: sql_default
-     prompts:
-       namespace: prompts
-       backend: kv_default
- registered_resources:
-   models:
-   - metadata: {}
-     model_id: ${env.INFERENCE_MODEL}
-     provider_id: meta-reference-inference
-     model_type: llm
-   - metadata:
-       embedding_dimension: 768
-     model_id: nomic-embed-text-v1.5
-     provider_id: sentence-transformers
-     model_type: embedding
-   shields: []
-   vector_dbs: []
-   datasets: []
-   scoring_fns: []
-   benchmarks: []
-   tool_groups:
-   - toolgroup_id: builtin::websearch
-     provider_id: tavily-search
-   - toolgroup_id: builtin::rag
-     provider_id: rag-runtime
- server:
-   port: 8321
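Values throughout this deleted config use the stack's `${env.VAR:=default}` substitution syntax: the variable is read from the environment and the text after `:=` is the fallback, so `quantization.type` resolves to `bf16` unless `QUANTIZATION_TYPE` is set. A rough illustrative sketch of that resolution rule (not the stack's actual resolver, which for instance also handles required variables that have no default):

```python
import os
import re

# Matches ${env.NAME} and ${env.NAME:=default}.
_PATTERN = re.compile(r"\$\{env\.(?P<name>[A-Z_][A-Z0-9_]*)(?::=(?P<default>[^}]*))?\}")

def resolve(value: str) -> str:
    """Substitute each ${env...} reference with the environment value or its fallback."""
    def _sub(match: re.Match) -> str:
        fallback = match.group("default") or ""  # simplification: missing default -> empty
        return os.environ.get(match.group("name"), fallback)
    return _PATTERN.sub(_sub, value)

print(resolve("${env.QUANTIZATION_TYPE:=bf16}"))  # -> "bf16" unless overridden
```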
llama_stack/distributions/meta-reference-gpu/doc_template.md
@@ -1,119 +0,0 @@
- ---
- orphan: true
- ---
- # Meta Reference GPU Distribution
-
- ```{toctree}
- :maxdepth: 2
- :hidden:
-
- self
- ```
-
- The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
-
- {{ providers_table }}
-
- Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
-
- {% if run_config_env_vars %}
- ### Environment Variables
-
- The following environment variables can be configured:
-
- {% for var, (default_value, description) in run_config_env_vars.items() %}
- - `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
- {% endfor %}
- {% endif %}
-
-
- ## Prerequisite: Downloading Models
-
- Please check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models using the Hugging Face CLI.
- ```
-
- ## Running the Distribution
-
- You can do this via venv or Docker which has a pre-built image.
-
- ### Via Docker
-
- This method allows you to get started quickly without having to build the distribution code.
-
- ```bash
- LLAMA_STACK_PORT=8321
- docker run \
-   -it \
-   --pull always \
-   --gpu all \
-   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-   -v ~/.llama:/root/.llama \
-   -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-   llamastack/distribution-{{ name }} \
-   --port $LLAMA_STACK_PORT
- ```
-
- If you are using Llama Stack Safety / Shield APIs, use:
-
- ```bash
- docker run \
-   -it \
-   --pull always \
-   --gpu all \
-   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-   -v ~/.llama:/root/.llama \
-   -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
-   -e SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
-   llamastack/distribution-{{ name }} \
-   --port $LLAMA_STACK_PORT
- ```
-
- ### Via Docker with Custom Run Configuration
-
- You can also run the Docker container with a custom run configuration file by mounting it into the container:
-
- ```bash
- # Set the path to your custom config.yaml file
- CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
- LLAMA_STACK_PORT=8321
-
- docker run \
-   -it \
-   --pull always \
-   --gpu all \
-   -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
-   -v ~/.llama:/root/.llama \
-   -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
-   -e RUN_CONFIG_PATH=/app/custom-config.yaml \
-   llamastack/distribution-{{ name }} \
-   --port $LLAMA_STACK_PORT
- ```
-
- **Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
-
- {% if run_configs %}
- Available run configurations for this distribution:
- {% for config in run_configs %}
- - `{{ config }}`
- {% endfor %}
- {% endif %}
-
- ### Via venv
-
- Make sure you have the Llama Stack CLI available.
-
- ```bash
- llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
- INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
- llama stack run distributions/{{ name }}/config.yaml \
-   --port 8321
- ```
-
- If you are using Llama Stack Safety / Shield APIs, use:
-
- ```bash
- INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
- SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
- llama stack run distributions/{{ name }}/run-with-safety.yaml \
-   --port 8321
- ```
llama_stack/distributions/meta-reference-gpu/meta_reference.py
@@ -1,163 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the terms described in the LICENSE file in
- # the root directory of this source tree.
-
- from pathlib import Path
-
- from llama_stack.core.datatypes import (
-     BuildProvider,
-     ModelInput,
-     Provider,
-     ShieldInput,
-     ToolGroupInput,
- )
- from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings
- from llama_stack.providers.inline.inference.meta_reference import (
-     MetaReferenceInferenceConfig,
- )
- from llama_stack.providers.inline.inference.sentence_transformers import (
-     SentenceTransformersInferenceConfig,
- )
- from llama_stack.providers.inline.vector_io.faiss.config import FaissVectorIOConfig
- from llama_stack_api import ModelType
-
-
- def get_distribution_template() -> DistributionTemplate:
-     providers = {
-         "inference": [BuildProvider(provider_type="inline::meta-reference")],
-         "vector_io": [
-             BuildProvider(provider_type="inline::faiss"),
-             BuildProvider(provider_type="remote::chromadb"),
-             BuildProvider(provider_type="remote::pgvector"),
-         ],
-         "safety": [BuildProvider(provider_type="inline::llama-guard")],
-         "agents": [BuildProvider(provider_type="inline::meta-reference")],
-         "eval": [BuildProvider(provider_type="inline::meta-reference")],
-         "datasetio": [
-             BuildProvider(provider_type="remote::huggingface"),
-             BuildProvider(provider_type="inline::localfs"),
-         ],
-         "scoring": [
-             BuildProvider(provider_type="inline::basic"),
-             BuildProvider(provider_type="inline::llm-as-judge"),
-             BuildProvider(provider_type="inline::braintrust"),
-         ],
-         "tool_runtime": [
-             BuildProvider(provider_type="remote::brave-search"),
-             BuildProvider(provider_type="remote::tavily-search"),
-             BuildProvider(provider_type="inline::rag-runtime"),
-             BuildProvider(provider_type="remote::model-context-protocol"),
-         ],
-     }
-     name = "meta-reference-gpu"
-     inference_provider = Provider(
-         provider_id="meta-reference-inference",
-         provider_type="inline::meta-reference",
-         config=MetaReferenceInferenceConfig.sample_run_config(
-             model="${env.INFERENCE_MODEL}",
-             checkpoint_dir="${env.INFERENCE_CHECKPOINT_DIR:=null}",
-         ),
-     )
-     embedding_provider = Provider(
-         provider_id="sentence-transformers",
-         provider_type="inline::sentence-transformers",
-         config=SentenceTransformersInferenceConfig.sample_run_config(),
-     )
-     vector_io_provider = Provider(
-         provider_id="faiss",
-         provider_type="inline::faiss",
-         config=FaissVectorIOConfig.sample_run_config(f"~/.llama/distributions/{name}"),
-     )
-
-     inference_model = ModelInput(
-         model_id="${env.INFERENCE_MODEL}",
-         provider_id="meta-reference-inference",
-     )
-     embedding_model = ModelInput(
-         model_id="nomic-embed-text-v1.5",
-         provider_id="sentence-transformers",
-         model_type=ModelType.embedding,
-         metadata={
-             "embedding_dimension": 768,
-         },
-     )
-     safety_model = ModelInput(
-         model_id="${env.SAFETY_MODEL}",
-         provider_id="meta-reference-safety",
-     )
-     default_tool_groups = [
-         ToolGroupInput(
-             toolgroup_id="builtin::websearch",
-             provider_id="tavily-search",
-         ),
-         ToolGroupInput(
-             toolgroup_id="builtin::rag",
-             provider_id="rag-runtime",
-         ),
-     ]
-
-     return DistributionTemplate(
-         name=name,
-         distro_type="self_hosted",
-         description="Use Meta Reference for running LLM inference",
-         template_path=Path(__file__).parent / "doc_template.md",
-         providers=providers,
-         run_configs={
-             "config.yaml": RunConfigSettings(
-                 provider_overrides={
-                     "inference": [inference_provider, embedding_provider],
-                     "vector_io": [vector_io_provider],
-                 },
-                 default_models=[inference_model, embedding_model],
-                 default_tool_groups=default_tool_groups,
-             ),
-             "run-with-safety.yaml": RunConfigSettings(
-                 provider_overrides={
-                     "inference": [
-                         inference_provider,
-                         embedding_provider,
-                         Provider(
-                             provider_id="meta-reference-safety",
-                             provider_type="inline::meta-reference",
-                             config=MetaReferenceInferenceConfig.sample_run_config(
-                                 model="${env.SAFETY_MODEL}",
-                                 checkpoint_dir="${env.SAFETY_CHECKPOINT_DIR:=null}",
-                             ),
-                         ),
-                     ],
-                     "vector_io": [vector_io_provider],
-                 },
-                 default_models=[
-                     inference_model,
-                     safety_model,
-                     embedding_model,
-                 ],
-                 default_shields=[ShieldInput(shield_id="${env.SAFETY_MODEL}")],
-                 default_tool_groups=default_tool_groups,
-             ),
-         },
-         run_config_env_vars={
-             "LLAMA_STACK_PORT": (
-                 "8321",
-                 "Port for the Llama Stack distribution server",
-             ),
-             "INFERENCE_MODEL": (
-                 "meta-llama/Llama-3.2-3B-Instruct",
-                 "Inference model loaded into the Meta Reference server",
-             ),
-             "INFERENCE_CHECKPOINT_DIR": (
-                 "null",
-                 "Directory containing the Meta Reference model checkpoint",
-             ),
-             "SAFETY_MODEL": (
-                 "meta-llama/Llama-Guard-3-1B",
-                 "Name of the safety (Llama-Guard) model to use",
-             ),
-             "SAFETY_CHECKPOINT_DIR": (
-                 "null",
-                 "Directory containing the Llama-Guard model checkpoint",
-             ),
-         },
-     )
llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml
@@ -1,155 +0,0 @@
- version: 2
- image_name: meta-reference-gpu
- apis:
- - agents
- - datasetio
- - eval
- - inference
- - safety
- - scoring
- - tool_runtime
- - vector_io
- providers:
-   inference:
-   - provider_id: meta-reference-inference
-     provider_type: inline::meta-reference
-     config:
-       model: ${env.INFERENCE_MODEL}
-       checkpoint_dir: ${env.INFERENCE_CHECKPOINT_DIR:=null}
-       quantization:
-         type: ${env.QUANTIZATION_TYPE:=bf16}
-       model_parallel_size: ${env.MODEL_PARALLEL_SIZE:=0}
-       max_batch_size: ${env.MAX_BATCH_SIZE:=1}
-       max_seq_len: ${env.MAX_SEQ_LEN:=4096}
-   - provider_id: sentence-transformers
-     provider_type: inline::sentence-transformers
-   - provider_id: meta-reference-safety
-     provider_type: inline::meta-reference
-     config:
-       model: ${env.SAFETY_MODEL}
-       checkpoint_dir: ${env.SAFETY_CHECKPOINT_DIR:=null}
-       quantization:
-         type: ${env.QUANTIZATION_TYPE:=bf16}
-       model_parallel_size: ${env.MODEL_PARALLEL_SIZE:=0}
-       max_batch_size: ${env.MAX_BATCH_SIZE:=1}
-       max_seq_len: ${env.MAX_SEQ_LEN:=4096}
-   vector_io:
-   - provider_id: faiss
-     provider_type: inline::faiss
-     config:
-       persistence:
-         namespace: vector_io::faiss
-         backend: kv_default
-   safety:
-   - provider_id: llama-guard
-     provider_type: inline::llama-guard
-     config:
-       excluded_categories: []
-   agents:
-   - provider_id: meta-reference
-     provider_type: inline::meta-reference
-     config:
-       persistence:
-         agent_state:
-           namespace: agents
-           backend: kv_default
-         responses:
-           table_name: responses
-           backend: sql_default
-           max_write_queue_size: 10000
-           num_writers: 4
-   eval:
-   - provider_id: meta-reference
-     provider_type: inline::meta-reference
-     config:
-       kvstore:
-         namespace: eval
-         backend: kv_default
-   datasetio:
-   - provider_id: huggingface
-     provider_type: remote::huggingface
-     config:
-       kvstore:
-         namespace: datasetio::huggingface
-         backend: kv_default
-   - provider_id: localfs
-     provider_type: inline::localfs
-     config:
-       kvstore:
-         namespace: datasetio::localfs
-         backend: kv_default
-   scoring:
-   - provider_id: basic
-     provider_type: inline::basic
-   - provider_id: llm-as-judge
-     provider_type: inline::llm-as-judge
-   - provider_id: braintrust
-     provider_type: inline::braintrust
-     config:
-       openai_api_key: ${env.OPENAI_API_KEY:=}
-   tool_runtime:
-   - provider_id: brave-search
-     provider_type: remote::brave-search
-     config:
-       api_key: ${env.BRAVE_SEARCH_API_KEY:=}
-       max_results: 3
-   - provider_id: tavily-search
-     provider_type: remote::tavily-search
-     config:
-       api_key: ${env.TAVILY_SEARCH_API_KEY:=}
-       max_results: 3
-   - provider_id: rag-runtime
-     provider_type: inline::rag-runtime
-   - provider_id: model-context-protocol
-     provider_type: remote::model-context-protocol
- storage:
-   backends:
-     kv_default:
-       type: kv_sqlite
-       db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/kvstore.db
-     sql_default:
-       type: sql_sqlite
-       db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/meta-reference-gpu}/sql_store.db
-   stores:
-     metadata:
-       namespace: registry
-       backend: kv_default
-     inference:
-       table_name: inference_store
-       backend: sql_default
-       max_write_queue_size: 10000
-       num_writers: 4
-     conversations:
-       table_name: openai_conversations
-       backend: sql_default
-     prompts:
-       namespace: prompts
-       backend: kv_default
- registered_resources:
-   models:
-   - metadata: {}
-     model_id: ${env.INFERENCE_MODEL}
-     provider_id: meta-reference-inference
-     model_type: llm
-   - metadata: {}
-     model_id: ${env.SAFETY_MODEL}
-     provider_id: meta-reference-safety
-     model_type: llm
-   - metadata:
-       embedding_dimension: 768
-     model_id: nomic-embed-text-v1.5
-     provider_id: sentence-transformers
-     model_type: embedding
-   shields:
-   - shield_id: ${env.SAFETY_MODEL}
-   vector_dbs: []
-   datasets: []
-   scoring_fns: []
-   benchmarks: []
-   tool_groups:
-   - toolgroup_id: builtin::websearch
-     provider_id: tavily-search
-   - toolgroup_id: builtin::rag
-     provider_id: rag-runtime
- server:
-   port: 8321
llama_stack/models/llama/hadamard_utils.py
@@ -1,88 +0,0 @@
- # Copyright (c) Meta Platforms, Inc. and affiliates.
- # All rights reserved.
- #
- # This source code is licensed under the terms described in the LICENSE file in
- # the root directory of this source tree.
-
- import math
- import re
-
- import torch
- from torch import nn
-
-
- def hadamard_transform(x: torch.Tensor) -> torch.Tensor:
-     """Hadamard transform.
-
-     This function performs the Hadamard transform on the input tensor 'x'.
-     The Hadamard transform is a linear transformation that multiplies the input
-     tensor by the Hadamard matrix of dimension n x n, where n is the size of
-     the last dimension of the input tensor.
-     """
-     *_, n = x.shape
-     m = int(math.log2(n))
-     assert n == 1 << m, "n must be a power of 2"
-     x = x[..., None]
-     inv_sqrt2 = 0.5**0.5
-     for _ in range(m):
-         top = x[..., ::2, :] + x[..., 1::2, :]
-         bot = x[..., ::2, :] - x[..., 1::2, :]
-         x = torch.cat((top, bot), dim=-1)
-         x *= inv_sqrt2
-     res = x.squeeze(-2)
-     return res
-
-
- class HadamardModule(torch.nn.Module):
-     """A module that applies the Hadamard transform to the input tensor.
-
-     Args:
-         group_size: The size of the groups that the input tensor will be divided into
-             before applying the Hadamard transform.
-     """
-
-     def __init__(self, group_size: int) -> None:
-         super().__init__()
-         self.group_size = group_size
-
-     def forward(self, x: torch.Tensor) -> torch.Tensor:
-         reshape_back = False
-         orig_shape = x.shape
-         if self.group_size != x.shape[-1]:
-             reshape_back = True
-             x = x.reshape(-1, x.shape[-1] // self.group_size, self.group_size)
-         x = hadamard_transform(x)
-         if reshape_back:
-             x = x.reshape(orig_shape)
-         return x
-
-
- def add_hadamard_transform_for_spinquant(model: torch.nn.Module, prefix: str = "") -> None:
-     """
-     Adds a Hadamard transform to the last linear layer of each feedforward network (FFN) in the model.
-     This function recursively traverses the model's children and looks for layers that match the pattern
-     "layers.<digit>.feed_forward.w2", where <digit> is one or more digits. When such a layer is found,
-     it is replaced with a new sequential module that consists of a HadamardModule followed by the original
-     layer. The HadamardModule applies the Hadamard transform to the input tensor.
-
-     See `SpinQuant <https://arxiv.org/abs/2405.16406>_` paper for more details.
-
-     Args:
-         model: An instance of 'torch.nn.Module' (e.g., Transformer model).
-         prefix: A string prefix to add to the full name of each child module.
-
-     Returns:
-         None
-     """
-
-     pattern_last_linear_ffn = r"layers.\d+.feed_forward.w2"
-     for module_name, module in model.named_children():
-         child_full_name = prefix + "." + module_name
-         if re.search(pattern_last_linear_ffn, child_full_name):
-             # Module matching this pattern should be nn.Linear with in_features
-             assert isinstance(module, nn.Linear), f"Expected nn.Linear, got {type(module)}"
-             new_module = nn.Sequential(HadamardModule(group_size=module.in_features), module)
-             del module
-             setattr(model, module_name, new_module)
-         else:
-             add_hadamard_transform_for_spinquant(module, (prefix + "." if prefix else prefix) + module_name)
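For context on what was removed: `hadamard_transform` above computes a normalized fast Walsh-Hadamard transform, and the normalized Hadamard matrix is symmetric and orthogonal, so applying the transform twice is the identity and the transform preserves norms. A small sketch checking both properties against the 0.4.4 wheel, where the module still exists (assumes `torch` and `llama-stack==0.4.4` are installed):

```python
import torch

# This import only works against llama-stack 0.4.4; the module is
# deleted in 0.5.0, as the hunk above shows.
from llama_stack.models.llama.hadamard_utils import hadamard_transform

x = torch.randn(8, 64)  # the last dimension must be a power of 2
y = hadamard_transform(x)

# Involution: the normalized Hadamard matrix H satisfies H @ H = I,
# so transforming twice recovers the input.
assert torch.allclose(hadamard_transform(y), x, atol=1e-5)

# Orthonormality preserves energy along the transformed dimension.
assert torch.allclose(y.norm(dim=-1), x.norm(dim=-1), atol=1e-5)
```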