llama-stack 0.4.3__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- llama_stack/cli/stack/_list_deps.py +11 -7
- llama_stack/cli/stack/run.py +3 -25
- llama_stack/core/access_control/datatypes.py +78 -0
- llama_stack/core/configure.py +2 -2
- {llama_stack_api/internal → llama_stack/core/connectors}/__init__.py +2 -2
- llama_stack/core/connectors/connectors.py +162 -0
- llama_stack/core/conversations/conversations.py +61 -58
- llama_stack/core/datatypes.py +54 -8
- llama_stack/core/library_client.py +60 -13
- llama_stack/core/prompts/prompts.py +43 -42
- llama_stack/core/routers/datasets.py +20 -17
- llama_stack/core/routers/eval_scoring.py +143 -53
- llama_stack/core/routers/inference.py +20 -9
- llama_stack/core/routers/safety.py +30 -42
- llama_stack/core/routers/vector_io.py +15 -7
- llama_stack/core/routing_tables/models.py +42 -3
- llama_stack/core/routing_tables/scoring_functions.py +19 -19
- llama_stack/core/routing_tables/shields.py +20 -17
- llama_stack/core/routing_tables/vector_stores.py +8 -5
- llama_stack/core/server/auth.py +192 -17
- llama_stack/core/server/fastapi_router_registry.py +40 -5
- llama_stack/core/server/server.py +24 -5
- llama_stack/core/stack.py +54 -10
- llama_stack/core/storage/datatypes.py +9 -0
- llama_stack/core/store/registry.py +1 -1
- llama_stack/core/utils/exec.py +2 -2
- llama_stack/core/utils/type_inspection.py +16 -2
- llama_stack/distributions/dell/config.yaml +4 -1
- llama_stack/distributions/dell/doc_template.md +209 -0
- llama_stack/distributions/dell/run-with-safety.yaml +4 -1
- llama_stack/distributions/nvidia/config.yaml +4 -1
- llama_stack/distributions/nvidia/doc_template.md +170 -0
- llama_stack/distributions/nvidia/run-with-safety.yaml +4 -1
- llama_stack/distributions/oci/config.yaml +4 -1
- llama_stack/distributions/oci/doc_template.md +140 -0
- llama_stack/distributions/open-benchmark/config.yaml +9 -1
- llama_stack/distributions/postgres-demo/config.yaml +1 -1
- llama_stack/distributions/starter/build.yaml +62 -0
- llama_stack/distributions/starter/config.yaml +22 -3
- llama_stack/distributions/starter/run-with-postgres-store.yaml +22 -3
- llama_stack/distributions/starter/starter.py +13 -1
- llama_stack/distributions/starter-gpu/build.yaml +62 -0
- llama_stack/distributions/starter-gpu/config.yaml +22 -3
- llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml +22 -3
- llama_stack/distributions/template.py +10 -2
- llama_stack/distributions/watsonx/config.yaml +4 -1
- llama_stack/log.py +1 -0
- llama_stack/models/llama/resources/dog.jpg +0 -0
- llama_stack/models/llama/resources/pasta.jpeg +0 -0
- llama_stack/models/llama/resources/small_dog.jpg +0 -0
- llama_stack/providers/inline/agents/meta_reference/__init__.py +1 -0
- llama_stack/providers/inline/agents/meta_reference/agents.py +57 -61
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +183 -60
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +94 -22
- llama_stack/providers/inline/agents/meta_reference/responses/types.py +2 -1
- llama_stack/providers/inline/agents/meta_reference/responses/utils.py +4 -1
- llama_stack/providers/inline/agents/meta_reference/safety.py +2 -2
- llama_stack/providers/inline/batches/reference/batches.py +2 -1
- llama_stack/providers/inline/eval/meta_reference/eval.py +40 -32
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
- llama_stack/providers/inline/post_training/huggingface/post_training.py +33 -38
- llama_stack/providers/inline/post_training/huggingface/utils.py +2 -5
- llama_stack/providers/inline/post_training/torchtune/post_training.py +28 -33
- llama_stack/providers/inline/post_training/torchtune/recipes/lora_finetuning_single_device.py +2 -4
- llama_stack/providers/inline/safety/code_scanner/code_scanner.py +12 -15
- llama_stack/providers/inline/safety/llama_guard/llama_guard.py +15 -18
- llama_stack/providers/inline/safety/prompt_guard/prompt_guard.py +11 -17
- llama_stack/providers/inline/scoring/basic/scoring.py +13 -17
- llama_stack/providers/inline/scoring/braintrust/braintrust.py +15 -15
- llama_stack/providers/inline/scoring/llm_as_judge/scoring.py +13 -17
- llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec.py +1 -1
- llama_stack/providers/registry/agents.py +1 -0
- llama_stack/providers/registry/inference.py +1 -9
- llama_stack/providers/registry/vector_io.py +136 -16
- llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
- llama_stack/providers/remote/eval/nvidia/README.md +134 -0
- llama_stack/providers/remote/eval/nvidia/eval.py +22 -21
- llama_stack/providers/remote/files/s3/README.md +266 -0
- llama_stack/providers/remote/files/s3/config.py +5 -3
- llama_stack/providers/remote/files/s3/files.py +2 -2
- llama_stack/providers/remote/inference/gemini/gemini.py +4 -0
- llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
- llama_stack/providers/remote/inference/openai/openai.py +2 -0
- llama_stack/providers/remote/inference/together/together.py +4 -0
- llama_stack/providers/remote/inference/vertexai/config.py +3 -3
- llama_stack/providers/remote/inference/vertexai/vertexai.py +5 -2
- llama_stack/providers/remote/inference/vllm/config.py +37 -18
- llama_stack/providers/remote/inference/vllm/vllm.py +0 -3
- llama_stack/providers/remote/inference/watsonx/watsonx.py +4 -0
- llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
- llama_stack/providers/remote/post_training/nvidia/post_training.py +31 -33
- llama_stack/providers/remote/safety/bedrock/bedrock.py +10 -27
- llama_stack/providers/remote/safety/nvidia/README.md +78 -0
- llama_stack/providers/remote/safety/nvidia/nvidia.py +9 -25
- llama_stack/providers/remote/safety/sambanova/sambanova.py +13 -11
- llama_stack/providers/remote/vector_io/elasticsearch/__init__.py +17 -0
- llama_stack/providers/remote/vector_io/elasticsearch/config.py +32 -0
- llama_stack/providers/remote/vector_io/elasticsearch/elasticsearch.py +463 -0
- llama_stack/providers/remote/vector_io/oci/__init__.py +22 -0
- llama_stack/providers/remote/vector_io/oci/config.py +41 -0
- llama_stack/providers/remote/vector_io/oci/oci26ai.py +595 -0
- llama_stack/providers/remote/vector_io/pgvector/config.py +69 -2
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +255 -6
- llama_stack/providers/remote/vector_io/qdrant/qdrant.py +62 -38
- llama_stack/providers/utils/bedrock/client.py +3 -3
- llama_stack/providers/utils/bedrock/config.py +7 -7
- llama_stack/providers/utils/inference/embedding_mixin.py +4 -0
- llama_stack/providers/utils/inference/http_client.py +239 -0
- llama_stack/providers/utils/inference/litellm_openai_mixin.py +5 -0
- llama_stack/providers/utils/inference/model_registry.py +148 -2
- llama_stack/providers/utils/inference/openai_compat.py +2 -1
- llama_stack/providers/utils/inference/openai_mixin.py +41 -2
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +92 -5
- llama_stack/providers/utils/memory/vector_store.py +46 -19
- llama_stack/providers/utils/responses/responses_store.py +40 -6
- llama_stack/providers/utils/safety.py +114 -0
- llama_stack/providers/utils/tools/mcp.py +44 -3
- llama_stack/testing/api_recorder.py +9 -3
- {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/METADATA +14 -2
- {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/RECORD +131 -275
- llama_stack-0.5.0rc1.dist-info/top_level.txt +1 -0
- llama_stack/distributions/meta-reference-gpu/__init__.py +0 -7
- llama_stack/distributions/meta-reference-gpu/config.yaml +0 -140
- llama_stack/distributions/meta-reference-gpu/meta_reference.py +0 -163
- llama_stack/distributions/meta-reference-gpu/run-with-safety.yaml +0 -155
- llama_stack/models/llama/hadamard_utils.py +0 -88
- llama_stack/models/llama/llama3/args.py +0 -74
- llama_stack/models/llama/llama3/generation.py +0 -378
- llama_stack/models/llama/llama3/model.py +0 -304
- llama_stack/models/llama/llama3/multimodal/__init__.py +0 -12
- llama_stack/models/llama/llama3/multimodal/encoder_utils.py +0 -180
- llama_stack/models/llama/llama3/multimodal/image_transform.py +0 -409
- llama_stack/models/llama/llama3/multimodal/model.py +0 -1430
- llama_stack/models/llama/llama3/multimodal/utils.py +0 -26
- llama_stack/models/llama/llama3/quantization/__init__.py +0 -5
- llama_stack/models/llama/llama3/quantization/loader.py +0 -316
- llama_stack/models/llama/llama3_1/__init__.py +0 -12
- llama_stack/models/llama/llama3_1/prompt_format.md +0 -358
- llama_stack/models/llama/llama3_1/prompts.py +0 -258
- llama_stack/models/llama/llama3_2/__init__.py +0 -5
- llama_stack/models/llama/llama3_2/prompts_text.py +0 -229
- llama_stack/models/llama/llama3_2/prompts_vision.py +0 -126
- llama_stack/models/llama/llama3_2/text_prompt_format.md +0 -286
- llama_stack/models/llama/llama3_2/vision_prompt_format.md +0 -141
- llama_stack/models/llama/llama3_3/__init__.py +0 -5
- llama_stack/models/llama/llama3_3/prompts.py +0 -259
- llama_stack/models/llama/llama4/args.py +0 -107
- llama_stack/models/llama/llama4/ffn.py +0 -58
- llama_stack/models/llama/llama4/moe.py +0 -214
- llama_stack/models/llama/llama4/preprocess.py +0 -435
- llama_stack/models/llama/llama4/quantization/__init__.py +0 -5
- llama_stack/models/llama/llama4/quantization/loader.py +0 -226
- llama_stack/models/llama/llama4/vision/__init__.py +0 -5
- llama_stack/models/llama/llama4/vision/embedding.py +0 -210
- llama_stack/models/llama/llama4/vision/encoder.py +0 -412
- llama_stack/models/llama/quantize_impls.py +0 -316
- llama_stack/providers/inline/inference/meta_reference/__init__.py +0 -20
- llama_stack/providers/inline/inference/meta_reference/common.py +0 -24
- llama_stack/providers/inline/inference/meta_reference/config.py +0 -68
- llama_stack/providers/inline/inference/meta_reference/generators.py +0 -201
- llama_stack/providers/inline/inference/meta_reference/inference.py +0 -542
- llama_stack/providers/inline/inference/meta_reference/model_parallel.py +0 -77
- llama_stack/providers/inline/inference/meta_reference/parallel_utils.py +0 -353
- llama_stack-0.4.3.dist-info/top_level.txt +0 -2
- llama_stack_api/__init__.py +0 -945
- llama_stack_api/admin/__init__.py +0 -45
- llama_stack_api/admin/api.py +0 -72
- llama_stack_api/admin/fastapi_routes.py +0 -117
- llama_stack_api/admin/models.py +0 -113
- llama_stack_api/agents.py +0 -173
- llama_stack_api/batches/__init__.py +0 -40
- llama_stack_api/batches/api.py +0 -53
- llama_stack_api/batches/fastapi_routes.py +0 -113
- llama_stack_api/batches/models.py +0 -78
- llama_stack_api/benchmarks/__init__.py +0 -43
- llama_stack_api/benchmarks/api.py +0 -39
- llama_stack_api/benchmarks/fastapi_routes.py +0 -109
- llama_stack_api/benchmarks/models.py +0 -109
- llama_stack_api/common/__init__.py +0 -5
- llama_stack_api/common/content_types.py +0 -101
- llama_stack_api/common/errors.py +0 -95
- llama_stack_api/common/job_types.py +0 -38
- llama_stack_api/common/responses.py +0 -77
- llama_stack_api/common/training_types.py +0 -47
- llama_stack_api/common/type_system.py +0 -146
- llama_stack_api/connectors.py +0 -146
- llama_stack_api/conversations.py +0 -270
- llama_stack_api/datasetio.py +0 -55
- llama_stack_api/datasets/__init__.py +0 -61
- llama_stack_api/datasets/api.py +0 -35
- llama_stack_api/datasets/fastapi_routes.py +0 -104
- llama_stack_api/datasets/models.py +0 -152
- llama_stack_api/datatypes.py +0 -373
- llama_stack_api/eval.py +0 -137
- llama_stack_api/file_processors/__init__.py +0 -27
- llama_stack_api/file_processors/api.py +0 -64
- llama_stack_api/file_processors/fastapi_routes.py +0 -78
- llama_stack_api/file_processors/models.py +0 -42
- llama_stack_api/files/__init__.py +0 -35
- llama_stack_api/files/api.py +0 -51
- llama_stack_api/files/fastapi_routes.py +0 -124
- llama_stack_api/files/models.py +0 -107
- llama_stack_api/inference.py +0 -1169
- llama_stack_api/inspect_api/__init__.py +0 -37
- llama_stack_api/inspect_api/api.py +0 -25
- llama_stack_api/inspect_api/fastapi_routes.py +0 -76
- llama_stack_api/inspect_api/models.py +0 -28
- llama_stack_api/internal/kvstore.py +0 -28
- llama_stack_api/internal/sqlstore.py +0 -81
- llama_stack_api/llama_stack_api/__init__.py +0 -945
- llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
- llama_stack_api/llama_stack_api/admin/api.py +0 -72
- llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
- llama_stack_api/llama_stack_api/admin/models.py +0 -113
- llama_stack_api/llama_stack_api/agents.py +0 -173
- llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
- llama_stack_api/llama_stack_api/batches/api.py +0 -53
- llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
- llama_stack_api/llama_stack_api/batches/models.py +0 -78
- llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
- llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
- llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
- llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
- llama_stack_api/llama_stack_api/common/__init__.py +0 -5
- llama_stack_api/llama_stack_api/common/content_types.py +0 -101
- llama_stack_api/llama_stack_api/common/errors.py +0 -95
- llama_stack_api/llama_stack_api/common/job_types.py +0 -38
- llama_stack_api/llama_stack_api/common/responses.py +0 -77
- llama_stack_api/llama_stack_api/common/training_types.py +0 -47
- llama_stack_api/llama_stack_api/common/type_system.py +0 -146
- llama_stack_api/llama_stack_api/connectors.py +0 -146
- llama_stack_api/llama_stack_api/conversations.py +0 -270
- llama_stack_api/llama_stack_api/datasetio.py +0 -55
- llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
- llama_stack_api/llama_stack_api/datasets/api.py +0 -35
- llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
- llama_stack_api/llama_stack_api/datasets/models.py +0 -152
- llama_stack_api/llama_stack_api/datatypes.py +0 -373
- llama_stack_api/llama_stack_api/eval.py +0 -137
- llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
- llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
- llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
- llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
- llama_stack_api/llama_stack_api/files/__init__.py +0 -35
- llama_stack_api/llama_stack_api/files/api.py +0 -51
- llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
- llama_stack_api/llama_stack_api/files/models.py +0 -107
- llama_stack_api/llama_stack_api/inference.py +0 -1169
- llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
- llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
- llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
- llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
- llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
- llama_stack_api/llama_stack_api/internal/kvstore.py +0 -28
- llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -81
- llama_stack_api/llama_stack_api/models.py +0 -171
- llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
- llama_stack_api/llama_stack_api/post_training.py +0 -370
- llama_stack_api/llama_stack_api/prompts.py +0 -203
- llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
- llama_stack_api/llama_stack_api/providers/api.py +0 -16
- llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
- llama_stack_api/llama_stack_api/providers/models.py +0 -24
- llama_stack_api/llama_stack_api/py.typed +0 -0
- llama_stack_api/llama_stack_api/rag_tool.py +0 -168
- llama_stack_api/llama_stack_api/resource.py +0 -37
- llama_stack_api/llama_stack_api/router_utils.py +0 -160
- llama_stack_api/llama_stack_api/safety.py +0 -132
- llama_stack_api/llama_stack_api/schema_utils.py +0 -208
- llama_stack_api/llama_stack_api/scoring.py +0 -93
- llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
- llama_stack_api/llama_stack_api/shields.py +0 -93
- llama_stack_api/llama_stack_api/tools.py +0 -226
- llama_stack_api/llama_stack_api/vector_io.py +0 -941
- llama_stack_api/llama_stack_api/vector_stores.py +0 -53
- llama_stack_api/llama_stack_api/version.py +0 -9
- llama_stack_api/models.py +0 -171
- llama_stack_api/openai_responses.py +0 -1468
- llama_stack_api/post_training.py +0 -370
- llama_stack_api/prompts.py +0 -203
- llama_stack_api/providers/__init__.py +0 -33
- llama_stack_api/providers/api.py +0 -16
- llama_stack_api/providers/fastapi_routes.py +0 -57
- llama_stack_api/providers/models.py +0 -24
- llama_stack_api/py.typed +0 -0
- llama_stack_api/rag_tool.py +0 -168
- llama_stack_api/resource.py +0 -37
- llama_stack_api/router_utils.py +0 -160
- llama_stack_api/safety.py +0 -132
- llama_stack_api/schema_utils.py +0 -208
- llama_stack_api/scoring.py +0 -93
- llama_stack_api/scoring_functions.py +0 -211
- llama_stack_api/shields.py +0 -93
- llama_stack_api/tools.py +0 -226
- llama_stack_api/vector_io.py +0 -941
- llama_stack_api/vector_stores.py +0 -53
- llama_stack_api/version.py +0 -9
- {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/WHEEL +0 -0
- {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.4.3.dist-info → llama_stack-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
@@ -190,7 +190,7 @@ class CachedDiskDistributionRegistry(DiskDistributionRegistry):
 
 
 async def create_dist_registry(
-    metadata_store: KVStoreReference,
+    metadata_store: KVStoreReference, distro_name: str
 ) -> tuple[CachedDiskDistributionRegistry, KVStore]:
     # instantiate kvstore for storing and retrieving distribution metadata
     dist_kvstore = await kvstore_impl(metadata_store)
llama_stack/core/utils/exec.py CHANGED
@@ -17,10 +17,10 @@ from llama_stack.log import get_logger
 log = get_logger(name=__name__, category="core")
 
 
-def formulate_run_args(image_type: str,
+def formulate_run_args(image_type: str, distro_name: str) -> list:
     # Only venv is supported now
     current_venv = os.environ.get("VIRTUAL_ENV")
-    env_name =
+    env_name = distro_name or current_venv
     if not env_name:
         cprint(
             "No current virtual environment detected, please specify a virtual environment name with --image-name",
llama_stack/core/utils/type_inspection.py CHANGED
@@ -36,10 +36,24 @@ def is_unwrapped_body_param(param_type: Any) -> bool:
         base_type = args[0]
         metadata = args[1:]
 
-        # Look for Body annotation
+        # Look for Body annotation; treat embed=None (default) as unwrapped
         # Body() returns a FieldInfo object, so we check for that type and the embed attribute
         for item in metadata:
-            if isinstance(item, FieldInfo) and hasattr(item, "embed") and
+            if isinstance(item, FieldInfo) and hasattr(item, "embed") and item.embed is not True:
                 return inspect.isclass(base_type) and issubclass(base_type, BaseModel)
 
     return False
+
+
+def is_body_param(param_type: Any) -> bool:
+    """
+    Check if a parameter type represents a body parameter (Annotated with Body()).
+    """
+    if get_origin(param_type) is typing.Annotated:
+        args = get_args(param_type)
+        base_type = args[0]
+        metadata = args[1:]
+        for item in metadata:
+            if isinstance(item, FieldInfo):
+                return inspect.isclass(base_type) and issubclass(base_type, BaseModel)
+    return False
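As a quick illustration of the two helpers in the hunk above, here is a minimal sketch. The import path is assumed from the `llama_stack/core/utils/type_inspection.py` entry in the file list, and `CreateJobRequest` is a hypothetical model used only for this example.

```python
# Minimal sketch (not part of the diff): classifying Annotated body parameters.
from typing import Annotated

from fastapi import Body
from pydantic import BaseModel

# Assumed import path, based on the file listing above.
from llama_stack.core.utils.type_inspection import is_body_param, is_unwrapped_body_param


class CreateJobRequest(BaseModel):  # hypothetical request model, for illustration only
    model: str
    dataset_id: str


# Body() leaves embed=None (the default), so this counts as an unwrapped body param.
unwrapped = Annotated[CreateJobRequest, Body()]
# Body(embed=True) is still a body param, but wrapped under the parameter name.
wrapped = Annotated[CreateJobRequest, Body(embed=True)]

assert is_body_param(unwrapped) and is_unwrapped_body_param(unwrapped)
assert is_body_param(wrapped) and not is_unwrapped_body_param(wrapped)
```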
llama_stack/distributions/dell/config.yaml CHANGED
@@ -1,5 +1,5 @@
 version: 2
-
+distro_name: dell
 apis:
 - agents
 - datasetio
@@ -108,6 +108,9 @@ storage:
     prompts:
       namespace: prompts
       backend: kv_default
+    connectors:
+      namespace: connectors
+      backend: kv_default
 registered_resources:
   models:
   - metadata: {}
llama_stack/distributions/dell/doc_template.md ADDED
@@ -0,0 +1,209 @@
+---
+orphan: true
+---
+
+# Dell Distribution of Llama Stack
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+You can use this distribution if you have GPUs and want to run an independent TGI or Dell Enterprise Hub container for running inference.
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+
+## Setting up Inference server using Dell Enterprise Hub's custom TGI container.
+
+NOTE: This is a placeholder to run inference with TGI. This will be updated to use [Dell Enterprise Hub's containers](https://dell.huggingface.co/authenticated/models) once verified.
+
+```bash
+export INFERENCE_PORT=8181
+export DEH_URL=http://0.0.0.0:$INFERENCE_PORT
+export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+export CHROMADB_HOST=localhost
+export CHROMADB_PORT=6601
+export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
+export CUDA_VISIBLE_DEVICES=0
+export LLAMA_STACK_PORT=8321
+
+docker run --rm -it \
+  --pull always \
+  --network host \
+  -v $HOME/.cache/huggingface:/data \
+  -e HF_TOKEN=$HF_TOKEN \
+  -p $INFERENCE_PORT:$INFERENCE_PORT \
+  --gpus $CUDA_VISIBLE_DEVICES \
+  ghcr.io/huggingface/text-generation-inference \
+  --dtype bfloat16 \
+  --usage-stats off \
+  --sharded false \
+  --cuda-memory-fraction 0.7 \
+  --model-id $INFERENCE_MODEL \
+  --port $INFERENCE_PORT --hostname 0.0.0.0
+```
+
+If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
+
+```bash
+export SAFETY_INFERENCE_PORT=8282
+export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export CUDA_VISIBLE_DEVICES=1
+
+docker run --rm -it \
+  --pull always \
+  --network host \
+  -v $HOME/.cache/huggingface:/data \
+  -e HF_TOKEN=$HF_TOKEN \
+  -p $SAFETY_INFERENCE_PORT:$SAFETY_INFERENCE_PORT \
+  --gpus $CUDA_VISIBLE_DEVICES \
+  ghcr.io/huggingface/text-generation-inference \
+  --dtype bfloat16 \
+  --usage-stats off \
+  --sharded false \
+  --cuda-memory-fraction 0.7 \
+  --model-id $SAFETY_MODEL \
+  --hostname 0.0.0.0 \
+  --port $SAFETY_INFERENCE_PORT
+```
+
+## Dell distribution relies on ChromaDB for vector database usage
+
+You can start a chroma-db easily using docker.
+```bash
+# This is where the indices are persisted
+mkdir -p $HOME/chromadb
+
+podman run --rm -it \
+  --network host \
+  --name chromadb \
+  -v $HOME/chromadb:/chroma/chroma \
+  -e IS_PERSISTENT=TRUE \
+  chromadb/chroma:latest \
+  --port $CHROMADB_PORT \
+  --host $CHROMADB_HOST
+```
+
+## Running Llama Stack
+
+Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+docker run -it \
+  --pull always \
+  --network host \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  # NOTE: mount the llama-stack directory if testing local changes else not needed
+  -v $HOME/git/llama-stack:/app/llama-stack-source \
+  # localhost/distribution-dell:dev if building / testing locally
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }}\
+  --port $LLAMA_STACK_PORT
+
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack
+
+export SAFETY_INFERENCE_PORT=8282
+export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  -v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-config.yaml \
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e SAFETY_MODEL=$SAFETY_MODEL \
+  -e DEH_SAFETY_URL=$DEH_SAFETY_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }} \
+  --config /root/my-config.yaml \
+  --port $LLAMA_STACK_PORT
+```
+
+### Via Docker with Custom Run Configuration
+
+You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+```bash
+# Set the path to your custom config.yaml file
+CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+
+docker run -it \
+  --pull always \
+  --network host \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+  -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+{% if run_configs %}
+Available run configurations for this distribution:
+{% for config in run_configs %}
+- `{{ config }}`
+{% endfor %}
+{% endif %}
+
+### Via Conda
+
+Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
+
+```bash
+llama stack list-deps {{ name }} | xargs -L1 pip install
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+CHROMA_URL=$CHROMA_URL \
+llama stack run {{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+SAFETY_MODEL=$SAFETY_MODEL \
+DEH_SAFETY_URL=$DEH_SAFETY_URL \
+CHROMA_URL=$CHROMA_URL \
+llama stack run ./run-with-safety.yaml \
+  --port $LLAMA_STACK_PORT
+```
llama_stack/distributions/dell/run-with-safety.yaml CHANGED
@@ -1,5 +1,5 @@
 version: 2
-
+distro_name: dell
 apis:
 - agents
 - datasetio
@@ -112,6 +112,9 @@ storage:
     prompts:
       namespace: prompts
       backend: kv_default
+    connectors:
+      namespace: connectors
+      backend: kv_default
 registered_resources:
   models:
   - metadata: {}
llama_stack/distributions/nvidia/config.yaml CHANGED
@@ -1,5 +1,5 @@
 version: 2
-
+distro_name: nvidia
 apis:
 - agents
 - datasetio
@@ -102,6 +102,9 @@ storage:
     prompts:
       namespace: prompts
       backend: kv_default
+    connectors:
+      namespace: connectors
+      backend: kv_default
 registered_resources:
   models: []
   shields: []
llama_stack/distributions/nvidia/doc_template.md ADDED
@@ -0,0 +1,170 @@
+---
+orphan: true
+---
+# NVIDIA Distribution
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{% if default_models %}
+### Models
+
+The following models are available by default:
+
+{% for model in default_models %}
+- `{{ model.model_id }} {{ model.doc_string }}`
+{% endfor %}
+{% endif %}
+
+
+## Prerequisites
+### NVIDIA API Keys
+
+Make sure you have access to a NVIDIA API Key. You can get one by visiting [https://build.nvidia.com/](https://build.nvidia.com/). Use this key for the `NVIDIA_API_KEY` environment variable.
+
+### Deploy NeMo Microservices Platform
+The NVIDIA NeMo microservices platform supports end-to-end microservice deployment of a complete AI flywheel on your Kubernetes cluster through the NeMo Microservices Helm Chart. Please reference the [NVIDIA NeMo Microservices documentation](https://docs.nvidia.com/nemo/microservices/latest/about/index.html) for platform prerequisites and instructions to install and deploy the platform.
+
+## Supported Services
+Each Llama Stack API corresponds to a specific NeMo microservice. The core microservices (Customizer, Evaluator, Guardrails) are exposed by the same endpoint. The platform components (Data Store) are each exposed by separate endpoints.
+
+### Inference: NVIDIA NIM
+NVIDIA NIM is used for running inference with registered models. There are two ways to access NVIDIA NIMs:
+1. Hosted (default): Preview APIs hosted at https://integrate.api.nvidia.com (Requires an API key)
+2. Self-hosted: NVIDIA NIMs that run on your own infrastructure.
+
+The deployed platform includes the NIM Proxy microservice, which is the service that provides to access your NIMs (for example, to run inference on a model). Set the `NVIDIA_BASE_URL` environment variable to use your NVIDIA NIM Proxy deployment.
+
+### Datasetio API: NeMo Data Store
+The NeMo Data Store microservice serves as the default file storage solution for the NeMo microservices platform. It exposts APIs compatible with the Hugging Face Hub client (`HfApi`), so you can use the client to interact with Data Store. The `NVIDIA_DATASETS_URL` environment variable should point to your NeMo Data Store endpoint.
+
+See the [NVIDIA Datasetio docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/datasetio/nvidia/README.md) for supported features and example usage.
+
+### Eval API: NeMo Evaluator
+The NeMo Evaluator microservice supports evaluation of LLMs. Launching an Evaluation job with NeMo Evaluator requires an Evaluation Config (an object that contains metadata needed by the job). A Llama Stack Benchmark maps to an Evaluation Config, so registering a Benchmark creates an Evaluation Config in NeMo Evaluator. The `NVIDIA_EVALUATOR_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the [NVIDIA Eval docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/eval/nvidia/README.md) for supported features and example usage.
+
+### Post-Training API: NeMo Customizer
+The NeMo Customizer microservice supports fine-tuning models. You can reference [this list of supported models](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/models.py) that can be fine-tuned using Llama Stack. The `NVIDIA_CUSTOMIZER_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the [NVIDIA Post-Training docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/post_training/nvidia/README.md) for supported features and example usage.
+
+### Safety API: NeMo Guardrails
+The NeMo Guardrails microservice sits between your application and the LLM, and adds checks and content moderation to a model. The `GUARDRAILS_SERVICE_URL` environment variable should point to your NeMo Microservices endpoint.
+
+See the [NVIDIA Safety docs](https://github.com/meta-llama/llama-stack/blob/main/llama_stack/providers/remote/safety/nvidia/README.md) for supported features and example usage.
+
+## Deploying models
+In order to use a registered model with the Llama Stack APIs, ensure the corresponding NIM is deployed to your environment. For example, you can use the NIM Proxy microservice to deploy `meta/llama-3.2-1b-instruct`.
+
+Note: For improved inference speeds, we need to use NIM with `fast_outlines` guided decoding system (specified in the request body). This is the default if you deployed the platform with the NeMo Microservices Helm Chart.
+```sh
+# URL to NeMo NIM Proxy service
+export NEMO_URL="http://nemo.test"
+
+curl --location "$NEMO_URL/v1/deployment/model-deployments" \
+  -H 'accept: application/json' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "name": "llama-3.2-1b-instruct",
+    "namespace": "meta",
+    "config": {
+      "model": "meta/llama-3.2-1b-instruct",
+      "nim_deployment": {
+        "image_name": "nvcr.io/nim/meta/llama-3.2-1b-instruct",
+        "image_tag": "1.8.3",
+        "pvc_size": "25Gi",
+        "gpu": 1,
+        "additional_envs": {
+          "NIM_GUIDED_DECODING_BACKEND": "fast_outlines"
+        }
+      }
+    }
+}'
+```
+This NIM deployment should take approximately 10 minutes to go live. [See the docs](https://docs.nvidia.com/nemo/microservices/latest/get-started/tutorials/deploy-nims.html) for more information on how to deploy a NIM and verify it's available for inference.
+
+You can also remove a deployed NIM to free up GPU resources, if needed.
+```sh
+export NEMO_URL="http://nemo.test"
+
+curl -X DELETE "$NEMO_URL/v1/deployment/model-deployments/meta/llama-3.1-8b-instruct"
+```
+
+## Running Llama Stack with NVIDIA
+
+You can do this via venv (build code), or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=8321
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+### Via Docker with Custom Run Configuration
+
+You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+```bash
+# Set the path to your custom config.yaml file
+CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+LLAMA_STACK_PORT=8321
+
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+  -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+  -e NVIDIA_API_KEY=$NVIDIA_API_KEY \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+{% if run_configs %}
+Available run configurations for this distribution:
+{% for config in run_configs %}
+- `{{ config }}`
+{% endfor %}
+{% endif %}
+
+### Via venv
+
+If you've set up your local development environment, you can also install the distribution dependencies using your local virtual environment.
+
+```bash
+INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+llama stack list-deps nvidia | xargs -L1 uv pip install
+NVIDIA_API_KEY=$NVIDIA_API_KEY \
+INFERENCE_MODEL=$INFERENCE_MODEL \
+llama stack run ./config.yaml \
+  --port 8321
+```
+
+## Example Notebooks
+For examples of how to use the NVIDIA Distribution to run inference, fine-tune, evaluate, and run safety checks on your LLMs, you can reference the example notebooks in [docs/notebooks/nvidia](https://github.com/meta-llama/llama-stack/tree/main/docs/notebooks/nvidia).
llama_stack/distributions/nvidia/run-with-safety.yaml CHANGED
@@ -1,5 +1,5 @@
 version: 2
-
+distro_name: nvidia
 apis:
 - agents
 - datasetio
@@ -113,6 +113,9 @@ storage:
     prompts:
       namespace: prompts
       backend: kv_default
+    connectors:
+      namespace: connectors
+      backend: kv_default
 registered_resources:
   models:
   - metadata: {}
llama_stack/distributions/oci/config.yaml CHANGED
@@ -1,5 +1,5 @@
 version: 2
-
+distro_name: oci
 apis:
 - agents
 - datasetio
@@ -120,6 +120,9 @@ storage:
     prompts:
       namespace: prompts
       backend: kv_default
+    connectors:
+      namespace: connectors
+      backend: kv_default
 registered_resources:
   models: []
   shields: []
llama_stack/distributions/oci/doc_template.md ADDED
@@ -0,0 +1,140 @@
+---
+orphan: true
+---
+# OCI Distribution
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+{% if default_models %}
+### Models
+
+The following models are available by default:
+
+{% for model in default_models %}
+- `{{ model.model_id }} {{ model.doc_string }}`
+{% endfor %}
+{% endif %}
+
+## Prerequisites
+### Oracle Cloud Infrastructure Setup
+
+Before using the OCI Generative AI distribution, ensure you have:
+
+1. **Oracle Cloud Infrastructure Account**: Sign up at [Oracle Cloud Infrastructure](https://cloud.oracle.com/)
+2. **Generative AI Service Access**: Enable the Generative AI service in your OCI tenancy
+3. **Compartment**: Create or identify a compartment where you'll deploy Generative AI models
+4. **Authentication**: Configure authentication using either:
+   - **Instance Principal** (recommended for cloud-hosted deployments)
+   - **API Key** (for on-premises or development environments)
+
+### Authentication Methods
+
+#### Instance Principal Authentication (Recommended)
+Instance Principal authentication allows OCI resources to authenticate using the identity of the compute instance they're running on. This is the most secure method for production deployments.
+
+Requirements:
+- Instance must be running in an Oracle Cloud Infrastructure compartment
+- Instance must have appropriate IAM policies to access Generative AI services
+
+#### API Key Authentication
+For development or on-premises deployments, follow [this doc](https://docs.oracle.com/en-us/iaas/Content/API/Concepts/apisigningkey.htm) to learn how to create your API signing key for your config file.
+
+### Required IAM Policies
+
+Ensure your OCI user or instance has the following policy statements:
+
+```
+Allow group <group_name> to use generative-ai-inference-endpoints in compartment <compartment_name>
+Allow group <group_name> to manage generative-ai-inference-endpoints in compartment <compartment_name>
+```
+
+## Supported Services
+
+### Inference: OCI Generative AI
+Oracle Cloud Infrastructure Generative AI provides access to high-performance AI models through OCI's Platform-as-a-Service offering. The service supports:
+
+- **Chat Completions**: Conversational AI with context awareness
+- **Text Generation**: Complete prompts and generate text content
+
+#### Available Models
+Common OCI Generative AI models include access to Meta, Cohere, OpenAI, Grok, and more models.
+
+### Safety: Llama Guard
+For content safety and moderation, this distribution uses Meta's LlamaGuard model through the OCI Generative AI service to provide:
+- Content filtering and moderation
+- Policy compliance checking
+- Harmful content detection
+
+### Vector Storage: Multiple Options
+The distribution supports several vector storage providers:
+- **FAISS**: Local in-memory vector search
+- **ChromaDB**: Distributed vector database
+- **PGVector**: PostgreSQL with vector extensions
+
+### Additional Services
+- **Dataset I/O**: Local filesystem and Hugging Face integration
+- **Tool Runtime**: Web search (Brave, Tavily) and RAG capabilities
+- **Evaluation**: Meta reference evaluation framework
+
+## Running Llama Stack with OCI
+
+You can run the OCI distribution via Docker or local virtual environment.
+
+### Via venv
+
+If you've set up your local development environment, you can also build the image using your local virtual environment.
+
+```bash
+OCI_AUTH=$OCI_AUTH_TYPE OCI_REGION=$OCI_REGION OCI_COMPARTMENT_OCID=$OCI_COMPARTMENT_OCID llama stack run --port 8321 oci
+```
+
+### Configuration Examples
+
+#### Using Instance Principal (Recommended for Production)
+```bash
+export OCI_AUTH_TYPE=instance_principal
+export OCI_REGION=us-chicago-1
+export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..<your-compartment-id>
+```
+
+#### Using API Key Authentication (Development)
+```bash
+export OCI_AUTH_TYPE=config_file
+export OCI_CONFIG_FILE_PATH=~/.oci/config
+export OCI_CLI_PROFILE=DEFAULT
+export OCI_REGION=us-chicago-1
+export OCI_COMPARTMENT_OCID=ocid1.compartment.oc1..your-compartment-id
+```
+
+## Regional Endpoints
+
+OCI Generative AI is available in multiple regions. The service automatically routes to the appropriate regional endpoint based on your configuration. For a full list of regional model availability, visit:
+
+https://docs.oracle.com/en-us/iaas/Content/generative-ai/overview.htm#regions
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Authentication Errors**: Verify your OCI credentials and IAM policies
+2. **Model Not Found**: Ensure the model OCID is correct and the model is available in your region
+3. **Permission Denied**: Check compartment permissions and Generative AI service access
+4. **Region Unavailable**: Verify the specified region supports Generative AI services
+
+### Getting Help
+
+For additional support:
+- [OCI Generative AI Documentation](https://docs.oracle.com/en-us/iaas/Content/generative-ai/home.htm)
+- [Llama Stack Issues](https://github.com/meta-llama/llama-stack/issues)
llama_stack/distributions/open-benchmark/config.yaml CHANGED
@@ -1,5 +1,5 @@
 version: 2
-
+distro_name: open-benchmark
 apis:
 - agents
 - datasetio
@@ -57,6 +57,11 @@ providers:
       db: ${env.PGVECTOR_DB:=}
       user: ${env.PGVECTOR_USER:=}
      password: ${env.PGVECTOR_PASSWORD:=}
+      distance_metric: COSINE
+      vector_index:
+        type: HNSW
+        m: 16
+        ef_construction: 64
       persistence:
         namespace: vector_io::pgvector
         backend: kv_default
@@ -145,6 +150,9 @@ storage:
     prompts:
       namespace: prompts
       backend: kv_default
+    connectors:
+      namespace: connectors
+      backend: kv_default
 registered_resources:
   models:
   - metadata: {}