llama-stack 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. llama_stack/core/library_client.py +80 -3
  2. llama_stack/core/routing_tables/common.py +11 -0
  3. llama_stack/core/routing_tables/vector_stores.py +4 -0
  4. llama_stack/core/stack.py +16 -1
  5. llama_stack/core/storage/kvstore/kvstore.py +11 -0
  6. llama_stack/core/storage/kvstore/mongodb/mongodb.py +5 -0
  7. llama_stack/core/storage/kvstore/postgres/postgres.py +8 -0
  8. llama_stack/core/storage/kvstore/redis/redis.py +5 -0
  9. llama_stack/core/storage/sqlstore/sqlalchemy_sqlstore.py +8 -0
  10. llama_stack/core/storage/sqlstore/sqlstore.py +8 -0
  11. llama_stack/distributions/dell/doc_template.md +209 -0
  12. llama_stack/distributions/meta-reference-gpu/doc_template.md +119 -0
  13. llama_stack/distributions/nvidia/doc_template.md +170 -0
  14. llama_stack/distributions/oci/doc_template.md +140 -0
  15. llama_stack/models/llama/llama3/dog.jpg +0 -0
  16. llama_stack/models/llama/llama3/pasta.jpeg +0 -0
  17. llama_stack/models/llama/resources/dog.jpg +0 -0
  18. llama_stack/models/llama/resources/pasta.jpeg +0 -0
  19. llama_stack/models/llama/resources/small_dog.jpg +0 -0
  20. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +184 -33
  21. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +4 -0
  22. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +9 -1
  23. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
  24. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
  25. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
  26. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
  27. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
  28. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
  29. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  30. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  31. llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
  32. llama_stack/providers/remote/eval/nvidia/README.md +134 -0
  33. llama_stack/providers/remote/files/s3/README.md +266 -0
  34. llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
  35. llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
  36. llama_stack/providers/remote/safety/nvidia/README.md +78 -0
  37. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +13 -1
  38. llama_stack/providers/utils/inference/embedding_mixin.py +20 -16
  39. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +33 -0
  40. llama_stack/providers/utils/responses/responses_store.py +34 -0
  41. llama_stack/providers/utils/tools/mcp.py +258 -16
  42. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/METADATA +2 -2
  43. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/RECORD +47 -158
  44. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/WHEEL +1 -1
  45. llama_stack-0.4.4.dist-info/top_level.txt +1 -0
  46. llama_stack-0.4.2.dist-info/top_level.txt +0 -2
  47. llama_stack_api/__init__.py +0 -945
  48. llama_stack_api/admin/__init__.py +0 -45
  49. llama_stack_api/admin/api.py +0 -72
  50. llama_stack_api/admin/fastapi_routes.py +0 -117
  51. llama_stack_api/admin/models.py +0 -113
  52. llama_stack_api/agents.py +0 -173
  53. llama_stack_api/batches/__init__.py +0 -40
  54. llama_stack_api/batches/api.py +0 -53
  55. llama_stack_api/batches/fastapi_routes.py +0 -113
  56. llama_stack_api/batches/models.py +0 -78
  57. llama_stack_api/benchmarks/__init__.py +0 -43
  58. llama_stack_api/benchmarks/api.py +0 -39
  59. llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  60. llama_stack_api/benchmarks/models.py +0 -109
  61. llama_stack_api/common/__init__.py +0 -5
  62. llama_stack_api/common/content_types.py +0 -101
  63. llama_stack_api/common/errors.py +0 -95
  64. llama_stack_api/common/job_types.py +0 -38
  65. llama_stack_api/common/responses.py +0 -77
  66. llama_stack_api/common/training_types.py +0 -47
  67. llama_stack_api/common/type_system.py +0 -146
  68. llama_stack_api/connectors.py +0 -146
  69. llama_stack_api/conversations.py +0 -270
  70. llama_stack_api/datasetio.py +0 -55
  71. llama_stack_api/datasets/__init__.py +0 -61
  72. llama_stack_api/datasets/api.py +0 -35
  73. llama_stack_api/datasets/fastapi_routes.py +0 -104
  74. llama_stack_api/datasets/models.py +0 -152
  75. llama_stack_api/datatypes.py +0 -373
  76. llama_stack_api/eval.py +0 -137
  77. llama_stack_api/file_processors/__init__.py +0 -27
  78. llama_stack_api/file_processors/api.py +0 -64
  79. llama_stack_api/file_processors/fastapi_routes.py +0 -78
  80. llama_stack_api/file_processors/models.py +0 -42
  81. llama_stack_api/files/__init__.py +0 -35
  82. llama_stack_api/files/api.py +0 -51
  83. llama_stack_api/files/fastapi_routes.py +0 -124
  84. llama_stack_api/files/models.py +0 -107
  85. llama_stack_api/inference.py +0 -1169
  86. llama_stack_api/inspect_api/__init__.py +0 -37
  87. llama_stack_api/inspect_api/api.py +0 -25
  88. llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  89. llama_stack_api/inspect_api/models.py +0 -28
  90. llama_stack_api/internal/__init__.py +0 -9
  91. llama_stack_api/internal/kvstore.py +0 -26
  92. llama_stack_api/internal/sqlstore.py +0 -79
  93. llama_stack_api/llama_stack_api/__init__.py +0 -945
  94. llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
  95. llama_stack_api/llama_stack_api/admin/api.py +0 -72
  96. llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
  97. llama_stack_api/llama_stack_api/admin/models.py +0 -113
  98. llama_stack_api/llama_stack_api/agents.py +0 -173
  99. llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
  100. llama_stack_api/llama_stack_api/batches/api.py +0 -53
  101. llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
  102. llama_stack_api/llama_stack_api/batches/models.py +0 -78
  103. llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
  104. llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
  105. llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  106. llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
  107. llama_stack_api/llama_stack_api/common/__init__.py +0 -5
  108. llama_stack_api/llama_stack_api/common/content_types.py +0 -101
  109. llama_stack_api/llama_stack_api/common/errors.py +0 -95
  110. llama_stack_api/llama_stack_api/common/job_types.py +0 -38
  111. llama_stack_api/llama_stack_api/common/responses.py +0 -77
  112. llama_stack_api/llama_stack_api/common/training_types.py +0 -47
  113. llama_stack_api/llama_stack_api/common/type_system.py +0 -146
  114. llama_stack_api/llama_stack_api/connectors.py +0 -146
  115. llama_stack_api/llama_stack_api/conversations.py +0 -270
  116. llama_stack_api/llama_stack_api/datasetio.py +0 -55
  117. llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
  118. llama_stack_api/llama_stack_api/datasets/api.py +0 -35
  119. llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
  120. llama_stack_api/llama_stack_api/datasets/models.py +0 -152
  121. llama_stack_api/llama_stack_api/datatypes.py +0 -373
  122. llama_stack_api/llama_stack_api/eval.py +0 -137
  123. llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
  124. llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
  125. llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
  126. llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
  127. llama_stack_api/llama_stack_api/files/__init__.py +0 -35
  128. llama_stack_api/llama_stack_api/files/api.py +0 -51
  129. llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
  130. llama_stack_api/llama_stack_api/files/models.py +0 -107
  131. llama_stack_api/llama_stack_api/inference.py +0 -1169
  132. llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
  133. llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
  134. llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  135. llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
  136. llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
  137. llama_stack_api/llama_stack_api/internal/kvstore.py +0 -26
  138. llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -79
  139. llama_stack_api/llama_stack_api/models.py +0 -171
  140. llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
  141. llama_stack_api/llama_stack_api/post_training.py +0 -370
  142. llama_stack_api/llama_stack_api/prompts.py +0 -203
  143. llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
  144. llama_stack_api/llama_stack_api/providers/api.py +0 -16
  145. llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
  146. llama_stack_api/llama_stack_api/providers/models.py +0 -24
  147. llama_stack_api/llama_stack_api/py.typed +0 -0
  148. llama_stack_api/llama_stack_api/rag_tool.py +0 -168
  149. llama_stack_api/llama_stack_api/resource.py +0 -37
  150. llama_stack_api/llama_stack_api/router_utils.py +0 -160
  151. llama_stack_api/llama_stack_api/safety.py +0 -132
  152. llama_stack_api/llama_stack_api/schema_utils.py +0 -208
  153. llama_stack_api/llama_stack_api/scoring.py +0 -93
  154. llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
  155. llama_stack_api/llama_stack_api/shields.py +0 -93
  156. llama_stack_api/llama_stack_api/tools.py +0 -226
  157. llama_stack_api/llama_stack_api/vector_io.py +0 -941
  158. llama_stack_api/llama_stack_api/vector_stores.py +0 -51
  159. llama_stack_api/llama_stack_api/version.py +0 -9
  160. llama_stack_api/models.py +0 -171
  161. llama_stack_api/openai_responses.py +0 -1468
  162. llama_stack_api/post_training.py +0 -370
  163. llama_stack_api/prompts.py +0 -203
  164. llama_stack_api/providers/__init__.py +0 -33
  165. llama_stack_api/providers/api.py +0 -16
  166. llama_stack_api/providers/fastapi_routes.py +0 -57
  167. llama_stack_api/providers/models.py +0 -24
  168. llama_stack_api/py.typed +0 -0
  169. llama_stack_api/rag_tool.py +0 -168
  170. llama_stack_api/resource.py +0 -37
  171. llama_stack_api/router_utils.py +0 -160
  172. llama_stack_api/safety.py +0 -132
  173. llama_stack_api/schema_utils.py +0 -208
  174. llama_stack_api/scoring.py +0 -93
  175. llama_stack_api/scoring_functions.py +0 -211
  176. llama_stack_api/shields.py +0 -93
  177. llama_stack_api/tools.py +0 -226
  178. llama_stack_api/vector_io.py +0 -941
  179. llama_stack_api/vector_stores.py +0 -51
  180. llama_stack_api/version.py +0 -9
  181. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/entry_points.txt +0 -0
  182. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/licenses/LICENSE +0 -0
llama_stack/core/library_client.py CHANGED
@@ -161,6 +161,45 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
         """
         pass
 
+    def shutdown(self) -> None:
+        """Shutdown the client and release all resources.
+
+        This method should be called when you're done using the client to properly
+        close database connections and release other resources. Failure to call this
+        method may result in the program hanging on exit while waiting for background
+        threads to complete.
+
+        This method is idempotent and can be called multiple times safely.
+
+        Example:
+            client = LlamaStackAsLibraryClient("starter")
+            # ... use the client ...
+            client.shutdown()
+        """
+        loop = self.loop
+        asyncio.set_event_loop(loop)
+        try:
+            loop.run_until_complete(self.async_client.shutdown())
+        finally:
+            loop.close()
+            asyncio.set_event_loop(None)
+
+    def __enter__(self) -> "LlamaStackAsLibraryClient":
+        """Enter the context manager.
+
+        The client is already initialized in __init__, so this just returns self.
+
+        Example:
+            with LlamaStackAsLibraryClient("starter") as client:
+                response = client.models.list()
+            # Client is automatically shut down here
+        """
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Exit the context manager and shut down the client."""
+        self.shutdown()
+
     def request(self, *args, **kwargs):
         loop = self.loop
         asyncio.set_event_loop(loop)
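A minimal usage sketch of the new sync lifecycle API, mirroring the docstring examples in the hunk above (the `"starter"` distribution name comes from those docstrings; the import path assumes the `llama_stack/core/library_client.py` module listed in this diff):

```python
from llama_stack.core.library_client import LlamaStackAsLibraryClient

# Explicit lifecycle: call shutdown() when finished so database connections
# and background threads are released (shutdown() is documented as idempotent).
client = LlamaStackAsLibraryClient("starter")
try:
    models = client.models.list()
finally:
    client.shutdown()

# Or lean on the context-manager support added in this hunk;
# __exit__ calls shutdown() automatically.
with LlamaStackAsLibraryClient("starter") as client:
    models = client.models.list()
```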
@@ -224,6 +263,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         self.custom_provider_registry = custom_provider_registry
         self.provider_data = provider_data
         self.route_impls: RouteImpls | None = None  # Initialize to None to prevent AttributeError
+        self.stack: Stack | None = None
 
     def _remove_root_logger_handlers(self):
         """
@@ -246,9 +286,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         try:
             self.route_impls = None
 
-            stack = Stack(self.config, self.custom_provider_registry)
-            await stack.initialize()
-            self.impls = stack.impls
+            self.stack = Stack(self.config, self.custom_provider_registry)
+            await self.stack.initialize()
+            self.impls = self.stack.impls
         except ModuleNotFoundError as _e:
             cprint(_e.msg, color="red", file=sys.stderr)
             cprint(
@@ -283,6 +323,43 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         self.route_impls = initialize_route_impls(self.impls)
         return True
 
+    async def shutdown(self) -> None:
+        """Shutdown the client and release all resources.
+
+        This method should be called when you're done using the client to properly
+        close database connections and release other resources. Failure to call this
+        method may result in the program hanging on exit while waiting for background
+        threads to complete.
+
+        This method is idempotent and can be called multiple times safely.
+
+        Example:
+            client = AsyncLlamaStackAsLibraryClient("starter")
+            await client.initialize()
+            # ... use the client ...
+            await client.shutdown()
+        """
+        if self.stack:
+            await self.stack.shutdown()
+            self.stack = None
+
+    async def __aenter__(self) -> "AsyncLlamaStackAsLibraryClient":
+        """Enter the async context manager.
+
+        Initializes the client and returns it.
+
+        Example:
+            async with AsyncLlamaStackAsLibraryClient("starter") as client:
+                response = await client.models.list()
+            # Client is automatically shut down here
+        """
+        await self.initialize()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Exit the async context manager and shut down the client."""
+        await self.shutdown()
+
     async def request(
         self,
         cast_to: Any,
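The async client follows the same pattern; a sketch based on the `__aenter__`/`__aexit__` docstrings above, again assuming the module path from this diff:

```python
import asyncio

from llama_stack.core.library_client import AsyncLlamaStackAsLibraryClient


async def main() -> None:
    # __aenter__ runs initialize(); __aexit__ runs shutdown(), which in turn
    # calls Stack.shutdown() now that the Stack reference is kept on the client.
    async with AsyncLlamaStackAsLibraryClient("starter") as client:
        models = await client.models.list()
        print(models)


asyncio.run(main())
```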
llama_stack/core/routing_tables/common.py CHANGED
@@ -209,6 +209,17 @@ class CommonRoutingTableImpl(RoutingTable):
         logger.info(f"Setting owner for {obj.type} '{obj.identifier}' to {obj.owner.principal}")
 
         registered_obj = await register_object_with_provider(obj, p)
+
+        # Ensure OpenAI metadata exists for vector stores
+        if obj.type == ResourceType.vector_store.value:
+            if hasattr(p, "_ensure_openai_metadata_exists"):
+                await p._ensure_openai_metadata_exists(obj)
+            else:
+                logger.warning(
+                    f"Provider {obj.provider_id} does not support OpenAI metadata creation. "
+                    f"Vector store {obj.identifier} may not work with OpenAI-compatible APIs."
+                )
+
         # TODO: This needs to be fixed for all APIs once they return the registered object
         if obj.type == ResourceType.model.value:
             await self.dist_registry.register(registered_obj)
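The routing table only duck-types the `_ensure_openai_metadata_exists` hook via `hasattr`. A hypothetical provider-side stub, purely to illustrate the contract: the method name comes from the hunk, while the in-memory body below is invented.

```python
class ExampleVectorIOProvider:
    """Hypothetical provider sketch; not code from this package."""

    def __init__(self) -> None:
        # Invented in-memory stand-in for the provider's persisted metadata.
        self._openai_metadata: dict[str, dict] = {}

    async def _ensure_openai_metadata_exists(self, vector_store) -> None:
        # Create the OpenAI-compatible metadata record for stores that were
        # registered through the routing table instead of the OpenAI API.
        if vector_store.identifier not in self._openai_metadata:
            self._openai_metadata[vector_store.identifier] = {
                "id": vector_store.identifier,
                "name": vector_store.identifier,
            }
```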
llama_stack/core/routing_tables/vector_stores.py CHANGED
@@ -55,6 +55,10 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
 
     # Internal methods only - no public API exposure
 
+    async def list_vector_stores(self) -> list[VectorStoreWithOwner]:
+        """List all registered vector stores."""
+        return await self.get_all_with_type(ResourceType.vector_store.value)
+
     async def register_vector_store(
         self,
         vector_store_id: str,
llama_stack/core/stack.py CHANGED
@@ -108,6 +108,7 @@ RESOURCES = [
     ),
     ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks", RegisterBenchmarkRequest),
     ("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups", None),
+    ("vector_stores", Api.vector_stores, "register_vector_store", "list_vector_stores", None),
 ]
 
 
@@ -620,7 +621,7 @@
     async def shutdown(self):
         for impl in self.impls.values():
             impl_name = impl.__class__.__name__
-            logger.info(f"Shutting down {impl_name}")
+            logger.debug(f"Shutting down {impl_name}")
             try:
                 if hasattr(impl, "shutdown"):
                     await asyncio.wait_for(impl.shutdown(), timeout=5)
@@ -642,6 +643,20 @@
         if REGISTRY_REFRESH_TASK:
             REGISTRY_REFRESH_TASK.cancel()
 
+        # Shutdown storage backends
+        from llama_stack.core.storage.kvstore.kvstore import shutdown_kvstore_backends
+        from llama_stack.core.storage.sqlstore.sqlstore import shutdown_sqlstore_backends
+
+        try:
+            await shutdown_kvstore_backends()
+        except Exception as e:
+            logger.exception(f"Failed to shutdown KV store backends: {e}")
+
+        try:
+            await shutdown_sqlstore_backends()
+        except Exception as e:
+            logger.exception(f"Failed to shutdown SQL store backends: {e}")
+
 
 
 async def refresh_registry_once(impls: dict[Api, Any]):
     logger.debug("refreshing registry")
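A standalone sketch of the bounded-shutdown pattern the two hunks above implement: each impl gets a short window (the 5-second timeout comes from the hunk), then the cached storage backends are drained. Function and variable names here are illustrative, not the stack's.

```python
import asyncio
import logging

logger = logging.getLogger(__name__)


async def shutdown_everything(impls, backend_drains, timeout: float = 5.0) -> None:
    # Give each implementation a bounded window so one hung provider
    # cannot block process exit.
    for impl in impls:
        if hasattr(impl, "shutdown"):
            try:
                await asyncio.wait_for(impl.shutdown(), timeout=timeout)
            except asyncio.TimeoutError:
                logger.warning("%s did not shut down within %ss", type(impl).__name__, timeout)
            except Exception:
                logger.exception("Error shutting down %s", type(impl).__name__)

    # Then close shared storage, mirroring shutdown_kvstore_backends() /
    # shutdown_sqlstore_backends() in the hunk above.
    for drain in backend_drains:
        try:
            await drain()
        except Exception:
            logger.exception("Failed to shut down a storage backend")
```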
llama_stack/core/storage/kvstore/kvstore.py CHANGED
@@ -62,6 +62,9 @@ class InmemoryKVStoreImpl(KVStore):
     async def delete(self, key: str) -> None:
         del self._store[key]
 
+    async def shutdown(self) -> None:
+        self._store.clear()
+
 
 _KVSTORE_BACKENDS: dict[str, KVStoreConfig] = {}
 _KVSTORE_INSTANCES: dict[tuple[str, str], KVStore] = {}
@@ -126,3 +129,11 @@ async def kvstore_impl(reference: KVStoreReference) -> KVStore:
     await impl.initialize()
     _KVSTORE_INSTANCES[cache_key] = impl
     return impl
+
+
+async def shutdown_kvstore_backends() -> None:
+    """Shutdown all cached KV store instances."""
+    global _KVSTORE_INSTANCES
+    for instance in _KVSTORE_INSTANCES.values():
+        await instance.shutdown()
+    _KVSTORE_INSTANCES.clear()
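A generic sketch of the cache-and-drain pattern behind `_KVSTORE_INSTANCES` and `shutdown_kvstore_backends()`; the names below are generic, not the module's.

```python
from typing import Callable, Protocol


class StoreLike(Protocol):
    async def initialize(self) -> None: ...
    async def shutdown(self) -> None: ...


_INSTANCES: dict[tuple[str, str], StoreLike] = {}


async def get_store(backend: str, namespace: str, factory: Callable[[], StoreLike]) -> StoreLike:
    # Reuse one live instance per (backend, namespace) pair.
    key = (backend, namespace)
    if key not in _INSTANCES:
        store = factory()
        await store.initialize()
        _INSTANCES[key] = store
    return _INSTANCES[key]


async def shutdown_stores() -> None:
    # Drain the cache once at process shutdown so connections are released.
    for store in _INSTANCES.values():
        await store.shutdown()
    _INSTANCES.clear()
```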
llama_stack/core/storage/kvstore/mongodb/mongodb.py CHANGED
@@ -83,3 +83,8 @@ class MongoDBKVStoreImpl(KVStore):
         async for doc in cursor:
             result.append(doc["key"])
         return result
+
+    async def shutdown(self) -> None:
+        if self.conn:
+            await self.conn.close()
+            self.conn = None
llama_stack/core/storage/kvstore/postgres/postgres.py CHANGED
@@ -123,3 +123,11 @@ class PostgresKVStoreImpl(KVStore):
             (start_key, end_key),
         )
         return [row[0] for row in cursor.fetchall()]
+
+    async def shutdown(self) -> None:
+        if self._cursor:
+            self._cursor.close()
+            self._cursor = None
+        if self._conn:
+            self._conn.close()
+            self._conn = None
llama_stack/core/storage/kvstore/redis/redis.py CHANGED
@@ -99,3 +99,8 @@ class RedisKVStoreImpl(KVStore):
             if cursor == 0:
                 break
         return result
+
+    async def shutdown(self) -> None:
+        if self._redis:
+            await self._redis.close()
+            self._redis = None
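The three remote KV backends above share one idempotent shape: close the handle if present, then null it out so a repeated `shutdown()` is a no-op. A generic sketch of that shape:

```python
class ClosableKVBackend:
    """Generic illustration of the close-and-clear pattern; not code from this package."""

    def __init__(self, conn) -> None:
        self._conn = conn

    async def shutdown(self) -> None:
        if self._conn:
            # Assumes an awaitable close(), as in the Redis and MongoDB hunks;
            # the Postgres hunk closes its cursor and connection synchronously.
            await self._conn.close()
            self._conn = None
```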
llama_stack/core/storage/sqlstore/sqlalchemy_sqlstore.py CHANGED
@@ -107,6 +107,14 @@ class SqlAlchemySqlStoreImpl(SqlStore):
 
         return engine
 
+    async def shutdown(self) -> None:
+        """Dispose the session maker's engine and close all connections."""
+        # The async_session holds a reference to the engine created in __init__
+        if self.async_session:
+            engine = self.async_session.kw.get("bind")
+            if engine:
+                await engine.dispose()
+
     async def create_table(
         self,
         table: str,
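The disposal above matters because SQLAlchemy async engines keep pooled connections (and their worker threads) alive until `dispose()` is called. A standalone sketch using stock SQLAlchemy 2.x APIs; the `sqlite+aiosqlite` URL is only an example:

```python
import asyncio

from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine


async def main() -> None:
    engine = create_async_engine("sqlite+aiosqlite:///example.db")
    # Binding via `bind=` is what makes the engine recoverable later through
    # session_maker.kw.get("bind"), as the hunk above does.
    session_maker = async_sessionmaker(bind=engine)

    async with session_maker() as session:
        pass  # run queries here

    # Without this, pooled connections can keep the process from exiting,
    # which is the failure mode the new shutdown() method guards against.
    await engine.dispose()


asyncio.run(main())
```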
llama_stack/core/storage/sqlstore/sqlstore.py CHANGED
@@ -85,3 +85,11 @@ def register_sqlstore_backends(backends: dict[str, StorageBackendConfig]) -> Non
     _SQLSTORE_LOCKS.clear()
     for name, cfg in backends.items():
         _SQLSTORE_BACKENDS[name] = cfg
+
+
+async def shutdown_sqlstore_backends() -> None:
+    """Shutdown all cached SQL store instances."""
+    global _SQLSTORE_INSTANCES
+    for instance in _SQLSTORE_INSTANCES.values():
+        await instance.shutdown()
+    _SQLSTORE_INSTANCES.clear()
llama_stack/distributions/dell/doc_template.md ADDED
@@ -0,0 +1,209 @@
+---
+orphan: true
+---
+
+# Dell Distribution of Llama Stack
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+You can use this distribution if you have GPUs and want to run an independent TGI or Dell Enterprise Hub container for running inference.
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+
+## Setting up Inference server using Dell Enterprise Hub's custom TGI container.
+
+NOTE: This is a placeholder to run inference with TGI. This will be updated to use [Dell Enterprise Hub's containers](https://dell.huggingface.co/authenticated/models) once verified.
+
+```bash
+export INFERENCE_PORT=8181
+export DEH_URL=http://0.0.0.0:$INFERENCE_PORT
+export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+export CHROMADB_HOST=localhost
+export CHROMADB_PORT=6601
+export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
+export CUDA_VISIBLE_DEVICES=0
+export LLAMA_STACK_PORT=8321
+
+docker run --rm -it \
+  --pull always \
+  --network host \
+  -v $HOME/.cache/huggingface:/data \
+  -e HF_TOKEN=$HF_TOKEN \
+  -p $INFERENCE_PORT:$INFERENCE_PORT \
+  --gpus $CUDA_VISIBLE_DEVICES \
+  ghcr.io/huggingface/text-generation-inference \
+  --dtype bfloat16 \
+  --usage-stats off \
+  --sharded false \
+  --cuda-memory-fraction 0.7 \
+  --model-id $INFERENCE_MODEL \
+  --port $INFERENCE_PORT --hostname 0.0.0.0
+```
+
+If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
+
+```bash
+export SAFETY_INFERENCE_PORT=8282
+export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export CUDA_VISIBLE_DEVICES=1
+
+docker run --rm -it \
+  --pull always \
+  --network host \
+  -v $HOME/.cache/huggingface:/data \
+  -e HF_TOKEN=$HF_TOKEN \
+  -p $SAFETY_INFERENCE_PORT:$SAFETY_INFERENCE_PORT \
+  --gpus $CUDA_VISIBLE_DEVICES \
+  ghcr.io/huggingface/text-generation-inference \
+  --dtype bfloat16 \
+  --usage-stats off \
+  --sharded false \
+  --cuda-memory-fraction 0.7 \
+  --model-id $SAFETY_MODEL \
+  --hostname 0.0.0.0 \
+  --port $SAFETY_INFERENCE_PORT
+```
+
+## Dell distribution relies on ChromaDB for vector database usage
+
+You can start a chroma-db easily using docker.
+```bash
+# This is where the indices are persisted
+mkdir -p $HOME/chromadb
+
+podman run --rm -it \
+  --network host \
+  --name chromadb \
+  -v $HOME/chromadb:/chroma/chroma \
+  -e IS_PERSISTENT=TRUE \
+  chromadb/chroma:latest \
+  --port $CHROMADB_PORT \
+  --host $CHROMADB_HOST
+```
+
+## Running Llama Stack
+
+Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+docker run -it \
+  --pull always \
+  --network host \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  # NOTE: mount the llama-stack directory if testing local changes else not needed
+  -v $HOME/git/llama-stack:/app/llama-stack-source \
+  # localhost/distribution-dell:dev if building / testing locally
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }}\
+  --port $LLAMA_STACK_PORT
+
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack
+
+export SAFETY_INFERENCE_PORT=8282
+export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  -v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-config.yaml \
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e SAFETY_MODEL=$SAFETY_MODEL \
+  -e DEH_SAFETY_URL=$DEH_SAFETY_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }} \
+  --config /root/my-config.yaml \
+  --port $LLAMA_STACK_PORT
+```
+
+### Via Docker with Custom Run Configuration
+
+You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+```bash
+# Set the path to your custom config.yaml file
+CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+
+docker run -it \
+  --pull always \
+  --network host \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+  -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+{% if run_configs %}
+Available run configurations for this distribution:
+{% for config in run_configs %}
+- `{{ config }}`
+{% endfor %}
+{% endif %}
+
+### Via Conda
+
+Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
+
+```bash
+llama stack list-deps {{ name }} | xargs -L1 pip install
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+CHROMA_URL=$CHROMA_URL \
+llama stack run {{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+SAFETY_MODEL=$SAFETY_MODEL \
+DEH_SAFETY_URL=$DEH_SAFETY_URL \
+CHROMA_URL=$CHROMA_URL \
+llama stack run ./run-with-safety.yaml \
+  --port $LLAMA_STACK_PORT
+```
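Beyond the template itself, a quick way to confirm the Dell distribution is reachable once the container is up; this assumes the separate `llama-stack-client` package and the `LLAMA_STACK_PORT=8321` exported in the template above:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Lists the models served by the distribution; attribute names follow the
# current llama-stack-client and may differ across client versions.
for model in client.models.list():
    print(model.identifier)
```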
llama_stack/distributions/meta-reference-gpu/doc_template.md ADDED
@@ -0,0 +1,119 @@
+---
+orphan: true
+---
+# Meta Reference GPU Distribution
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
+
+{{ providers_table }}
+
+Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+
+## Prerequisite: Downloading Models
+
+Please check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models using the Hugging Face CLI.
+```
+
+## Running the Distribution
+
+You can do this via venv or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=8321
+docker run \
+  -it \
+  --pull always \
+  --gpu all \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+docker run \
+  -it \
+  --pull always \
+  --gpu all \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+  -e SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+### Via Docker with Custom Run Configuration
+
+You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+```bash
+# Set the path to your custom config.yaml file
+CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+LLAMA_STACK_PORT=8321
+
+docker run \
+  -it \
+  --pull always \
+  --gpu all \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+  -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+{% if run_configs %}
+Available run configurations for this distribution:
+{% for config in run_configs %}
+- `{{ config }}`
+{% endfor %}
+{% endif %}
+
+### Via venv
+
+Make sure you have the Llama Stack CLI available.
+
+```bash
+llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
+INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+llama stack run distributions/{{ name }}/config.yaml \
+  --port 8321
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
+llama stack run distributions/{{ name }}/run-with-safety.yaml \
+  --port 8321
+```
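The same distribution can also be embedded in-process with the library client whose lifecycle hooks were added earlier in this diff. The distribution name and environment variable below are taken from the template above; the rest is a sketch and may need adjusting to your setup:

```python
import os

from llama_stack.core.library_client import LlamaStackAsLibraryClient

# INFERENCE_MODEL mirrors the value used in the Docker examples above.
os.environ.setdefault("INFERENCE_MODEL", "meta-llama/Llama-3.2-3B-Instruct")

# __exit__ calls shutdown(), which now also drains the kvstore/sqlstore backends.
with LlamaStackAsLibraryClient("meta-reference-gpu") as client:
    # Model objects expose an `identifier` field in the current client.
    print([m.identifier for m in client.models.list()])
```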