llama-stack 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_stack/core/library_client.py +80 -3
- llama_stack/core/routing_tables/common.py +11 -0
- llama_stack/core/routing_tables/vector_stores.py +4 -0
- llama_stack/core/stack.py +16 -1
- llama_stack/core/storage/kvstore/kvstore.py +11 -0
- llama_stack/core/storage/kvstore/mongodb/mongodb.py +5 -0
- llama_stack/core/storage/kvstore/postgres/postgres.py +8 -0
- llama_stack/core/storage/kvstore/redis/redis.py +5 -0
- llama_stack/core/storage/sqlstore/sqlalchemy_sqlstore.py +8 -0
- llama_stack/core/storage/sqlstore/sqlstore.py +8 -0
- llama_stack/distributions/dell/doc_template.md +209 -0
- llama_stack/distributions/meta-reference-gpu/doc_template.md +119 -0
- llama_stack/distributions/nvidia/doc_template.md +170 -0
- llama_stack/distributions/oci/doc_template.md +140 -0
- llama_stack/models/llama/llama3/dog.jpg +0 -0
- llama_stack/models/llama/llama3/pasta.jpeg +0 -0
- llama_stack/models/llama/resources/dog.jpg +0 -0
- llama_stack/models/llama/resources/pasta.jpeg +0 -0
- llama_stack/models/llama/resources/small_dog.jpg +0 -0
- llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +184 -33
- llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +4 -0
- llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +9 -1
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
- llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
- llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
- llama_stack/providers/remote/eval/nvidia/README.md +134 -0
- llama_stack/providers/remote/files/s3/README.md +266 -0
- llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
- llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
- llama_stack/providers/remote/safety/nvidia/README.md +78 -0
- llama_stack/providers/remote/vector_io/pgvector/pgvector.py +13 -1
- llama_stack/providers/utils/inference/embedding_mixin.py +20 -16
- llama_stack/providers/utils/memory/openai_vector_store_mixin.py +33 -0
- llama_stack/providers/utils/responses/responses_store.py +34 -0
- llama_stack/providers/utils/tools/mcp.py +258 -16
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/METADATA +2 -2
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/RECORD +47 -158
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/WHEEL +1 -1
- llama_stack-0.4.4.dist-info/top_level.txt +1 -0
- llama_stack-0.4.2.dist-info/top_level.txt +0 -2
- llama_stack_api/__init__.py +0 -945
- llama_stack_api/admin/__init__.py +0 -45
- llama_stack_api/admin/api.py +0 -72
- llama_stack_api/admin/fastapi_routes.py +0 -117
- llama_stack_api/admin/models.py +0 -113
- llama_stack_api/agents.py +0 -173
- llama_stack_api/batches/__init__.py +0 -40
- llama_stack_api/batches/api.py +0 -53
- llama_stack_api/batches/fastapi_routes.py +0 -113
- llama_stack_api/batches/models.py +0 -78
- llama_stack_api/benchmarks/__init__.py +0 -43
- llama_stack_api/benchmarks/api.py +0 -39
- llama_stack_api/benchmarks/fastapi_routes.py +0 -109
- llama_stack_api/benchmarks/models.py +0 -109
- llama_stack_api/common/__init__.py +0 -5
- llama_stack_api/common/content_types.py +0 -101
- llama_stack_api/common/errors.py +0 -95
- llama_stack_api/common/job_types.py +0 -38
- llama_stack_api/common/responses.py +0 -77
- llama_stack_api/common/training_types.py +0 -47
- llama_stack_api/common/type_system.py +0 -146
- llama_stack_api/connectors.py +0 -146
- llama_stack_api/conversations.py +0 -270
- llama_stack_api/datasetio.py +0 -55
- llama_stack_api/datasets/__init__.py +0 -61
- llama_stack_api/datasets/api.py +0 -35
- llama_stack_api/datasets/fastapi_routes.py +0 -104
- llama_stack_api/datasets/models.py +0 -152
- llama_stack_api/datatypes.py +0 -373
- llama_stack_api/eval.py +0 -137
- llama_stack_api/file_processors/__init__.py +0 -27
- llama_stack_api/file_processors/api.py +0 -64
- llama_stack_api/file_processors/fastapi_routes.py +0 -78
- llama_stack_api/file_processors/models.py +0 -42
- llama_stack_api/files/__init__.py +0 -35
- llama_stack_api/files/api.py +0 -51
- llama_stack_api/files/fastapi_routes.py +0 -124
- llama_stack_api/files/models.py +0 -107
- llama_stack_api/inference.py +0 -1169
- llama_stack_api/inspect_api/__init__.py +0 -37
- llama_stack_api/inspect_api/api.py +0 -25
- llama_stack_api/inspect_api/fastapi_routes.py +0 -76
- llama_stack_api/inspect_api/models.py +0 -28
- llama_stack_api/internal/__init__.py +0 -9
- llama_stack_api/internal/kvstore.py +0 -26
- llama_stack_api/internal/sqlstore.py +0 -79
- llama_stack_api/llama_stack_api/__init__.py +0 -945
- llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
- llama_stack_api/llama_stack_api/admin/api.py +0 -72
- llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
- llama_stack_api/llama_stack_api/admin/models.py +0 -113
- llama_stack_api/llama_stack_api/agents.py +0 -173
- llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
- llama_stack_api/llama_stack_api/batches/api.py +0 -53
- llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
- llama_stack_api/llama_stack_api/batches/models.py +0 -78
- llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
- llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
- llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
- llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
- llama_stack_api/llama_stack_api/common/__init__.py +0 -5
- llama_stack_api/llama_stack_api/common/content_types.py +0 -101
- llama_stack_api/llama_stack_api/common/errors.py +0 -95
- llama_stack_api/llama_stack_api/common/job_types.py +0 -38
- llama_stack_api/llama_stack_api/common/responses.py +0 -77
- llama_stack_api/llama_stack_api/common/training_types.py +0 -47
- llama_stack_api/llama_stack_api/common/type_system.py +0 -146
- llama_stack_api/llama_stack_api/connectors.py +0 -146
- llama_stack_api/llama_stack_api/conversations.py +0 -270
- llama_stack_api/llama_stack_api/datasetio.py +0 -55
- llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
- llama_stack_api/llama_stack_api/datasets/api.py +0 -35
- llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
- llama_stack_api/llama_stack_api/datasets/models.py +0 -152
- llama_stack_api/llama_stack_api/datatypes.py +0 -373
- llama_stack_api/llama_stack_api/eval.py +0 -137
- llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
- llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
- llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
- llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
- llama_stack_api/llama_stack_api/files/__init__.py +0 -35
- llama_stack_api/llama_stack_api/files/api.py +0 -51
- llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
- llama_stack_api/llama_stack_api/files/models.py +0 -107
- llama_stack_api/llama_stack_api/inference.py +0 -1169
- llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
- llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
- llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
- llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
- llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
- llama_stack_api/llama_stack_api/internal/kvstore.py +0 -26
- llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -79
- llama_stack_api/llama_stack_api/models.py +0 -171
- llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
- llama_stack_api/llama_stack_api/post_training.py +0 -370
- llama_stack_api/llama_stack_api/prompts.py +0 -203
- llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
- llama_stack_api/llama_stack_api/providers/api.py +0 -16
- llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
- llama_stack_api/llama_stack_api/providers/models.py +0 -24
- llama_stack_api/llama_stack_api/py.typed +0 -0
- llama_stack_api/llama_stack_api/rag_tool.py +0 -168
- llama_stack_api/llama_stack_api/resource.py +0 -37
- llama_stack_api/llama_stack_api/router_utils.py +0 -160
- llama_stack_api/llama_stack_api/safety.py +0 -132
- llama_stack_api/llama_stack_api/schema_utils.py +0 -208
- llama_stack_api/llama_stack_api/scoring.py +0 -93
- llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
- llama_stack_api/llama_stack_api/shields.py +0 -93
- llama_stack_api/llama_stack_api/tools.py +0 -226
- llama_stack_api/llama_stack_api/vector_io.py +0 -941
- llama_stack_api/llama_stack_api/vector_stores.py +0 -51
- llama_stack_api/llama_stack_api/version.py +0 -9
- llama_stack_api/models.py +0 -171
- llama_stack_api/openai_responses.py +0 -1468
- llama_stack_api/post_training.py +0 -370
- llama_stack_api/prompts.py +0 -203
- llama_stack_api/providers/__init__.py +0 -33
- llama_stack_api/providers/api.py +0 -16
- llama_stack_api/providers/fastapi_routes.py +0 -57
- llama_stack_api/providers/models.py +0 -24
- llama_stack_api/py.typed +0 -0
- llama_stack_api/rag_tool.py +0 -168
- llama_stack_api/resource.py +0 -37
- llama_stack_api/router_utils.py +0 -160
- llama_stack_api/safety.py +0 -132
- llama_stack_api/schema_utils.py +0 -208
- llama_stack_api/scoring.py +0 -93
- llama_stack_api/scoring_functions.py +0 -211
- llama_stack_api/shields.py +0 -93
- llama_stack_api/tools.py +0 -226
- llama_stack_api/vector_io.py +0 -941
- llama_stack_api/vector_stores.py +0 -51
- llama_stack_api/version.py +0 -9
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/entry_points.txt +0 -0
- {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/licenses/LICENSE +0 -0
llama_stack/core/library_client.py CHANGED

@@ -161,6 +161,45 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
         """
         pass

+    def shutdown(self) -> None:
+        """Shutdown the client and release all resources.
+
+        This method should be called when you're done using the client to properly
+        close database connections and release other resources. Failure to call this
+        method may result in the program hanging on exit while waiting for background
+        threads to complete.
+
+        This method is idempotent and can be called multiple times safely.
+
+        Example:
+            client = LlamaStackAsLibraryClient("starter")
+            # ... use the client ...
+            client.shutdown()
+        """
+        loop = self.loop
+        asyncio.set_event_loop(loop)
+        try:
+            loop.run_until_complete(self.async_client.shutdown())
+        finally:
+            loop.close()
+            asyncio.set_event_loop(None)
+
+    def __enter__(self) -> "LlamaStackAsLibraryClient":
+        """Enter the context manager.
+
+        The client is already initialized in __init__, so this just returns self.
+
+        Example:
+            with LlamaStackAsLibraryClient("starter") as client:
+                response = client.models.list()
+            # Client is automatically shut down here
+        """
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Exit the context manager and shut down the client."""
+        self.shutdown()
+
     def request(self, *args, **kwargs):
         loop = self.loop
         asyncio.set_event_loop(loop)

@@ -224,6 +263,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         self.custom_provider_registry = custom_provider_registry
         self.provider_data = provider_data
         self.route_impls: RouteImpls | None = None  # Initialize to None to prevent AttributeError
+        self.stack: Stack | None = None

     def _remove_root_logger_handlers(self):
         """

@@ -246,9 +286,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         try:
             self.route_impls = None

-            stack = Stack(self.config, self.custom_provider_registry)
-            await stack.initialize()
-            self.impls = stack.impls
+            self.stack = Stack(self.config, self.custom_provider_registry)
+            await self.stack.initialize()
+            self.impls = self.stack.impls
         except ModuleNotFoundError as _e:
             cprint(_e.msg, color="red", file=sys.stderr)
             cprint(

@@ -283,6 +323,43 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         self.route_impls = initialize_route_impls(self.impls)
         return True

+    async def shutdown(self) -> None:
+        """Shutdown the client and release all resources.
+
+        This method should be called when you're done using the client to properly
+        close database connections and release other resources. Failure to call this
+        method may result in the program hanging on exit while waiting for background
+        threads to complete.
+
+        This method is idempotent and can be called multiple times safely.
+
+        Example:
+            client = AsyncLlamaStackAsLibraryClient("starter")
+            await client.initialize()
+            # ... use the client ...
+            await client.shutdown()
+        """
+        if self.stack:
+            await self.stack.shutdown()
+            self.stack = None
+
+    async def __aenter__(self) -> "AsyncLlamaStackAsLibraryClient":
+        """Enter the async context manager.
+
+        Initializes the client and returns it.
+
+        Example:
+            async with AsyncLlamaStackAsLibraryClient("starter") as client:
+                response = await client.models.list()
+            # Client is automatically shut down here
+        """
+        await self.initialize()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Exit the async context manager and shut down the client."""
+        await self.shutdown()
+
     async def request(
         self,
         cast_to: Any,
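Taken together, the `library_client.py` hunks give both clients an explicit shutdown lifecycle and context-manager support. A minimal usage sketch based on the docstrings above, not part of the diff (it assumes llama-stack 0.4.4 is installed and a local `starter` distribution is configured):

```python
import asyncio

from llama_stack.core.library_client import (
    AsyncLlamaStackAsLibraryClient,
    LlamaStackAsLibraryClient,
)

# Sync client: __exit__ calls shutdown() automatically when the with-block ends.
with LlamaStackAsLibraryClient("starter") as client:
    print(client.models.list())


async def main() -> None:
    # Async client: __aenter__ runs initialize(), __aexit__ runs shutdown().
    async with AsyncLlamaStackAsLibraryClient("starter") as client:
        print(await client.models.list())


asyncio.run(main())
```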
llama_stack/core/routing_tables/common.py CHANGED

@@ -209,6 +209,17 @@ class CommonRoutingTableImpl(RoutingTable):
             logger.info(f"Setting owner for {obj.type} '{obj.identifier}' to {obj.owner.principal}")

         registered_obj = await register_object_with_provider(obj, p)
+
+        # Ensure OpenAI metadata exists for vector stores
+        if obj.type == ResourceType.vector_store.value:
+            if hasattr(p, "_ensure_openai_metadata_exists"):
+                await p._ensure_openai_metadata_exists(obj)
+            else:
+                logger.warning(
+                    f"Provider {obj.provider_id} does not support OpenAI metadata creation. "
+                    f"Vector store {obj.identifier} may not work with OpenAI-compatible APIs."
+                )
+
         # TODO: This needs to be fixed for all APIs once they return the registered object
         if obj.type == ResourceType.model.value:
             await self.dist_registry.register(registered_obj)
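The routing table discovers the metadata hook by duck typing (`hasattr`). Only the hook name `_ensure_openai_metadata_exists` comes from the hunk above; the sketch below is a hypothetical provider-side implementation for illustration (the real logic lives in `openai_vector_store_mixin.py`, which this release also changes):

```python
class ExampleVectorIOProvider:
    """Illustrative provider exposing the hook the routing table probes for."""

    def __init__(self) -> None:
        # Hypothetical in-memory metadata store, purely for illustration.
        self._openai_metadata: dict[str, dict] = {}

    async def _ensure_openai_metadata_exists(self, vector_store) -> None:
        # Create OpenAI-compatible vector store metadata if it is missing.
        if vector_store.identifier not in self._openai_metadata:
            self._openai_metadata[vector_store.identifier] = {
                "id": vector_store.identifier,
                "object": "vector_store",
                "name": vector_store.identifier,
            }
```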
llama_stack/core/routing_tables/vector_stores.py CHANGED

@@ -55,6 +55,10 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):

     # Internal methods only - no public API exposure

+    async def list_vector_stores(self) -> list[VectorStoreWithOwner]:
+        """List all registered vector stores."""
+        return await self.get_all_with_type(ResourceType.vector_store.value)
+
     async def register_vector_store(
         self,
         vector_store_id: str,
llama_stack/core/stack.py CHANGED

@@ -108,6 +108,7 @@ RESOURCES = [
     ),
     ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks", RegisterBenchmarkRequest),
     ("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups", None),
+    ("vector_stores", Api.vector_stores, "register_vector_store", "list_vector_stores", None),
 ]


@@ -620,7 +621,7 @@ class Stack:
     async def shutdown(self):
         for impl in self.impls.values():
             impl_name = impl.__class__.__name__
-            logger.
+            logger.debug(f"Shutting down {impl_name}")
             try:
                 if hasattr(impl, "shutdown"):
                     await asyncio.wait_for(impl.shutdown(), timeout=5)

@@ -642,6 +643,20 @@ class Stack:
         if REGISTRY_REFRESH_TASK:
             REGISTRY_REFRESH_TASK.cancel()

+        # Shutdown storage backends
+        from llama_stack.core.storage.kvstore.kvstore import shutdown_kvstore_backends
+        from llama_stack.core.storage.sqlstore.sqlstore import shutdown_sqlstore_backends
+
+        try:
+            await shutdown_kvstore_backends()
+        except Exception as e:
+            logger.exception(f"Failed to shutdown KV store backends: {e}")
+
+        try:
+            await shutdown_sqlstore_backends()
+        except Exception as e:
+            logger.exception(f"Failed to shutdown SQL store backends: {e}")
+

 async def refresh_registry_once(impls: dict[Api, Any]):
     logger.debug("refreshing registry")
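`Stack.shutdown()` above bounds each provider's shutdown with `asyncio.wait_for(..., timeout=5)`. A standalone sketch of that pattern; the `TimeoutError` handling shown here is illustrative, since the actual `except` clause falls outside the hunk:

```python
import asyncio


class SlowImpl:
    async def shutdown(self) -> None:
        await asyncio.sleep(60)  # simulates a provider that hangs on shutdown


async def main() -> None:
    impl = SlowImpl()
    try:
        # Bound the shutdown so a single misbehaving impl cannot block process exit.
        await asyncio.wait_for(impl.shutdown(), timeout=5)
    except asyncio.TimeoutError:
        print(f"{impl.__class__.__name__} did not shut down within 5s; continuing")


asyncio.run(main())
```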
llama_stack/core/storage/kvstore/kvstore.py CHANGED

@@ -62,6 +62,9 @@ class InmemoryKVStoreImpl(KVStore):
     async def delete(self, key: str) -> None:
         del self._store[key]

+    async def shutdown(self) -> None:
+        self._store.clear()
+

 _KVSTORE_BACKENDS: dict[str, KVStoreConfig] = {}
 _KVSTORE_INSTANCES: dict[tuple[str, str], KVStore] = {}

@@ -126,3 +129,11 @@ async def kvstore_impl(reference: KVStoreReference) -> KVStore:
     await impl.initialize()
     _KVSTORE_INSTANCES[cache_key] = impl
     return impl
+
+
+async def shutdown_kvstore_backends() -> None:
+    """Shutdown all cached KV store instances."""
+    global _KVSTORE_INSTANCES
+    for instance in _KVSTORE_INSTANCES.values():
+        await instance.shutdown()
+    _KVSTORE_INSTANCES.clear()
llama_stack/core/storage/kvstore/postgres/postgres.py CHANGED

@@ -123,3 +123,11 @@ class PostgresKVStoreImpl(KVStore):
             (start_key, end_key),
         )
         return [row[0] for row in cursor.fetchall()]
+
+    async def shutdown(self) -> None:
+        if self._cursor:
+            self._cursor.close()
+            self._cursor = None
+        if self._conn:
+            self._conn.close()
+            self._conn = None
llama_stack/core/storage/sqlstore/sqlalchemy_sqlstore.py CHANGED

@@ -107,6 +107,14 @@ class SqlAlchemySqlStoreImpl(SqlStore):

         return engine

+    async def shutdown(self) -> None:
+        """Dispose the session maker's engine and close all connections."""
+        # The async_session holds a reference to the engine created in __init__
+        if self.async_session:
+            engine = self.async_session.kw.get("bind")
+            if engine:
+                await engine.dispose()
+
     async def create_table(
         self,
         table: str,
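The new `SqlAlchemySqlStoreImpl.shutdown()` relies on SQLAlchemy's async engine disposal and on the session maker keeping its `bind` in `.kw`. A standalone sketch of that pattern, independent of llama-stack (assumes SQLAlchemy 2.x with the `aiosqlite` driver installed):

```python
import asyncio

from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine


async def main() -> None:
    engine = create_async_engine("sqlite+aiosqlite:///:memory:")
    session_maker = async_sessionmaker(bind=engine)

    # The session maker stores its configuration kwargs on .kw, including the bind;
    # this is what shutdown() above retrieves in order to dispose of the engine.
    bound_engine = session_maker.kw.get("bind")
    assert bound_engine is engine

    # Disposing the engine closes pooled connections so the process can exit cleanly.
    await bound_engine.dispose()


asyncio.run(main())
```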
llama_stack/core/storage/sqlstore/sqlstore.py CHANGED

@@ -85,3 +85,11 @@ def register_sqlstore_backends(backends: dict[str, StorageBackendConfig]) -> None:
     _SQLSTORE_LOCKS.clear()
     for name, cfg in backends.items():
         _SQLSTORE_BACKENDS[name] = cfg
+
+
+async def shutdown_sqlstore_backends() -> None:
+    """Shutdown all cached SQL store instances."""
+    global _SQLSTORE_INSTANCES
+    for instance in _SQLSTORE_INSTANCES.values():
+        await instance.shutdown()
+    _SQLSTORE_INSTANCES.clear()
llama_stack/distributions/dell/doc_template.md ADDED

@@ -0,0 +1,209 @@
+---
+orphan: true
+---
+
+# Dell Distribution of Llama Stack
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+You can use this distribution if you have GPUs and want to run an independent TGI or Dell Enterprise Hub container for running inference.
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+
+## Setting up Inference server using Dell Enterprise Hub's custom TGI container.
+
+NOTE: This is a placeholder to run inference with TGI. This will be updated to use [Dell Enterprise Hub's containers](https://dell.huggingface.co/authenticated/models) once verified.
+
+```bash
+export INFERENCE_PORT=8181
+export DEH_URL=http://0.0.0.0:$INFERENCE_PORT
+export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+export CHROMADB_HOST=localhost
+export CHROMADB_PORT=6601
+export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
+export CUDA_VISIBLE_DEVICES=0
+export LLAMA_STACK_PORT=8321
+
+docker run --rm -it \
+  --pull always \
+  --network host \
+  -v $HOME/.cache/huggingface:/data \
+  -e HF_TOKEN=$HF_TOKEN \
+  -p $INFERENCE_PORT:$INFERENCE_PORT \
+  --gpus $CUDA_VISIBLE_DEVICES \
+  ghcr.io/huggingface/text-generation-inference \
+  --dtype bfloat16 \
+  --usage-stats off \
+  --sharded false \
+  --cuda-memory-fraction 0.7 \
+  --model-id $INFERENCE_MODEL \
+  --port $INFERENCE_PORT --hostname 0.0.0.0
+```
+
+If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
+
+```bash
+export SAFETY_INFERENCE_PORT=8282
+export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export CUDA_VISIBLE_DEVICES=1
+
+docker run --rm -it \
+  --pull always \
+  --network host \
+  -v $HOME/.cache/huggingface:/data \
+  -e HF_TOKEN=$HF_TOKEN \
+  -p $SAFETY_INFERENCE_PORT:$SAFETY_INFERENCE_PORT \
+  --gpus $CUDA_VISIBLE_DEVICES \
+  ghcr.io/huggingface/text-generation-inference \
+  --dtype bfloat16 \
+  --usage-stats off \
+  --sharded false \
+  --cuda-memory-fraction 0.7 \
+  --model-id $SAFETY_MODEL \
+  --hostname 0.0.0.0 \
+  --port $SAFETY_INFERENCE_PORT
+```
+
+## Dell distribution relies on ChromaDB for vector database usage
+
+You can start a chroma-db easily using docker.
+```bash
+# This is where the indices are persisted
+mkdir -p $HOME/chromadb
+
+podman run --rm -it \
+  --network host \
+  --name chromadb \
+  -v $HOME/chromadb:/chroma/chroma \
+  -e IS_PERSISTENT=TRUE \
+  chromadb/chroma:latest \
+  --port $CHROMADB_PORT \
+  --host $CHROMADB_HOST
+```
+
+## Running Llama Stack
+
+Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+docker run -it \
+  --pull always \
+  --network host \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  # NOTE: mount the llama-stack directory if testing local changes else not needed
+  -v $HOME/git/llama-stack:/app/llama-stack-source \
+  # localhost/distribution-dell:dev if building / testing locally
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }}\
+  --port $LLAMA_STACK_PORT
+
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack
+
+export SAFETY_INFERENCE_PORT=8282
+export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  -v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-config.yaml \
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e SAFETY_MODEL=$SAFETY_MODEL \
+  -e DEH_SAFETY_URL=$DEH_SAFETY_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }} \
+  --config /root/my-config.yaml \
+  --port $LLAMA_STACK_PORT
+```
+
+### Via Docker with Custom Run Configuration
+
+You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+```bash
+# Set the path to your custom config.yaml file
+CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+
+docker run -it \
+  --pull always \
+  --network host \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+  -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+{% if run_configs %}
+Available run configurations for this distribution:
+{% for config in run_configs %}
+- `{{ config }}`
+{% endfor %}
+{% endif %}
+
+### Via Conda
+
+Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
+
+```bash
+llama stack list-deps {{ name }} | xargs -L1 pip install
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+CHROMA_URL=$CHROMA_URL \
+llama stack run {{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+SAFETY_MODEL=$SAFETY_MODEL \
+DEH_SAFETY_URL=$DEH_SAFETY_URL \
+CHROMA_URL=$CHROMA_URL \
+llama stack run ./run-with-safety.yaml \
+  --port $LLAMA_STACK_PORT
+```
llama_stack/distributions/meta-reference-gpu/doc_template.md ADDED

@@ -0,0 +1,119 @@
+---
+orphan: true
+---
+# Meta Reference GPU Distribution
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
+
+{{ providers_table }}
+
+Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+
+## Prerequisite: Downloading Models
+
+Please check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models using the Hugging Face CLI.
+```
+
+## Running the Distribution
+
+You can do this via venv or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=8321
+docker run \
+  -it \
+  --pull always \
+  --gpu all \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+docker run \
+  -it \
+  --pull always \
+  --gpu all \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+  -e SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+### Via Docker with Custom Run Configuration
+
+You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+```bash
+# Set the path to your custom config.yaml file
+CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+LLAMA_STACK_PORT=8321
+
+docker run \
+  -it \
+  --pull always \
+  --gpu all \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+  -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+{% if run_configs %}
+Available run configurations for this distribution:
+{% for config in run_configs %}
+- `{{ config }}`
+{% endfor %}
+{% endif %}
+
+### Via venv
+
+Make sure you have the Llama Stack CLI available.
+
+```bash
+llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
+INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+llama stack run distributions/{{ name }}/config.yaml \
+  --port 8321
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
+llama stack run distributions/{{ name }}/run-with-safety.yaml \
+  --port 8321
+```