llama-stack 0.4.2__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182)
  1. llama_stack/core/library_client.py +80 -3
  2. llama_stack/core/routing_tables/common.py +11 -0
  3. llama_stack/core/routing_tables/vector_stores.py +4 -0
  4. llama_stack/core/stack.py +16 -1
  5. llama_stack/core/storage/kvstore/kvstore.py +11 -0
  6. llama_stack/core/storage/kvstore/mongodb/mongodb.py +5 -0
  7. llama_stack/core/storage/kvstore/postgres/postgres.py +8 -0
  8. llama_stack/core/storage/kvstore/redis/redis.py +5 -0
  9. llama_stack/core/storage/sqlstore/sqlalchemy_sqlstore.py +8 -0
  10. llama_stack/core/storage/sqlstore/sqlstore.py +8 -0
  11. llama_stack/distributions/dell/doc_template.md +209 -0
  12. llama_stack/distributions/meta-reference-gpu/doc_template.md +119 -0
  13. llama_stack/distributions/nvidia/doc_template.md +170 -0
  14. llama_stack/distributions/oci/doc_template.md +140 -0
  15. llama_stack/models/llama/llama3/dog.jpg +0 -0
  16. llama_stack/models/llama/llama3/pasta.jpeg +0 -0
  17. llama_stack/models/llama/resources/dog.jpg +0 -0
  18. llama_stack/models/llama/resources/pasta.jpeg +0 -0
  19. llama_stack/models/llama/resources/small_dog.jpg +0 -0
  20. llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py +184 -33
  21. llama_stack/providers/inline/agents/meta_reference/responses/streaming.py +4 -0
  22. llama_stack/providers/inline/agents/meta_reference/responses/tool_executor.py +9 -1
  23. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.h +9 -0
  24. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/LocalInference.swift +189 -0
  25. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/Parsing.swift +238 -0
  26. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/PromptTemplate.swift +12 -0
  27. llama_stack/providers/inline/ios/inference/LocalInferenceImpl/SystemPrompts.swift +89 -0
  28. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.pbxproj +550 -0
  29. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  30. llama_stack/providers/inline/ios/inference/LocalInferenceImpl.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  31. llama_stack/providers/remote/datasetio/nvidia/README.md +74 -0
  32. llama_stack/providers/remote/eval/nvidia/README.md +134 -0
  33. llama_stack/providers/remote/files/s3/README.md +266 -0
  34. llama_stack/providers/remote/inference/nvidia/NVIDIA.md +203 -0
  35. llama_stack/providers/remote/post_training/nvidia/README.md +151 -0
  36. llama_stack/providers/remote/safety/nvidia/README.md +78 -0
  37. llama_stack/providers/remote/vector_io/pgvector/pgvector.py +13 -1
  38. llama_stack/providers/utils/inference/embedding_mixin.py +20 -16
  39. llama_stack/providers/utils/memory/openai_vector_store_mixin.py +33 -0
  40. llama_stack/providers/utils/responses/responses_store.py +34 -0
  41. llama_stack/providers/utils/tools/mcp.py +258 -16
  42. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/METADATA +2 -2
  43. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/RECORD +47 -158
  44. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/WHEEL +1 -1
  45. llama_stack-0.4.4.dist-info/top_level.txt +1 -0
  46. llama_stack-0.4.2.dist-info/top_level.txt +0 -2
  47. llama_stack_api/__init__.py +0 -945
  48. llama_stack_api/admin/__init__.py +0 -45
  49. llama_stack_api/admin/api.py +0 -72
  50. llama_stack_api/admin/fastapi_routes.py +0 -117
  51. llama_stack_api/admin/models.py +0 -113
  52. llama_stack_api/agents.py +0 -173
  53. llama_stack_api/batches/__init__.py +0 -40
  54. llama_stack_api/batches/api.py +0 -53
  55. llama_stack_api/batches/fastapi_routes.py +0 -113
  56. llama_stack_api/batches/models.py +0 -78
  57. llama_stack_api/benchmarks/__init__.py +0 -43
  58. llama_stack_api/benchmarks/api.py +0 -39
  59. llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  60. llama_stack_api/benchmarks/models.py +0 -109
  61. llama_stack_api/common/__init__.py +0 -5
  62. llama_stack_api/common/content_types.py +0 -101
  63. llama_stack_api/common/errors.py +0 -95
  64. llama_stack_api/common/job_types.py +0 -38
  65. llama_stack_api/common/responses.py +0 -77
  66. llama_stack_api/common/training_types.py +0 -47
  67. llama_stack_api/common/type_system.py +0 -146
  68. llama_stack_api/connectors.py +0 -146
  69. llama_stack_api/conversations.py +0 -270
  70. llama_stack_api/datasetio.py +0 -55
  71. llama_stack_api/datasets/__init__.py +0 -61
  72. llama_stack_api/datasets/api.py +0 -35
  73. llama_stack_api/datasets/fastapi_routes.py +0 -104
  74. llama_stack_api/datasets/models.py +0 -152
  75. llama_stack_api/datatypes.py +0 -373
  76. llama_stack_api/eval.py +0 -137
  77. llama_stack_api/file_processors/__init__.py +0 -27
  78. llama_stack_api/file_processors/api.py +0 -64
  79. llama_stack_api/file_processors/fastapi_routes.py +0 -78
  80. llama_stack_api/file_processors/models.py +0 -42
  81. llama_stack_api/files/__init__.py +0 -35
  82. llama_stack_api/files/api.py +0 -51
  83. llama_stack_api/files/fastapi_routes.py +0 -124
  84. llama_stack_api/files/models.py +0 -107
  85. llama_stack_api/inference.py +0 -1169
  86. llama_stack_api/inspect_api/__init__.py +0 -37
  87. llama_stack_api/inspect_api/api.py +0 -25
  88. llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  89. llama_stack_api/inspect_api/models.py +0 -28
  90. llama_stack_api/internal/__init__.py +0 -9
  91. llama_stack_api/internal/kvstore.py +0 -26
  92. llama_stack_api/internal/sqlstore.py +0 -79
  93. llama_stack_api/llama_stack_api/__init__.py +0 -945
  94. llama_stack_api/llama_stack_api/admin/__init__.py +0 -45
  95. llama_stack_api/llama_stack_api/admin/api.py +0 -72
  96. llama_stack_api/llama_stack_api/admin/fastapi_routes.py +0 -117
  97. llama_stack_api/llama_stack_api/admin/models.py +0 -113
  98. llama_stack_api/llama_stack_api/agents.py +0 -173
  99. llama_stack_api/llama_stack_api/batches/__init__.py +0 -40
  100. llama_stack_api/llama_stack_api/batches/api.py +0 -53
  101. llama_stack_api/llama_stack_api/batches/fastapi_routes.py +0 -113
  102. llama_stack_api/llama_stack_api/batches/models.py +0 -78
  103. llama_stack_api/llama_stack_api/benchmarks/__init__.py +0 -43
  104. llama_stack_api/llama_stack_api/benchmarks/api.py +0 -39
  105. llama_stack_api/llama_stack_api/benchmarks/fastapi_routes.py +0 -109
  106. llama_stack_api/llama_stack_api/benchmarks/models.py +0 -109
  107. llama_stack_api/llama_stack_api/common/__init__.py +0 -5
  108. llama_stack_api/llama_stack_api/common/content_types.py +0 -101
  109. llama_stack_api/llama_stack_api/common/errors.py +0 -95
  110. llama_stack_api/llama_stack_api/common/job_types.py +0 -38
  111. llama_stack_api/llama_stack_api/common/responses.py +0 -77
  112. llama_stack_api/llama_stack_api/common/training_types.py +0 -47
  113. llama_stack_api/llama_stack_api/common/type_system.py +0 -146
  114. llama_stack_api/llama_stack_api/connectors.py +0 -146
  115. llama_stack_api/llama_stack_api/conversations.py +0 -270
  116. llama_stack_api/llama_stack_api/datasetio.py +0 -55
  117. llama_stack_api/llama_stack_api/datasets/__init__.py +0 -61
  118. llama_stack_api/llama_stack_api/datasets/api.py +0 -35
  119. llama_stack_api/llama_stack_api/datasets/fastapi_routes.py +0 -104
  120. llama_stack_api/llama_stack_api/datasets/models.py +0 -152
  121. llama_stack_api/llama_stack_api/datatypes.py +0 -373
  122. llama_stack_api/llama_stack_api/eval.py +0 -137
  123. llama_stack_api/llama_stack_api/file_processors/__init__.py +0 -27
  124. llama_stack_api/llama_stack_api/file_processors/api.py +0 -64
  125. llama_stack_api/llama_stack_api/file_processors/fastapi_routes.py +0 -78
  126. llama_stack_api/llama_stack_api/file_processors/models.py +0 -42
  127. llama_stack_api/llama_stack_api/files/__init__.py +0 -35
  128. llama_stack_api/llama_stack_api/files/api.py +0 -51
  129. llama_stack_api/llama_stack_api/files/fastapi_routes.py +0 -124
  130. llama_stack_api/llama_stack_api/files/models.py +0 -107
  131. llama_stack_api/llama_stack_api/inference.py +0 -1169
  132. llama_stack_api/llama_stack_api/inspect_api/__init__.py +0 -37
  133. llama_stack_api/llama_stack_api/inspect_api/api.py +0 -25
  134. llama_stack_api/llama_stack_api/inspect_api/fastapi_routes.py +0 -76
  135. llama_stack_api/llama_stack_api/inspect_api/models.py +0 -28
  136. llama_stack_api/llama_stack_api/internal/__init__.py +0 -9
  137. llama_stack_api/llama_stack_api/internal/kvstore.py +0 -26
  138. llama_stack_api/llama_stack_api/internal/sqlstore.py +0 -79
  139. llama_stack_api/llama_stack_api/models.py +0 -171
  140. llama_stack_api/llama_stack_api/openai_responses.py +0 -1468
  141. llama_stack_api/llama_stack_api/post_training.py +0 -370
  142. llama_stack_api/llama_stack_api/prompts.py +0 -203
  143. llama_stack_api/llama_stack_api/providers/__init__.py +0 -33
  144. llama_stack_api/llama_stack_api/providers/api.py +0 -16
  145. llama_stack_api/llama_stack_api/providers/fastapi_routes.py +0 -57
  146. llama_stack_api/llama_stack_api/providers/models.py +0 -24
  147. llama_stack_api/llama_stack_api/py.typed +0 -0
  148. llama_stack_api/llama_stack_api/rag_tool.py +0 -168
  149. llama_stack_api/llama_stack_api/resource.py +0 -37
  150. llama_stack_api/llama_stack_api/router_utils.py +0 -160
  151. llama_stack_api/llama_stack_api/safety.py +0 -132
  152. llama_stack_api/llama_stack_api/schema_utils.py +0 -208
  153. llama_stack_api/llama_stack_api/scoring.py +0 -93
  154. llama_stack_api/llama_stack_api/scoring_functions.py +0 -211
  155. llama_stack_api/llama_stack_api/shields.py +0 -93
  156. llama_stack_api/llama_stack_api/tools.py +0 -226
  157. llama_stack_api/llama_stack_api/vector_io.py +0 -941
  158. llama_stack_api/llama_stack_api/vector_stores.py +0 -51
  159. llama_stack_api/llama_stack_api/version.py +0 -9
  160. llama_stack_api/models.py +0 -171
  161. llama_stack_api/openai_responses.py +0 -1468
  162. llama_stack_api/post_training.py +0 -370
  163. llama_stack_api/prompts.py +0 -203
  164. llama_stack_api/providers/__init__.py +0 -33
  165. llama_stack_api/providers/api.py +0 -16
  166. llama_stack_api/providers/fastapi_routes.py +0 -57
  167. llama_stack_api/providers/models.py +0 -24
  168. llama_stack_api/py.typed +0 -0
  169. llama_stack_api/rag_tool.py +0 -168
  170. llama_stack_api/resource.py +0 -37
  171. llama_stack_api/router_utils.py +0 -160
  172. llama_stack_api/safety.py +0 -132
  173. llama_stack_api/schema_utils.py +0 -208
  174. llama_stack_api/scoring.py +0 -93
  175. llama_stack_api/scoring_functions.py +0 -211
  176. llama_stack_api/shields.py +0 -93
  177. llama_stack_api/tools.py +0 -226
  178. llama_stack_api/vector_io.py +0 -941
  179. llama_stack_api/vector_stores.py +0 -51
  180. llama_stack_api/version.py +0 -9
  181. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/entry_points.txt +0 -0
  182. {llama_stack-0.4.2.dist-info → llama_stack-0.4.4.dist-info}/licenses/LICENSE +0 -0
llama_stack/core/library_client.py CHANGED
@@ -161,6 +161,45 @@ class LlamaStackAsLibraryClient(LlamaStackClient):
         """
         pass
 
+    def shutdown(self) -> None:
+        """Shutdown the client and release all resources.
+
+        This method should be called when you're done using the client to properly
+        close database connections and release other resources. Failure to call this
+        method may result in the program hanging on exit while waiting for background
+        threads to complete.
+
+        This method is idempotent and can be called multiple times safely.
+
+        Example:
+            client = LlamaStackAsLibraryClient("starter")
+            # ... use the client ...
+            client.shutdown()
+        """
+        loop = self.loop
+        asyncio.set_event_loop(loop)
+        try:
+            loop.run_until_complete(self.async_client.shutdown())
+        finally:
+            loop.close()
+            asyncio.set_event_loop(None)
+
+    def __enter__(self) -> "LlamaStackAsLibraryClient":
+        """Enter the context manager.
+
+        The client is already initialized in __init__, so this just returns self.
+
+        Example:
+            with LlamaStackAsLibraryClient("starter") as client:
+                response = client.models.list()
+            # Client is automatically shut down here
+        """
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Exit the context manager and shut down the client."""
+        self.shutdown()
+
     def request(self, *args, **kwargs):
         loop = self.loop
         asyncio.set_event_loop(loop)
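A minimal usage sketch of the new sync lifecycle API, mirroring the docstring examples in the hunk above (the `"starter"` distribution name comes from those docstrings; the import path assumes the `llama_stack/core/library_client.py` module listed in this diff):

```python
from llama_stack.core.library_client import LlamaStackAsLibraryClient

# Explicit lifecycle: call shutdown() when finished so database connections
# and background threads are released (shutdown() is documented as idempotent).
client = LlamaStackAsLibraryClient("starter")
try:
    models = client.models.list()
finally:
    client.shutdown()

# Or lean on the context-manager support added in this hunk;
# __exit__ calls shutdown() automatically.
with LlamaStackAsLibraryClient("starter") as client:
    models = client.models.list()
```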
@@ -224,6 +263,7 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         self.custom_provider_registry = custom_provider_registry
         self.provider_data = provider_data
         self.route_impls: RouteImpls | None = None  # Initialize to None to prevent AttributeError
+        self.stack: Stack | None = None
 
     def _remove_root_logger_handlers(self):
         """
@@ -246,9 +286,9 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         try:
             self.route_impls = None
 
-            stack = Stack(self.config, self.custom_provider_registry)
-            await stack.initialize()
-            self.impls = stack.impls
+            self.stack = Stack(self.config, self.custom_provider_registry)
+            await self.stack.initialize()
+            self.impls = self.stack.impls
         except ModuleNotFoundError as _e:
             cprint(_e.msg, color="red", file=sys.stderr)
             cprint(
@@ -283,6 +323,43 @@ class AsyncLlamaStackAsLibraryClient(AsyncLlamaStackClient):
         self.route_impls = initialize_route_impls(self.impls)
         return True
 
+    async def shutdown(self) -> None:
+        """Shutdown the client and release all resources.
+
+        This method should be called when you're done using the client to properly
+        close database connections and release other resources. Failure to call this
+        method may result in the program hanging on exit while waiting for background
+        threads to complete.
+
+        This method is idempotent and can be called multiple times safely.
+
+        Example:
+            client = AsyncLlamaStackAsLibraryClient("starter")
+            await client.initialize()
+            # ... use the client ...
+            await client.shutdown()
+        """
+        if self.stack:
+            await self.stack.shutdown()
+            self.stack = None
+
+    async def __aenter__(self) -> "AsyncLlamaStackAsLibraryClient":
+        """Enter the async context manager.
+
+        Initializes the client and returns it.
+
+        Example:
+            async with AsyncLlamaStackAsLibraryClient("starter") as client:
+                response = await client.models.list()
+            # Client is automatically shut down here
+        """
+        await self.initialize()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        """Exit the async context manager and shut down the client."""
+        await self.shutdown()
+
     async def request(
         self,
         cast_to: Any,
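The async client follows the same pattern; a sketch based on the `__aenter__`/`__aexit__` docstrings above, again assuming the module path from this diff:

```python
import asyncio

from llama_stack.core.library_client import AsyncLlamaStackAsLibraryClient


async def main() -> None:
    # __aenter__ runs initialize(); __aexit__ runs shutdown(), which in turn
    # calls Stack.shutdown() now that the Stack reference is kept on the client.
    async with AsyncLlamaStackAsLibraryClient("starter") as client:
        models = await client.models.list()
        print(models)


asyncio.run(main())
```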
llama_stack/core/routing_tables/common.py CHANGED
@@ -209,6 +209,17 @@ class CommonRoutingTableImpl(RoutingTable):
         logger.info(f"Setting owner for {obj.type} '{obj.identifier}' to {obj.owner.principal}")
 
         registered_obj = await register_object_with_provider(obj, p)
+
+        # Ensure OpenAI metadata exists for vector stores
+        if obj.type == ResourceType.vector_store.value:
+            if hasattr(p, "_ensure_openai_metadata_exists"):
+                await p._ensure_openai_metadata_exists(obj)
+            else:
+                logger.warning(
+                    f"Provider {obj.provider_id} does not support OpenAI metadata creation. "
+                    f"Vector store {obj.identifier} may not work with OpenAI-compatible APIs."
+                )
+
         # TODO: This needs to be fixed for all APIs once they return the registered object
         if obj.type == ResourceType.model.value:
             await self.dist_registry.register(registered_obj)
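The routing table only duck-types the `_ensure_openai_metadata_exists` hook via `hasattr`. A hypothetical provider-side stub, purely to illustrate the contract: the method name comes from the hunk, while the in-memory body below is invented.

```python
class ExampleVectorIOProvider:
    """Hypothetical provider sketch; not code from this package."""

    def __init__(self) -> None:
        # Invented in-memory stand-in for the provider's persisted metadata.
        self._openai_metadata: dict[str, dict] = {}

    async def _ensure_openai_metadata_exists(self, vector_store) -> None:
        # Create the OpenAI-compatible metadata record for stores that were
        # registered through the routing table instead of the OpenAI API.
        if vector_store.identifier not in self._openai_metadata:
            self._openai_metadata[vector_store.identifier] = {
                "id": vector_store.identifier,
                "name": vector_store.identifier,
            }
```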
llama_stack/core/routing_tables/vector_stores.py CHANGED
@@ -55,6 +55,10 @@ class VectorStoresRoutingTable(CommonRoutingTableImpl):
 
     # Internal methods only - no public API exposure
 
+    async def list_vector_stores(self) -> list[VectorStoreWithOwner]:
+        """List all registered vector stores."""
+        return await self.get_all_with_type(ResourceType.vector_store.value)
+
     async def register_vector_store(
         self,
         vector_store_id: str,
llama_stack/core/stack.py CHANGED
@@ -108,6 +108,7 @@ RESOURCES = [
     ),
     ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks", RegisterBenchmarkRequest),
     ("tool_groups", Api.tool_groups, "register_tool_group", "list_tool_groups", None),
+    ("vector_stores", Api.vector_stores, "register_vector_store", "list_vector_stores", None),
 ]
 
 
@@ -620,7 +621,7 @@
     async def shutdown(self):
         for impl in self.impls.values():
             impl_name = impl.__class__.__name__
-            logger.info(f"Shutting down {impl_name}")
+            logger.debug(f"Shutting down {impl_name}")
             try:
                 if hasattr(impl, "shutdown"):
                     await asyncio.wait_for(impl.shutdown(), timeout=5)
@@ -642,6 +643,20 @@
         if REGISTRY_REFRESH_TASK:
             REGISTRY_REFRESH_TASK.cancel()
 
+        # Shutdown storage backends
+        from llama_stack.core.storage.kvstore.kvstore import shutdown_kvstore_backends
+        from llama_stack.core.storage.sqlstore.sqlstore import shutdown_sqlstore_backends
+
+        try:
+            await shutdown_kvstore_backends()
+        except Exception as e:
+            logger.exception(f"Failed to shutdown KV store backends: {e}")
+
+        try:
+            await shutdown_sqlstore_backends()
+        except Exception as e:
+            logger.exception(f"Failed to shutdown SQL store backends: {e}")
+
 
 
 async def refresh_registry_once(impls: dict[Api, Any]):
     logger.debug("refreshing registry")
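A standalone sketch of the bounded-shutdown pattern the two hunks above implement: each impl gets a short window (the 5-second timeout comes from the hunk), then the cached storage backends are drained. Function and variable names here are illustrative, not the stack's.

```python
import asyncio
import logging

logger = logging.getLogger(__name__)


async def shutdown_everything(impls, backend_drains, timeout: float = 5.0) -> None:
    # Give each implementation a bounded window so one hung provider
    # cannot block process exit.
    for impl in impls:
        if hasattr(impl, "shutdown"):
            try:
                await asyncio.wait_for(impl.shutdown(), timeout=timeout)
            except asyncio.TimeoutError:
                logger.warning("%s did not shut down within %ss", type(impl).__name__, timeout)
            except Exception:
                logger.exception("Error shutting down %s", type(impl).__name__)

    # Then close shared storage, mirroring shutdown_kvstore_backends() /
    # shutdown_sqlstore_backends() in the hunk above.
    for drain in backend_drains:
        try:
            await drain()
        except Exception:
            logger.exception("Failed to shut down a storage backend")
```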
llama_stack/core/storage/kvstore/kvstore.py CHANGED
@@ -62,6 +62,9 @@ class InmemoryKVStoreImpl(KVStore):
     async def delete(self, key: str) -> None:
         del self._store[key]
 
+    async def shutdown(self) -> None:
+        self._store.clear()
+
 
 _KVSTORE_BACKENDS: dict[str, KVStoreConfig] = {}
 _KVSTORE_INSTANCES: dict[tuple[str, str], KVStore] = {}
@@ -126,3 +129,11 @@ async def kvstore_impl(reference: KVStoreReference) -> KVStore:
     await impl.initialize()
     _KVSTORE_INSTANCES[cache_key] = impl
     return impl
+
+
+async def shutdown_kvstore_backends() -> None:
+    """Shutdown all cached KV store instances."""
+    global _KVSTORE_INSTANCES
+    for instance in _KVSTORE_INSTANCES.values():
+        await instance.shutdown()
+    _KVSTORE_INSTANCES.clear()
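A generic sketch of the cache-and-drain pattern behind `_KVSTORE_INSTANCES` and `shutdown_kvstore_backends()`; the names below are generic, not the module's.

```python
from typing import Callable, Protocol


class StoreLike(Protocol):
    async def initialize(self) -> None: ...
    async def shutdown(self) -> None: ...


_INSTANCES: dict[tuple[str, str], StoreLike] = {}


async def get_store(backend: str, namespace: str, factory: Callable[[], StoreLike]) -> StoreLike:
    # Reuse one live instance per (backend, namespace) pair.
    key = (backend, namespace)
    if key not in _INSTANCES:
        store = factory()
        await store.initialize()
        _INSTANCES[key] = store
    return _INSTANCES[key]


async def shutdown_stores() -> None:
    # Drain the cache once at process shutdown so connections are released.
    for store in _INSTANCES.values():
        await store.shutdown()
    _INSTANCES.clear()
```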
llama_stack/core/storage/kvstore/mongodb/mongodb.py CHANGED
@@ -83,3 +83,8 @@ class MongoDBKVStoreImpl(KVStore):
         async for doc in cursor:
             result.append(doc["key"])
         return result
+
+    async def shutdown(self) -> None:
+        if self.conn:
+            await self.conn.close()
+            self.conn = None
llama_stack/core/storage/kvstore/postgres/postgres.py CHANGED
@@ -123,3 +123,11 @@ class PostgresKVStoreImpl(KVStore):
             (start_key, end_key),
         )
         return [row[0] for row in cursor.fetchall()]
+
+    async def shutdown(self) -> None:
+        if self._cursor:
+            self._cursor.close()
+            self._cursor = None
+        if self._conn:
+            self._conn.close()
+            self._conn = None
llama_stack/core/storage/kvstore/redis/redis.py CHANGED
@@ -99,3 +99,8 @@ class RedisKVStoreImpl(KVStore):
             if cursor == 0:
                 break
         return result
+
+    async def shutdown(self) -> None:
+        if self._redis:
+            await self._redis.close()
+            self._redis = None
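The three remote KV backends above share one idempotent shape: close the handle if present, then null it out so a repeated `shutdown()` is a no-op. A generic sketch of that shape:

```python
class ClosableKVBackend:
    """Generic illustration of the close-and-clear pattern; not code from this package."""

    def __init__(self, conn) -> None:
        self._conn = conn

    async def shutdown(self) -> None:
        if self._conn:
            # Assumes an awaitable close(), as in the Redis and MongoDB hunks;
            # the Postgres hunk closes its cursor and connection synchronously.
            await self._conn.close()
            self._conn = None
```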
llama_stack/core/storage/sqlstore/sqlalchemy_sqlstore.py CHANGED
@@ -107,6 +107,14 @@ class SqlAlchemySqlStoreImpl(SqlStore):
 
         return engine
 
+    async def shutdown(self) -> None:
+        """Dispose the session maker's engine and close all connections."""
+        # The async_session holds a reference to the engine created in __init__
+        if self.async_session:
+            engine = self.async_session.kw.get("bind")
+            if engine:
+                await engine.dispose()
+
     async def create_table(
         self,
         table: str,
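The disposal above matters because SQLAlchemy async engines keep pooled connections (and their worker threads) alive until `dispose()` is called. A standalone sketch using stock SQLAlchemy 2.x APIs; the `sqlite+aiosqlite` URL is only an example:

```python
import asyncio

from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine


async def main() -> None:
    engine = create_async_engine("sqlite+aiosqlite:///example.db")
    # Binding via `bind=` is what makes the engine recoverable later through
    # session_maker.kw.get("bind"), as the hunk above does.
    session_maker = async_sessionmaker(bind=engine)

    async with session_maker() as session:
        pass  # run queries here

    # Without this, pooled connections can keep the process from exiting,
    # which is the failure mode the new shutdown() method guards against.
    await engine.dispose()


asyncio.run(main())
```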
llama_stack/core/storage/sqlstore/sqlstore.py CHANGED
@@ -85,3 +85,11 @@ def register_sqlstore_backends(backends: dict[str, StorageBackendConfig]) -> Non
     _SQLSTORE_LOCKS.clear()
     for name, cfg in backends.items():
         _SQLSTORE_BACKENDS[name] = cfg
+
+
+async def shutdown_sqlstore_backends() -> None:
+    """Shutdown all cached SQL store instances."""
+    global _SQLSTORE_INSTANCES
+    for instance in _SQLSTORE_INSTANCES.values():
+        await instance.shutdown()
+    _SQLSTORE_INSTANCES.clear()
llama_stack/distributions/dell/doc_template.md ADDED
@@ -0,0 +1,209 @@
+---
+orphan: true
+---
+
+# Dell Distribution of Llama Stack
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations.
+
+{{ providers_table }}
+
+You can use this distribution if you have GPUs and want to run an independent TGI or Dell Enterprise Hub container for running inference.
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+
+## Setting up Inference server using Dell Enterprise Hub's custom TGI container.
+
+NOTE: This is a placeholder to run inference with TGI. This will be updated to use [Dell Enterprise Hub's containers](https://dell.huggingface.co/authenticated/models) once verified.
+
+```bash
+export INFERENCE_PORT=8181
+export DEH_URL=http://0.0.0.0:$INFERENCE_PORT
+export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+export CHROMADB_HOST=localhost
+export CHROMADB_PORT=6601
+export CHROMA_URL=http://$CHROMADB_HOST:$CHROMADB_PORT
+export CUDA_VISIBLE_DEVICES=0
+export LLAMA_STACK_PORT=8321
+
+docker run --rm -it \
+  --pull always \
+  --network host \
+  -v $HOME/.cache/huggingface:/data \
+  -e HF_TOKEN=$HF_TOKEN \
+  -p $INFERENCE_PORT:$INFERENCE_PORT \
+  --gpus $CUDA_VISIBLE_DEVICES \
+  ghcr.io/huggingface/text-generation-inference \
+  --dtype bfloat16 \
+  --usage-stats off \
+  --sharded false \
+  --cuda-memory-fraction 0.7 \
+  --model-id $INFERENCE_MODEL \
+  --port $INFERENCE_PORT --hostname 0.0.0.0
+```
+
+If you are using Llama Stack Safety / Shield APIs, then you will need to also run another instance of a TGI with a corresponding safety model like `meta-llama/Llama-Guard-3-1B` using a script like:
+
+```bash
+export SAFETY_INFERENCE_PORT=8282
+export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export CUDA_VISIBLE_DEVICES=1
+
+docker run --rm -it \
+  --pull always \
+  --network host \
+  -v $HOME/.cache/huggingface:/data \
+  -e HF_TOKEN=$HF_TOKEN \
+  -p $SAFETY_INFERENCE_PORT:$SAFETY_INFERENCE_PORT \
+  --gpus $CUDA_VISIBLE_DEVICES \
+  ghcr.io/huggingface/text-generation-inference \
+  --dtype bfloat16 \
+  --usage-stats off \
+  --sharded false \
+  --cuda-memory-fraction 0.7 \
+  --model-id $SAFETY_MODEL \
+  --hostname 0.0.0.0 \
+  --port $SAFETY_INFERENCE_PORT
+```
+
+## Dell distribution relies on ChromaDB for vector database usage
+
+You can start a chroma-db easily using docker.
+```bash
+# This is where the indices are persisted
+mkdir -p $HOME/chromadb
+
+podman run --rm -it \
+  --network host \
+  --name chromadb \
+  -v $HOME/chromadb:/chroma/chroma \
+  -e IS_PERSISTENT=TRUE \
+  chromadb/chroma:latest \
+  --port $CHROMADB_PORT \
+  --host $CHROMADB_HOST
+```
+
+## Running Llama Stack
+
+Now you are ready to run Llama Stack with TGI as the inference provider. You can do this via Conda (build code) or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+docker run -it \
+  --pull always \
+  --network host \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  # NOTE: mount the llama-stack directory if testing local changes else not needed
+  -v $HOME/git/llama-stack:/app/llama-stack-source \
+  # localhost/distribution-dell:dev if building / testing locally
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }}\
+  --port $LLAMA_STACK_PORT
+
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+# You need a local checkout of llama-stack to run this, get it using
+# git clone https://github.com/meta-llama/llama-stack.git
+cd /path/to/llama-stack
+
+export SAFETY_INFERENCE_PORT=8282
+export DEH_SAFETY_URL=http://0.0.0.0:$SAFETY_INFERENCE_PORT
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
+docker run \
+  -it \
+  --pull always \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  -v ./llama_stack/distributions/tgi/run-with-safety.yaml:/root/my-config.yaml \
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e SAFETY_MODEL=$SAFETY_MODEL \
+  -e DEH_SAFETY_URL=$DEH_SAFETY_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }} \
+  --config /root/my-config.yaml \
+  --port $LLAMA_STACK_PORT
+```
+
+### Via Docker with Custom Run Configuration
+
+You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+```bash
+# Set the path to your custom config.yaml file
+CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+
+docker run -it \
+  --pull always \
+  --network host \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v $HOME/.llama:/root/.llama \
+  -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+  -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+  -e INFERENCE_MODEL=$INFERENCE_MODEL \
+  -e DEH_URL=$DEH_URL \
+  -e CHROMA_URL=$CHROMA_URL \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+{% if run_configs %}
+Available run configurations for this distribution:
+{% for config in run_configs %}
+- `{{ config }}`
+{% endfor %}
+{% endif %}
+
+### Via Conda
+
+Make sure you have done `pip install llama-stack` and have the Llama Stack CLI available.
+
+```bash
+llama stack list-deps {{ name }} | xargs -L1 pip install
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+CHROMA_URL=$CHROMA_URL \
+llama stack run {{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+INFERENCE_MODEL=$INFERENCE_MODEL \
+DEH_URL=$DEH_URL \
+SAFETY_MODEL=$SAFETY_MODEL \
+DEH_SAFETY_URL=$DEH_SAFETY_URL \
+CHROMA_URL=$CHROMA_URL \
+llama stack run ./run-with-safety.yaml \
+  --port $LLAMA_STACK_PORT
+```
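Beyond the template itself, a quick way to confirm the Dell distribution is reachable once the container is up; this assumes the separate `llama-stack-client` package and the `LLAMA_STACK_PORT=8321` exported in the template above:

```python
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="http://localhost:8321")

# Lists the models served by the distribution; attribute names follow the
# current llama-stack-client and may differ across client versions.
for model in client.models.list():
    print(model.identifier)
```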
llama_stack/distributions/meta-reference-gpu/doc_template.md ADDED
@@ -0,0 +1,119 @@
+---
+orphan: true
+---
+# Meta Reference GPU Distribution
+
+```{toctree}
+:maxdepth: 2
+:hidden:
+
+self
+```
+
+The `llamastack/distribution-{{ name }}` distribution consists of the following provider configurations:
+
+{{ providers_table }}
+
+Note that you need access to nvidia GPUs to run this distribution. This distribution is not compatible with CPU-only machines or machines with AMD GPUs.
+
+{% if run_config_env_vars %}
+### Environment Variables
+
+The following environment variables can be configured:
+
+{% for var, (default_value, description) in run_config_env_vars.items() %}
+- `{{ var }}`: {{ description }} (default: `{{ default_value }}`)
+{% endfor %}
+{% endif %}
+
+
+## Prerequisite: Downloading Models
+
+Please check that you have llama model checkpoints downloaded in `~/.llama` before proceeding. See [installation guide](../../references/llama_cli_reference/download_models.md) here to download the models using the Hugging Face CLI.
+```
+
+## Running the Distribution
+
+You can do this via venv or Docker which has a pre-built image.
+
+### Via Docker
+
+This method allows you to get started quickly without having to build the distribution code.
+
+```bash
+LLAMA_STACK_PORT=8321
+docker run \
+  -it \
+  --pull always \
+  --gpu all \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+docker run \
+  -it \
+  --pull always \
+  --gpu all \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -e INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+  -e SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+### Via Docker with Custom Run Configuration
+
+You can also run the Docker container with a custom run configuration file by mounting it into the container:
+
+```bash
+# Set the path to your custom config.yaml file
+CUSTOM_RUN_CONFIG=/path/to/your/custom-config.yaml
+LLAMA_STACK_PORT=8321
+
+docker run \
+  -it \
+  --pull always \
+  --gpu all \
+  -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+  -v ~/.llama:/root/.llama \
+  -v $CUSTOM_RUN_CONFIG:/app/custom-config.yaml \
+  -e RUN_CONFIG_PATH=/app/custom-config.yaml \
+  llamastack/distribution-{{ name }} \
+  --port $LLAMA_STACK_PORT
+```
+
+**Note**: The run configuration must be mounted into the container before it can be used. The `-v` flag mounts your local file into the container, and the `RUN_CONFIG_PATH` environment variable tells the entrypoint script which configuration to use.
+
+{% if run_configs %}
+Available run configurations for this distribution:
+{% for config in run_configs %}
+- `{{ config }}`
+{% endfor %}
+{% endif %}
+
+### Via venv
+
+Make sure you have the Llama Stack CLI available.
+
+```bash
+llama stack list-deps meta-reference-gpu | xargs -L1 uv pip install
+INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+llama stack run distributions/{{ name }}/config.yaml \
+  --port 8321
+```
+
+If you are using Llama Stack Safety / Shield APIs, use:
+
+```bash
+INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct \
+SAFETY_MODEL=meta-llama/Llama-Guard-3-1B \
+llama stack run distributions/{{ name }}/run-with-safety.yaml \
+  --port 8321
+```
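The same distribution can also be embedded in-process with the library client whose lifecycle hooks were added earlier in this diff. The distribution name and environment variable below are taken from the template above; the rest is a sketch and may need adjusting to your setup:

```python
import os

from llama_stack.core.library_client import LlamaStackAsLibraryClient

# INFERENCE_MODEL mirrors the value used in the Docker examples above.
os.environ.setdefault("INFERENCE_MODEL", "meta-llama/Llama-3.2-3B-Instruct")

# __exit__ calls shutdown(), which now also drains the kvstore/sqlstore backends.
with LlamaStackAsLibraryClient("meta-reference-gpu") as client:
    # Model objects expose an `identifier` field in the current client.
    print([m.identifier for m in client.models.list()])
```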