strands-diffusers 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- strands_diffusers/__init__.py +41 -0
- strands_diffusers/_version.py +24 -0
- strands_diffusers/core/__init__.py +4 -0
- strands_diffusers/core/engine.py +163 -0
- strands_diffusers/core/io.py +552 -0
- strands_diffusers/core/registry.py +349 -0
- strands_diffusers/core/viz.py +256 -0
- strands_diffusers/tools/__init__.py +4 -0
- strands_diffusers/tools/use_diffusers.py +420 -0
- strands_diffusers-0.1.0.dist-info/METADATA +199 -0
- strands_diffusers-0.1.0.dist-info/RECORD +13 -0
- strands_diffusers-0.1.0.dist-info/WHEEL +5 -0
- strands_diffusers-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
"""use_diffusers — THE universal entrypoint to all of HuggingFace diffusers.
|
|
2
|
+
|
|
3
|
+
Like `use_aws` wraps boto3, `use_lerobot` wraps lerobot, and `use_transformers`
|
|
4
|
+
wraps the transformers task taxonomy, this wraps the entire diffusers library with
|
|
5
|
+
ZERO hardcoded operations. It is the single tool an agent needs to run any of
|
|
6
|
+
diffusers' 300+ pipelines across every modality:
|
|
7
|
+
|
|
8
|
+
text / image / video / robot-state IN
|
|
9
|
+
image / video / audio / ACTIONS OUT — natively.
|
|
10
|
+
|
|
11
|
+
It has two layers:
|
|
12
|
+
|
|
13
|
+
1. RUN (high-level): construct a pipeline class via from_pretrained and call it.
|
|
14
|
+
Inputs are coerced (paths/URLs/base64 → PIL/video). Outputs (images, video,
|
|
15
|
+
audio, robot actions) are auto-serialized to disk and returned by path.
|
|
16
|
+
|
|
17
|
+
use_diffusers(action="run", pipeline="StableDiffusionPipeline",
|
|
18
|
+
model="runwayml/stable-diffusion-v1-5",
|
|
19
|
+
parameters={"prompt": "a robot in a kitchen",
|
|
20
|
+
"num_inference_steps": 25})
|
|
21
|
+
|
|
22
|
+
# World-foundation model action-policy rollout (Cosmos3): returns BOTH a
|
|
23
|
+
# generated world video AND the predicted robot action chunk.
|
|
24
|
+
use_diffusers(action="run", pipeline="Cosmos3OmniPipeline",
|
|
25
|
+
model="nvidia/Cosmos3-Nano",
|
|
26
|
+
parameters={"prompt": "Put the pot to the left of the cup.",
|
|
27
|
+
"action": "cached:act_cond", "fps": 5,
|
|
28
|
+
"num_inference_steps": 30, "guidance_scale": 1.0})
|
|
29
|
+
|
|
30
|
+
2. CALL (low-level): dynamically resolve & call ANY diffusers class / function /
|
|
31
|
+
method — DiffusionPipeline, schedulers, VAEs, CosmosActionCondition, the
|
|
32
|
+
export_to_video util, or a cached pipeline's method.
|
|
33
|
+
|
|
34
|
+
use_diffusers(action="call", target="CosmosActionCondition",
|
|
35
|
+
parameters={"mode": "policy", "chunk_size": 16,
|
|
36
|
+
"domain_name": "bridge_orig_lerobot",
|
|
37
|
+
"video": "robot.mp4"},
|
|
38
|
+
cache_key="act_cond")
|
|
39
|
+
use_diffusers(action="call", target="cached:pipe.enable_model_cpu_offload")
|
|
40
|
+
|
|
41
|
+
Discovery (so the agent never guesses):
|
|
42
|
+
use_diffusers(action="pipelines") # all pipelines + modality
|
|
43
|
+
use_diffusers(action="models") # all model classes
|
|
44
|
+
use_diffusers(action="schedulers") # all schedulers
|
|
45
|
+
use_diffusers(action="tasks") # AutoPipeline task → class maps
|
|
46
|
+
use_diffusers(action="modalities") # pipelines grouped by modality
|
|
47
|
+
use_diffusers(action="wfm") # world-foundation/action models
|
|
48
|
+
use_diffusers(action="pipeline_info", target="Cosmos3OmniPipeline")
|
|
49
|
+
use_diffusers(action="inspect", target="...") # signature + docs of anything
|
|
50
|
+
use_diffusers(action="cache" | "clear_cache")
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
from __future__ import annotations
|
|
54
|
+
|
|
55
|
+
import json
|
|
56
|
+
import logging
|
|
57
|
+
import subprocess
|
|
58
|
+
import sys
|
|
59
|
+
import traceback
|
|
60
|
+
from typing import Any, Dict, Optional
|
|
61
|
+
|
|
62
|
+
from strands import tool
|
|
63
|
+
|
|
64
|
+
from strands_diffusers.core import engine, io, registry
|
|
65
|
+
|
|
66
|
+
logger = logging.getLogger(__name__)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _ensure(package: str) -> None:
    """Make sure ``package`` is importable, pip-installing it if it is missing.

    The same string is used both as the import name and as the pip
    distribution name, so this only works for packages where the two match.

    Args:
        package: Module name to import (and, on failure, to ``pip install``).

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
        subprocess.TimeoutExpired: If the install takes longer than 600 seconds.
    """
    import importlib

    try:
        importlib.import_module(package)
    except ImportError:
        logger.info("Installing %s ...", package)
        # List-form argv with the running interpreter: no shell involved, and
        # the package lands in the environment this process actually uses.
        subprocess.run([sys.executable, "-m", "pip", "install", package],
                       check=True, timeout=600)
        # The import system's finders cache directory contents; a package
        # installed while the process is running may stay invisible to later
        # imports until those caches are dropped.
        importlib.invalidate_caches()
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _ok(text: str, **extra: Any) -> Dict[str, Any]:
    """Build a success response for the tool.

    Args:
        text: Human-readable message placed in the single content entry.
        **extra: Additional top-level keys (e.g. ``data``, ``artifacts``)
            merged into the response; they override the defaults on clash.

    Returns:
        A dict with ``status``, ``content`` and any extra keys.
    """
    return {"status": "success", "content": [{"text": text}], **extra}
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _err(text: str) -> Dict[str, Any]:
    """Build an error response for the tool.

    Args:
        text: Human-readable error message placed in the single content entry.

    Returns:
        A dict with ``status`` set to ``"error"`` and the message as content.
    """
    message = {"text": text}
    return dict(status="error", content=[message])
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@tool
def use_diffusers(
    action: str = "pipelines",
    pipeline: Optional[str] = None,
    model: Optional[str] = None,
    inputs: Any = None,
    target: Optional[str] = None,
    parameters: Optional[Dict[str, Any]] = None,
    cache_key: Optional[str] = None,
    device: Optional[str] = None,
    dtype: Optional[str] = None,
    save_artifacts: bool = True,
    fps: float = 24.0,
    label: str = "",
) -> Dict[str, Any]:
    """Universal access to ALL diffusers functionality — no hardcoding.

    Args:
        action: What to do:
            run           — load a `pipeline` class from `model` and call it on `parameters`
            call          — dynamically call any diffusers class/function/method via `target`
            pipelines     — list every pipeline class with its derived modality
            models        — list every model class (VAEs, transformers, controlnets)
            schedulers    — list every scheduler class
            tasks         — diffusers' AutoPipeline task → {family: class} maps
            modalities    — pipelines grouped by modality (text-to-image/video/world/...)
            wfm           — world-foundation / action-capable pipelines (Cosmos, Wan, ...)
            pipeline_info — modality + __call__ signature for one `target` pipeline class
            inspect       — signature + docstring of any `target`
            cache         — list cached pipelines/objects
            clear_cache   — free a `cache_key` (or everything)
            visualize     — render a robot action chunk to plots + an animation (SEE it)
        pipeline: diffusers pipeline class name for action="run" (e.g.
            "StableDiffusionPipeline", "Cosmos3OmniPipeline"). Use
            "DiffusionPipeline"/"AutoPipelineForText2Image" for auto-detect.
        model: HF repo id or local path to load weights from.
        inputs: convenience positional input merged into the pipeline call (rarely
            needed — most diffusers pipelines take keyword args via `parameters`).
            For action="visualize" it carries the raw action (list/dict).
        target: For action="call"/"inspect"/"pipeline_info": dotted path into
            diffusers, e.g. "CosmosActionCondition", "utils.export_to_video",
            "cached:key.method". For action="visualize": "cached:key", an inline
            JSON list, or a path to an action .json file.
        parameters: kwargs for the pipeline call / the dynamic call. Values that are
            "cached:key" resolve to live cached objects; path/URL/base64
            values are coerced to PIL/video automatically. May also be given
            as a JSON string. For action="run", a `from_pretrained_kwargs`
            dict is split off and forwarded to the pipeline loader. For
            action="call", a "**" key holding a mapping is unpacked into the
            call kwargs (explicit keys win on clash).
        cache_key: name to cache (or fetch) a loaded object under.
        device: "cuda" / "mps" / "cpu" / "auto".
        dtype: "bfloat16" / "float16" / "float32" (default: device-appropriate).
        save_artifacts: write generated images/video/audio/actions to disk.
        fps: frames-per-second used when exporting generated video.
        label: human-readable description for logging.

    Returns:
        Dict with status + content; "run"/"call" also include "data" (JSON-safe
        result) and "artifacts" (paths to generated media + action JSON).
        Failures are reported as a status="error" dict rather than raised.
    """
    # LLM tool-calls may serialize the `parameters` object to a JSON string;
    # also accept that gracefully (else dict(str) raises a cryptic ValueError).
    if isinstance(parameters, str):
        try:
            parameters = json.loads(parameters)
        except (ValueError, TypeError):
            return _err("`parameters` must be a JSON object/dict; got an "
                        f"unparseable string: {parameters[:120]!r}")
    if parameters is not None and not isinstance(parameters, dict):
        return _err(f"`parameters` must be a dict, got {type(parameters).__name__}.")
    # Shallow copy: the run path pops keys and must not mutate the caller's dict.
    params = dict(parameters or {})
    try:
        # ───────── discovery ─────────
        if action == "pipelines":
            pipes = registry.pipelines()
            lines = [f"🎨 diffusers exposes {len(pipes)} pipelines (100% coverage):\n"]
            for p in pipes:
                lines.append(f" • {p} [{registry.modality_of(p)}]")
            lines.append('\n💡 run: use_diffusers(action="run", pipeline="<class>", '
                         'model="<repo>", parameters={...})')
            return _ok("\n".join(lines), data=pipes)

        if action == "models":
            ms = registry.models()
            return _ok(f"🧩 {len(ms)} model classes:\n " + "\n ".join(ms), data=ms)

        if action == "schedulers":
            sc = registry.schedulers()
            return _ok(f"⏱️ {len(sc)} schedulers:\n " + "\n ".join(sc), data=sc)

        if action == "tasks":
            t = registry.auto_pipeline_tasks()
            return _ok("🗂️ AutoPipeline task maps:\n" + json.dumps(t, indent=2), data=t)

        if action == "modalities":
            groups = registry.tasks_by_modality()
            lines = ["🎛️ Pipelines by modality:\n"]
            for mod in sorted(groups):
                lines.append(f" {mod} ({len(groups[mod])}):")
                for p in groups[mod]:
                    lines.append(f" • {p}")
            return _ok("\n".join(lines), data=groups)

        if action == "wfm":
            wfm = registry.world_foundation_models()
            lines = ["🌍 World-foundation / action-capable pipelines:\n"]
            for p in wfm:
                lines.append(f" • {p} [{registry.modality_of(p)}]")
            lines.append("\n💡 Cosmos action-policy runs return BOTH a world video "
                         "and a robot action chunk.\n"
                         " Pass a CosmosActionCondition via parameters={'action': "
                         "'cached:cond'}.")
            return _ok("\n".join(lines), data=wfm)

        if action == "pipeline_info":
            if not target:
                return _err("Provide `target` (a pipeline class name).")
            info = registry.pipeline_info(target)
            if not info:
                return _err(f"Unknown pipeline '{target}'. Use action='pipelines'.")
            return _ok(f"🔍 {target}\n{json.dumps(info, indent=2, default=str)}", data=info)

        if action == "inspect":
            if not target:
                return _err("Provide `target` (e.g. 'StableDiffusionPipeline').")
            obj = _resolve_target(target)
            info = registry.describe(obj)
            return _ok(f"🔍 {target}\n{json.dumps(info, indent=2, default=str)}", data=info)

        if action == "cache":
            c = engine.cache_list()
            if not c:
                return _ok("📦 cache empty")
            return _ok("📦 cached:\n" + "\n".join(f" • {k}: {v}" for k, v in c.items()),
                       data=c)

        if action == "clear_cache":
            n = engine.cache_clear(cache_key)
            return _ok(f"🧹 cleared {n} object(s)")

        if action == "visualize":
            # Turn a robot ACTION chunk into plots + an animation you can watch.
            # `target` may be a path to an action .json artifact, or pass the raw
            # action via `inputs` (nested list / serialized dict).
            from strands_diffusers.core import viz
            act = inputs
            if act is None and target:
                if target.startswith("cached:"):
                    act = _resolve_target(target)
                elif target.lstrip().startswith("["):
                    act = json.loads(target)  # inline JSON action
                else:
                    # JSON files are UTF-8; don't depend on the locale's
                    # default encoding when reading the artifact back.
                    with open(target, encoding="utf-8") as f:
                        act = json.load(f)
            if act is None:
                return _err("Provide an action via `inputs` (list/dict) or `target` "
                            "(path to an action .json, or cached:key).")
            res = viz.visualize_action(
                act,
                save_prefix=params.get("save_prefix", "action"),
                interpret_xyz=params.get("interpret_xyz", True),
                gripper_index=params.get("gripper_index", -1),
                cumulative_xyz=params.get("cumulative_xyz", True),
                world_video=params.get("world_video"),
                fps=int(params.get("fps", fps)),
                dim_labels=params.get("dim_labels"),
            )
            arts = res["artifacts"]
            head = "🎬 action visualization\n📎 artifacts:\n" + "\n".join(
                f" • {a}" for a in arts)
            return _ok(f"{head}\n{json.dumps(res['summary'], indent=2)}",
                       data=res["summary"], artifacts=arts)

        # ───────── run (pipeline) ─────────
        if action == "run":
            if not pipeline:
                return _err("Provide `pipeline` (a class name). Use action='pipelines'.")
            if not model:
                return _err("Provide `model` (HF repo id or local path).")
            # Modular pipelines have a different lifecycle: from_pretrained loads
            # CONFIG ONLY, components must be loaded via load_components(), and
            # __call__(state, output) takes a PipelineState — not prompt=... So the
            # generic run path (from_pretrained -> .to() -> pipe(**kwargs)) won't work.
            if pipeline.endswith("ModularPipeline"):
                return _err(
                    f"'{pipeline}' is a Modular pipeline with a different lifecycle "
                    "(from_pretrained loads config only; call load_components() then "
                    "invoke with a PipelineState, not prompt=...). The high-level "
                    "`run` path doesn't support Modular pipelines yet. Use action='call' "
                    "to drive it manually, or pick the non-modular variant "
                    f"(e.g. '{pipeline.replace('Modular','')}').")
            _ensure("diffusers")
            # Loader options ride inside `parameters`; split them off so they
            # are not passed to the pipeline call itself. An explicit null is
            # treated the same as "not provided".
            from_pretrained_kwargs = params.pop("from_pretrained_kwargs", None) or {}
            pipe, key = engine.get_pipeline(
                pipeline, model, device=device, dtype=dtype,
                cache_key=cache_key, **from_pretrained_kwargs)
            call_kwargs = _coerce_kwargs(params)
            if inputs is not None:
                call_args = [_coerce_param(inputs)]
            else:
                call_args = []
            if label:
                logger.info("run %s (%s): %s", pipeline, model, label)
            result = pipe(*call_args, **call_kwargs)
            sr = _infer_sample_rate(pipe)
            # A pipeline-level "fps" kwarg takes precedence over the tool-level
            # `fps` when exporting video.
            out = io.serialize_output(result, save_artifacts=save_artifacts,
                                      fps=call_kwargs.get("fps", fps),
                                      audio_sample_rate=sr)
            return _ok(_summarize(pipeline, out, key),
                       data=out.get("result"), artifacts=out.get("artifacts", []))

        # ───────── call (dynamic) ─────────
        if action == "call":
            if not target:
                return _err("Provide `target` (e.g. 'CosmosActionCondition' or "
                            "'utils.export_to_video').")
            _ensure("diffusers")
            obj = _resolve_target(target)
            if not callable(obj):
                # Plain attribute: report its (truncated) string form instead.
                return _ok(f"📋 {target} = {str(obj)[:500]}", data=str(obj)[:2000])
            coerced = _coerce_kwargs(params)
            # "**" unpacks a mapping into kwargs; explicitly named keys win.
            unpacked = coerced.pop("**", None)
            if unpacked is not None:
                try:
                    coerced = {**dict(unpacked), **coerced}
                except (TypeError, ValueError) as ue:
                    return _err(f"❌ '**' value is not a mapping: {ue}")
            result = obj(**coerced)
            if cache_key:
                # Keep the live object for later "cached:<key>" references;
                # it is returned by name/type only, not serialized.
                engine._CACHE[cache_key] = result
                return _ok(f"✅ {target}() → cached as '{cache_key}' "
                           f"({type(result).__name__})",
                           data={"cached": cache_key, "type": type(result).__name__})
            out = io.serialize_output(result, save_artifacts=save_artifacts, fps=fps)
            preview = json.dumps(out.get("result"), indent=2, default=str)
            if len(preview) > 2000:
                preview = preview[:2000] + " …"
            arts = out.get("artifacts", [])
            head = f"✅ {target}() → {type(result).__name__}"
            if arts:
                head += "\n📎 artifacts:\n" + "\n".join(f" • {a}" for a in arts)
            return _ok(f"{head}\n{preview}", data=out.get("result"), artifacts=arts)

        return _err(f"Unknown action '{action}'. Try: pipelines, models, schedulers, "
                    f"tasks, modalities, wfm, pipeline_info, inspect, run, call, "
                    f"visualize, cache, clear_cache.")

    except TypeError as e:
        # Most TypeErrors here are wrong/missing kwargs, so append the expected
        # signature as a hint. Building the hint is best-effort: any failure
        # while doing so must not mask the original error.
        hint = ""
        try:
            if target:
                hint = "\n\nExpected:\n" + json.dumps(
                    registry.describe(_resolve_target(target)), indent=2, default=str)
            elif pipeline:
                hint = "\n\nExpected:\n" + json.dumps(
                    registry.pipeline_info(pipeline), indent=2, default=str)
        except Exception:
            pass
        return _err(f"❌ TypeError: {e}{hint}")
    except Exception as e:
        # Tool boundary: log the full traceback, return only its tail.
        logger.error("use_diffusers(%s) failed: %s", action, e, exc_info=True)
        return _err(f"❌ {type(e).__name__}: {e}\n\n{traceback.format_exc()[-800:]}")
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _resolve_target(target: str) -> Any:
    """Turn a target string into the object it names.

    A ``"cached:<key>[.attr[.attr...]]"`` target is looked up in the engine
    cache and then walked attribute by attribute; anything else is handed to
    ``registry.resolve_attr``.

    Args:
        target: Either a ``cached:`` reference or a path for the registry.

    Returns:
        The resolved object.

    Raises:
        ValueError: If the cache lookup for ``<key>`` yields ``None``.
        AttributeError: If an attribute in the ``cached:`` path is missing.
    """
    prefix = "cached:"
    if not target.startswith(prefix):
        return registry.resolve_attr(target)

    key, _, attr_path = target[len(prefix):].partition(".")
    resolved = engine.cache_get(key)
    if resolved is None:
        raise ValueError(f"No cached object '{key}'. Use action='cache' to list.")
    for name in attr_path.split("."):
        # Empty segments (no attribute path, or stray dots) are skipped.
        if name:
            resolved = getattr(resolved, name)
    return resolved
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
# Keyword names whose values are passed through to the callee verbatim.
_OUTPUT_PATH_KEYS = (
    "output_path",
    "output_video_path",
    "output_obj_path",
    "output_ply_path",
    "save_path",
    "out_path",
)


def _coerce_kwargs(params: dict) -> dict:
    """Return a new dict with every value coerced except OUTPUT path keys.

    Values under the names in ``_OUTPUT_PATH_KEYS`` are copied untouched:
    coercing an already-existing output path would load it as media (a subtle
    idempotency bug). All other values go through ``_coerce_param``.
    """
    return {
        name: (value if name in _OUTPUT_PATH_KEYS else _coerce_param(value))
        for name, value in params.items()
    }
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _coerce_param(value: Any) -> Any:
    """Coerce one parameter value, recursing into lists and dicts.

    Strings of the form "cached:key[.attr]" are swapped for the live cached
    object (so pipelines can receive e.g. action=cached:cond or
    scheduler=cached:sched). Lists and dicts are rebuilt with each element /
    value coerced. Every other value is handed to ``io.coerce_input`` for
    multimodal input coercion (paths/URLs/base64 → PIL/video).
    """
    if isinstance(value, dict):
        return {key: _coerce_param(item) for key, item in value.items()}
    if isinstance(value, list):
        return [_coerce_param(item) for item in value]
    if isinstance(value, str) and value.startswith("cached:"):
        return _resolve_target(value)
    return io.coerce_input(value)
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def _infer_sample_rate(pipe: Any, default: int = 16000) -> int:
    """Best-effort audio sample-rate discovery for pipelines that emit sound.

    Looks at the ``config`` of the pipeline's ``sound_tokenizer``, ``vocoder``
    and ``audio_encoder`` attributes (in that order), then at the pipeline's
    own ``config``. On each config both spellings, ``sampling_rate`` and
    ``sample_rate``, are accepted; the first truthy value wins.

    Args:
        pipe: Any object; missing attributes are simply skipped.
        default: Value returned when no sample rate is found.

    Returns:
        The discovered sample rate as an int, or ``default``.
    """
    def _rate(cfg: Any) -> Any:
        # Accept either spelling; a missing or falsy value falls through.
        return getattr(cfg, "sampling_rate", None) or getattr(cfg, "sample_rate", None)

    for attr in ("sound_tokenizer", "vocoder", "audio_encoder"):
        sr = _rate(getattr(getattr(pipe, attr, None), "config", None))
        if sr:
            return int(sr)
    # Pipeline-level fallback: checked with the same two spellings as the
    # component configs so the lookups stay consistent.
    sr = _rate(getattr(pipe, "config", None))
    return int(sr) if sr else default
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def _summarize(pipeline: str, out: Dict[str, Any], key: str) -> str:
    """Format the text summary returned after a pipeline run.

    Args:
        pipeline: Pipeline class name shown in the headline.
        out: Serialized output; its "artifacts" list and "result" value are read.
        key: Cache key shown in parentheses after the pipeline name.

    Returns:
        Headline, an optional artifact list, then a JSON preview of the result
        (cut to 2000 characters plus an ellipsis marker when longer).
    """
    sections = [f"✅ {pipeline} ({key})"]
    artifacts = out.get("artifacts", [])
    if artifacts:
        sections.append("📎 artifacts:")
        sections.extend(f" • {path}" for path in artifacts)
    body = json.dumps(out.get("result"), indent=2, default=str)
    if len(body) > 2000:
        body = body[:2000] + " …"
    sections.append(body)
    return "\n".join(sections)
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
__all__ = ["use_diffusers"]
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: strands-diffusers
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: The universal entrypoint to HuggingFace diffusers for Strands agents — 100% pipeline & modality coverage, zero hardcoding. Special focus on Physical-AI world-foundation models (Cosmos) with robot action outputs.
|
|
5
|
+
Author-email: Cagatay Cali <cagataycali@icloud.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/cagataycali/strands-diffusers
|
|
8
|
+
Project-URL: Repository, https://github.com/cagataycali/strands-diffusers
|
|
9
|
+
Project-URL: Issues, https://github.com/cagataycali/strands-diffusers/issues
|
|
10
|
+
Keywords: strands,diffusers,huggingface,ai,agents,diffusion,video,image,vla,wfm,world-foundation-model,cosmos,robotics,physical-ai
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: strands-agents
|
|
23
|
+
Requires-Dist: diffusers>=0.30
|
|
24
|
+
Requires-Dist: transformers>=4.40
|
|
25
|
+
Requires-Dist: torch
|
|
26
|
+
Requires-Dist: pillow
|
|
27
|
+
Requires-Dist: numpy
|
|
28
|
+
Requires-Dist: accelerate
|
|
29
|
+
Provides-Extra: video
|
|
30
|
+
Requires-Dist: imageio[ffmpeg]; extra == "video"
|
|
31
|
+
Requires-Dist: opencv-python; extra == "video"
|
|
32
|
+
Requires-Dist: av; extra == "video"
|
|
33
|
+
Provides-Extra: audio
|
|
34
|
+
Requires-Dist: soundfile; extra == "audio"
|
|
35
|
+
Requires-Dist: librosa; extra == "audio"
|
|
36
|
+
Provides-Extra: cosmos
|
|
37
|
+
Requires-Dist: cosmos_guardrail; extra == "cosmos"
|
|
38
|
+
Provides-Extra: dev
|
|
39
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
40
|
+
Requires-Dist: black; extra == "dev"
|
|
41
|
+
Requires-Dist: ruff; extra == "dev"
|
|
42
|
+
Provides-Extra: all
|
|
43
|
+
Requires-Dist: strands-diffusers[audio,dev,video]; extra == "all"
|
|
44
|
+
|
|
45
|
+
# 🎨 strands-diffusers
|
|
46
|
+
|
|
47
|
+
**The universal entrypoint to HuggingFace `diffusers` for Strands agents — 100%
|
|
48
|
+
pipeline & modality coverage, zero hardcoding.**
|
|
49
|
+
|
|
50
|
+
Just like [`use_aws`](https://github.com/strands-agents) wraps boto3,
|
|
51
|
+
[`use_lerobot`](https://github.com/cagataycali) wraps lerobot, and
|
|
52
|
+
[`use_transformers`](https://github.com/cagataycali/strands-transformers) wraps the
|
|
53
|
+
transformers task taxonomy, **`use_diffusers`** wraps the *entire* diffusers
|
|
54
|
+
library behind a single tool. Discover, don't hardcode: the registry is built at
|
|
55
|
+
runtime from `diffusers._import_structure`, so when diffusers ships a new pipeline
|
|
56
|
+
(say, a fresh Cosmos world-foundation model), strands-diffusers supports it
|
|
57
|
+
**automatically — no code change required**.
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
text / image / video / robot-state IN
|
|
61
|
+
image / video / audio / ACTIONS OUT — natively.
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## 🌍 Physical-AI focus: world-foundation models with action outputs
|
|
65
|
+
|
|
66
|
+
The headline use-case is **NVIDIA Cosmos** and other world-foundation models
|
|
67
|
+
(WFMs). A Cosmos 3 *action-policy* rollout doesn't just generate a plausible
|
|
68
|
+
future video — it predicts the **robot action chunk** that produces it. A single
|
|
69
|
+
`use_diffusers(action="run", ...)` call returns BOTH:
|
|
70
|
+
|
|
71
|
+
- a playable world **video** (`.mp4`)
|
|
72
|
+
- the predicted **action** chunk in model-normalized action space (`.json`,
|
|
73
|
+
shape `[num_chunks, T, action_dim]`)
|
|
74
|
+
- (optionally) synchronized **sound** (`.wav`)
|
|
75
|
+
|
|
76
|
+
— all surfaced as artifact paths, ready to hand to a robot controller or the user.
|
|
77
|
+
|
|
78
|
+
> **Verified end-to-end** on NVIDIA Thor (diffusers `0.39.0.dev0`, `nvidia/Cosmos3-Nano`,
|
|
79
|
+
> bf16/cuda): one `use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", ...)`
|
|
80
|
+
> call produced a world video `(17, 480, 640, 3)` **and** a robot action chunk
|
|
81
|
+
> `(1, 16, 10)` = `(num_chunks, T, action_dim)`, normalized to `[-1, 1]`.
|
|
82
|
+
> See [`examples/cosmos_action_policy.py`](examples/cosmos_action_policy.py) and
|
|
83
|
+
> [`examples/SETUP_COSMOS.md`](examples/SETUP_COSMOS.md).
|
|
84
|
+
|
|
85
|
+
## Install
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pip install -e .
|
|
89
|
+
# optional extras:
|
|
90
|
+
pip install -e ".[video,audio]" # mp4 export, wav I/O
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Quick start
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from strands import Agent
|
|
97
|
+
from strands_diffusers import use_diffusers
|
|
98
|
+
|
|
99
|
+
agent = Agent(tools=[use_diffusers])
|
|
100
|
+
agent("Generate an image of a robot arm in a kitchen")
|
|
101
|
+
agent("Run a Cosmos action-policy rollout on robot.mp4 and give me the actions")
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Or drive it directly:
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from strands_diffusers import use_diffusers
|
|
108
|
+
|
|
109
|
+
# text → image
|
|
110
|
+
use_diffusers(
|
|
111
|
+
action="run",
|
|
112
|
+
pipeline="StableDiffusionPipeline",
|
|
113
|
+
model="stabilityai/stable-diffusion-2-1",
|
|
114
|
+
parameters={"prompt": "a robot arm in a kitchen", "num_inference_steps": 25},
|
|
115
|
+
)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Two layers
|
|
119
|
+
|
|
120
|
+
### 1. `run` — high-level pipeline runner
|
|
121
|
+
|
|
122
|
+
Loads a pipeline class via `from_pretrained` and calls it. Inputs are coerced
|
|
123
|
+
(paths / URLs / base64 → PIL / video); outputs (image / video / audio / action)
|
|
124
|
+
are auto-saved and returned by path.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
use_diffusers(action="run", pipeline="WanPipeline", model="...",
|
|
128
|
+
parameters={"prompt": "...", "num_frames": 81}, fps=16)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### 2. `call` — low-level dynamic dispatch
|
|
132
|
+
|
|
133
|
+
Resolve & call *any* diffusers class / function / method — schedulers, VAEs,
|
|
134
|
+
`CosmosActionCondition`, `utils.export_to_video`, or a cached pipeline's method.
|
|
135
|
+
`cached:key` references resolve to live objects; the `"**"` key unpacks a cached
|
|
136
|
+
mapping into kwargs (the `pipe(**inputs)` pattern).
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
# Build a Cosmos action condition, cache it, then run an action-policy rollout.
|
|
140
|
+
use_diffusers(action="call", target="CosmosActionCondition",
|
|
141
|
+
parameters={"mode": "policy", "chunk_size": 16,
|
|
142
|
+
"domain_name": "bridge_orig_lerobot",
|
|
143
|
+
"resolution_tier": 480, "video": "robot.mp4",
|
|
144
|
+
"view_point": "ego_view"},
|
|
145
|
+
cache_key="act_cond")
|
|
146
|
+
|
|
147
|
+
use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", model="nvidia/Cosmos3-Nano",
|
|
148
|
+
parameters={"prompt": "Put the pot to the left of the purple item.",
|
|
149
|
+
"action": "cached:act_cond", "fps": 5,
|
|
150
|
+
"num_inference_steps": 30, "guidance_scale": 1.0,
|
|
151
|
+
"use_system_prompt": False},
|
|
152
|
+
dtype="bfloat16", device="cuda")
|
|
153
|
+
# → artifacts: cosmos_world.mp4 + action chunk .json ([1, 16, action_dim])
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## Discovery (the agent never guesses)
|
|
157
|
+
|
|
158
|
+
| action | what it returns |
|
|
159
|
+
|---|---|
|
|
160
|
+
| `pipelines` | all 300+ pipeline classes + derived modality |
|
|
161
|
+
| `models` | every model class (VAEs, transformers, controlnets) |
|
|
162
|
+
| `schedulers` | every scheduler class |
|
|
163
|
+
| `tasks` | diffusers' `AutoPipeline` task → `{family: class}` maps |
|
|
164
|
+
| `modalities` | pipelines grouped by modality (image / video / world / audio / **3d** mesh) |
|
|
165
|
+
| `wfm` | world-foundation / action-capable pipelines (Cosmos, Wan, Hunyuan) |
|
|
166
|
+
| `pipeline_info` | modality + `__call__` signature for one pipeline class |
|
|
167
|
+
| `inspect` | signature + docstring of any target |
|
|
168
|
+
| `visualize` | render a robot ACTION chunk → time-series + 3D trajectory + animation (mp4/gif) |
|
|
169
|
+
| `cache` / `clear_cache` | manage loaded pipelines (free GPU memory) |
|
|
170
|
+
|
|
171
|
+
## Architecture
|
|
172
|
+
|
|
173
|
+
```
|
|
174
|
+
strands_diffusers/
|
|
175
|
+
├── core/
|
|
176
|
+
│ ├── registry.py # zero-hardcode taxonomy from diffusers._import_structure
|
|
177
|
+
│ ├── engine.py # load/cache pipelines, auto device+dtype
|
|
178
|
+
│ ├── io.py # coerce inputs; serialize video/image/audio/ACTION outputs
│ └── viz.py # render robot ACTION chunks to plots + an animation (used by action="visualize")
|
|
179
|
+
└── tools/
|
|
180
|
+
└── use_diffusers.py # the single @tool: run + call + discovery
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Testing
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
pip install -e ".[video,audio,dev]"
|
|
187
|
+
pytest tests/ -q # 26 unit tests — no GPU, no model downloads
|
|
188
|
+
python examples/smoke.py # E2E gate on tiny HF fixtures
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
`tests/` covers the registry classifier (golden modalities + a guard that no
|
|
192
|
+
video/WFM pipeline is ever mislabeled as a still image), and the multimodal I/O
|
|
193
|
+
serializers — image, video (incl. `list[ndarray]`), **stereo audio** (channels-
|
|
194
|
+
first *and* channels-last), the robot **action** chunk, and **3D mesh** output
|
|
195
|
+
(ShapE → `.ply`/`.obj`/`.npz`). CI runs both on py3.10 + py3.12.
|
|
196
|
+
|
|
197
|
+
## License
|
|
198
|
+
|
|
199
|
+
MIT
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
strands_diffusers/__init__.py,sha256=Q4NNtYiKM6vfSWNT6bELwT4jHXKLURfU3EODyg91Er0,1626
|
|
2
|
+
strands_diffusers/_version.py,sha256=n_5vdJsPNu7wZ57LGuRL585uvll-hiuvZUBWzdG0RQU,520
|
|
3
|
+
strands_diffusers/core/__init__.py,sha256=JkPoeqWE7chElrH2vX7nQLudvtEYCJEI_vm-3zfEi0Q,155
|
|
4
|
+
strands_diffusers/core/engine.py,sha256=GNivOysffOuwcqI3eTIiO-upyCWhSRpLqe3O8UekPV4,5288
|
|
5
|
+
strands_diffusers/core/io.py,sha256=sDfGm7AjIoVv1r-tkiCDtdQp-ZwRVz8dCcfD8B7pX5I,19606
|
|
6
|
+
strands_diffusers/core/registry.py,sha256=2Rur8lZ2vM6xSoJn2Y_yOoGtvRJWiBa97wvBuYMlGeE,13911
|
|
7
|
+
strands_diffusers/core/viz.py,sha256=rVHm6zP7spXdmbWIAJmuRO8wyuBA5AJKSYrrPp28d1I,10228
|
|
8
|
+
strands_diffusers/tools/__init__.py,sha256=JLlbY6iyRf_pZaLZfuMYL67H81iKVyHSSqM9YB_nzjE,128
|
|
9
|
+
strands_diffusers/tools/use_diffusers.py,sha256=my0Ht2LYykEO5KMoTDKV9IFgpfNMH9ZDx6qVivEZ9aQ,19742
|
|
10
|
+
strands_diffusers-0.1.0.dist-info/METADATA,sha256=aG4auqNO2W3-yVr2DUnJ5GvAoXF4KFDDeBisXyLijro,8115
|
|
11
|
+
strands_diffusers-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
12
|
+
strands_diffusers-0.1.0.dist-info/top_level.txt,sha256=MekZfW7p0w1JaIoXVYGwojdd2guaVG54KVuZKmjosH4,18
|
|
13
|
+
strands_diffusers-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
strands_diffusers
|