strands-diffusers 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,420 @@
1
+ """use_diffusers — THE universal entrypoint to all of HuggingFace diffusers.
2
+
3
+ Like `use_aws` wraps boto3, `use_lerobot` wraps lerobot, and `use_transformers`
4
+ wraps the transformers task taxonomy, this wraps the entire diffusers library with
5
+ ZERO hardcoded operations. It is the single tool an agent needs to run any of
6
+ diffusers' 300+ pipelines across every modality:
7
+
8
+ text / image / video / robot-state IN
9
+ image / video / audio / ACTIONS OUT — natively.
10
+
11
+ It has two layers:
12
+
13
+ 1. RUN (high-level): construct a pipeline class via from_pretrained and call it.
14
+ Inputs are coerced (paths/URLs/base64 → PIL/video). Outputs (images, video,
15
+ audio, robot actions) are auto-serialized to disk and returned by path.
16
+
17
+ use_diffusers(action="run", pipeline="StableDiffusionPipeline",
18
+ model="runwayml/stable-diffusion-v1-5",
19
+ parameters={"prompt": "a robot in a kitchen",
20
+ "num_inference_steps": 25})
21
+
22
+ # World-foundation model action-policy rollout (Cosmos3): returns BOTH a
23
+ # generated world video AND the predicted robot action chunk.
24
+ use_diffusers(action="run", pipeline="Cosmos3OmniPipeline",
25
+ model="nvidia/Cosmos3-Nano",
26
+ parameters={"prompt": "Put the pot to the left of the cup.",
27
+ "action": "cached:act_cond", "fps": 5,
28
+ "num_inference_steps": 30, "guidance_scale": 1.0})
29
+
30
+ 2. CALL (low-level): dynamically resolve & call ANY diffusers class / function /
31
+ method — DiffusionPipeline, schedulers, VAEs, CosmosActionCondition, the
32
+ export_to_video util, or a cached pipeline's method.
33
+
34
+ use_diffusers(action="call", target="CosmosActionCondition",
35
+ parameters={"mode": "policy", "chunk_size": 16,
36
+ "domain_name": "bridge_orig_lerobot",
37
+ "video": "robot.mp4"},
38
+ cache_key="act_cond")
39
+ use_diffusers(action="call", target="cached:pipe.enable_model_cpu_offload")
40
+
41
+ Discovery (so the agent never guesses):
42
+ use_diffusers(action="pipelines") # all pipelines + modality
43
+ use_diffusers(action="models") # all model classes
44
+ use_diffusers(action="schedulers") # all schedulers
45
+ use_diffusers(action="tasks") # AutoPipeline task → class maps
46
+ use_diffusers(action="modalities") # pipelines grouped by modality
47
+ use_diffusers(action="wfm") # world-foundation/action models
48
+ use_diffusers(action="pipeline_info", target="Cosmos3OmniPipeline")
49
+ use_diffusers(action="inspect", target="...") # signature + docs of anything
50
+ use_diffusers(action="cache" | "clear_cache")
51
+ """
52
+
53
+ from __future__ import annotations
54
+
55
+ import json
56
+ import logging
57
+ import subprocess
58
+ import sys
59
+ import traceback
60
+ from typing import Any, Dict, Optional
61
+
62
+ from strands import tool
63
+
64
+ from strands_diffusers.core import engine, io, registry
65
+
66
+ logger = logging.getLogger(__name__)
67
+
68
+
69
+ def _ensure(package: str) -> None:
70
+ import importlib
71
+ try:
72
+ importlib.import_module(package)
73
+ except ImportError:
74
+ logger.info("Installing %s ...", package)
75
+ subprocess.run([sys.executable, "-m", "pip", "install", package],
76
+ check=True, timeout=600)
77
+
78
+
79
+ def _ok(text: str, **extra: Any) -> Dict[str, Any]:
80
+ payload = {"status": "success", "content": [{"text": text}]}
81
+ payload.update(extra)
82
+ return payload
83
+
84
+
85
+ def _err(text: str) -> Dict[str, Any]:
86
+ return {"status": "error", "content": [{"text": text}]}
87
+
88
+
89
@tool
def use_diffusers(
    action: str = "pipelines",
    pipeline: Optional[str] = None,
    model: Optional[str] = None,
    inputs: Any = None,
    target: Optional[str] = None,
    parameters: Optional[Dict[str, Any]] = None,
    cache_key: Optional[str] = None,
    device: Optional[str] = None,
    dtype: Optional[str] = None,
    save_artifacts: bool = True,
    fps: float = 24.0,
    label: str = "",
) -> Dict[str, Any]:
    """Universal access to ALL diffusers functionality — no hardcoding.

    Args:
        action: What to do:
            run — load a `pipeline` class from `model` and call it on `parameters`
            call — dynamically call any diffusers class/function/method via `target`
            pipelines — list every pipeline class with its derived modality
            models — list every model class (VAEs, transformers, controlnets)
            schedulers — list every scheduler class
            tasks — diffusers' AutoPipeline task → {family: class} maps
            modalities — pipelines grouped by modality (text-to-image/video/world/...)
            wfm — world-foundation / action-capable pipelines (Cosmos, Wan, ...)
            pipeline_info— modality + __call__ signature for one `target` pipeline class
            inspect — signature + docstring of any `target`
            cache — list cached pipelines/objects
            clear_cache — free a `cache_key` (or everything)
            visualize — render a robot action chunk to plots + an animation (SEE it)
        pipeline: diffusers pipeline class name for action="run" (e.g.
            "StableDiffusionPipeline", "Cosmos3OmniPipeline"). Use
            "DiffusionPipeline"/"AutoPipelineForText2Image" for auto-detect.
        model: HF repo id or local path to load weights from.
        inputs: convenience positional input merged into the pipeline call (rarely
            needed — most diffusers pipelines take keyword args via `parameters`).
            For action="visualize": the raw action (nested list / dict).
        target: For action="call"/"inspect"/"pipeline_info": dotted path into
            diffusers, e.g. "CosmosActionCondition", "utils.export_to_video",
            "cached:key.method". For action="visualize": path to an action
            .json file, inline JSON, or "cached:key".
        parameters: kwargs for the pipeline call / the dynamic call. Values that are
            "cached:key" resolve to live cached objects; path/URL/base64
            values are coerced to PIL/video automatically.
        cache_key: name to cache (or fetch) a loaded object under.
        device: "cuda" / "mps" / "cpu" / "auto".
        dtype: "bfloat16" / "float16" / "float32" (default: device-appropriate).
        save_artifacts: write generated images/video/audio/actions to disk.
        fps: frames-per-second used when exporting generated video.
        label: human-readable description for logging.

    Returns:
        Dict with status + content; "run"/"call" also include "data" (JSON-safe
        result) and "artifacts" (paths to generated media + action JSON).
    """
    # LLM tool-calls may serialize the `parameters` object to a JSON string;
    # also accept that gracefully (else dict(str) raises a cryptic ValueError).
    if isinstance(parameters, str):
        try:
            parameters = json.loads(parameters)
        except (ValueError, TypeError):
            return _err("`parameters` must be a JSON object/dict; got an "
                        f"unparseable string: {parameters[:120]!r}")
    if parameters is not None and not isinstance(parameters, dict):
        return _err(f"`parameters` must be a dict, got {type(parameters).__name__}.")
    # Work on a shallow copy so the pops below never mutate the caller's dict.
    params = dict(parameters or {})
    try:
        # ───────── discovery ─────────
        if action == "pipelines":
            pipes = registry.pipelines()
            lines = [f"🎨 diffusers exposes {len(pipes)} pipelines (100% coverage):\n"]
            for p in pipes:
                lines.append(f" • {p} [{registry.modality_of(p)}]")
            lines.append('\n💡 run: use_diffusers(action="run", pipeline="<class>", '
                         'model="<repo>", parameters={...})')
            return _ok("\n".join(lines), data=pipes)

        if action == "models":
            ms = registry.models()
            return _ok(f"🧩 {len(ms)} model classes:\n " + "\n ".join(ms), data=ms)

        if action == "schedulers":
            sc = registry.schedulers()
            return _ok(f"⏱️ {len(sc)} schedulers:\n " + "\n ".join(sc), data=sc)

        if action == "tasks":
            t = registry.auto_pipeline_tasks()
            return _ok("🗂️ AutoPipeline task maps:\n" + json.dumps(t, indent=2), data=t)

        if action == "modalities":
            groups = registry.tasks_by_modality()
            lines = ["🎛️ Pipelines by modality:\n"]
            for mod in sorted(groups):
                lines.append(f" {mod} ({len(groups[mod])}):")
                for p in groups[mod]:
                    lines.append(f" • {p}")
            return _ok("\n".join(lines), data=groups)

        if action == "wfm":
            wfm = registry.world_foundation_models()
            lines = ["🌍 World-foundation / action-capable pipelines:\n"]
            for p in wfm:
                lines.append(f" • {p} [{registry.modality_of(p)}]")
            lines.append("\n💡 Cosmos action-policy runs return BOTH a world video "
                         "and a robot action chunk.\n"
                         " Pass a CosmosActionCondition via parameters={'action': "
                         "'cached:cond'}.")
            return _ok("\n".join(lines), data=wfm)

        if action == "pipeline_info":
            if not target:
                return _err("Provide `target` (a pipeline class name).")
            info = registry.pipeline_info(target)
            if not info:
                return _err(f"Unknown pipeline '{target}'. Use action='pipelines'.")
            return _ok(f"🔍 {target}\n{json.dumps(info, indent=2, default=str)}", data=info)

        if action == "inspect":
            if not target:
                return _err("Provide `target` (e.g. 'StableDiffusionPipeline').")
            obj = _resolve_target(target)
            info = registry.describe(obj)
            return _ok(f"🔍 {target}\n{json.dumps(info, indent=2, default=str)}", data=info)

        if action == "cache":
            c = engine.cache_list()
            if not c:
                return _ok("📦 cache empty")
            return _ok("📦 cached:\n" + "\n".join(f" • {k}: {v}" for k, v in c.items()),
                       data=c)

        if action == "clear_cache":
            n = engine.cache_clear(cache_key)
            return _ok(f"🧹 cleared {n} object(s)")

        if action == "visualize":
            # Turn a robot ACTION chunk into plots + an animation you can watch.
            # `target` may be a path to an action .json artifact, inline JSON, or
            # a cached:key; alternatively pass the raw action via `inputs`
            # (nested list / serialized dict).
            from strands_diffusers.core import viz
            act = inputs
            if act is None and target:
                if target.startswith("cached:"):
                    act = _resolve_target(target)
                elif target.lstrip().startswith(("[", "{")):
                    # Inline JSON action: a list, or a dict like the ones that
                    # are accepted via `inputs` and via a .json file.
                    act = json.loads(target)
                else:
                    # JSON text is UTF-8; do not depend on the platform default.
                    with open(target, encoding="utf-8") as f:
                        act = json.load(f)
            if act is None:
                return _err("Provide an action via `inputs` (list/dict) or `target` "
                            "(path to an action .json, or cached:key).")
            res = viz.visualize_action(
                act,
                save_prefix=params.get("save_prefix", "action"),
                interpret_xyz=params.get("interpret_xyz", True),
                gripper_index=params.get("gripper_index", -1),
                cumulative_xyz=params.get("cumulative_xyz", True),
                world_video=params.get("world_video"),
                fps=int(params.get("fps", fps)),
                dim_labels=params.get("dim_labels"),
            )
            arts = res["artifacts"]
            head = "🎬 action visualization\n📎 artifacts:\n" + "\n".join(
                f" • {a}" for a in arts)
            return _ok(f"{head}\n{json.dumps(res['summary'], indent=2)}",
                       data=res["summary"], artifacts=arts)

        # ───────── run (pipeline) ─────────
        if action == "run":
            if not pipeline:
                return _err("Provide `pipeline` (a class name). Use action='pipelines'.")
            if not model:
                return _err("Provide `model` (HF repo id or local path).")
            # Modular pipelines have a different lifecycle: from_pretrained loads
            # CONFIG ONLY, components must be loaded via load_components(), and
            # __call__(state, output) takes a PipelineState — not prompt=... So the
            # generic run path (from_pretrained -> .to() -> pipe(**kwargs)) won't work.
            if pipeline.endswith("ModularPipeline"):
                return _err(
                    f"'{pipeline}' is a Modular pipeline with a different lifecycle "
                    "(from_pretrained loads config only; call load_components() then "
                    "invoke with a PipelineState, not prompt=...). The high-level "
                    "`run` path doesn't support Modular pipelines yet. Use action='call' "
                    "to drive it manually, or pick the non-modular variant "
                    f"(e.g. '{pipeline.replace('Modular','')}').")
            _ensure("diffusers")
            # Loader-only options travel inside `parameters` but must not reach
            # the pipeline call. A JSON `null` or a non-mapping value would
            # otherwise blow up at the `**` expansion with an opaque TypeError.
            raw_load_kwargs = params.pop("from_pretrained_kwargs", None)
            try:
                from_pretrained_kwargs = dict(raw_load_kwargs or {})
            except (TypeError, ValueError) as le:
                return _err(f"❌ 'from_pretrained_kwargs' value is not a mapping: {le}")
            pipe, key = engine.get_pipeline(
                pipeline, model, device=device, dtype=dtype,
                cache_key=cache_key, **from_pretrained_kwargs)
            call_kwargs = _coerce_kwargs(params)
            call_args = [_coerce_param(inputs)] if inputs is not None else []
            if label:
                logger.info("run %s (%s): %s", pipeline, model, label)
            result = pipe(*call_args, **call_kwargs)
            sr = _infer_sample_rate(pipe)
            # An `fps` given to the pipeline itself wins over the tool-level
            # default so the exported video matches the generation settings.
            out = io.serialize_output(result, save_artifacts=save_artifacts,
                                      fps=call_kwargs.get("fps", fps),
                                      audio_sample_rate=sr)
            return _ok(_summarize(pipeline, out, key),
                       data=out.get("result"), artifacts=out.get("artifacts", []))

        # ───────── call (dynamic) ─────────
        if action == "call":
            if not target:
                return _err("Provide `target` (e.g. 'CosmosActionCondition' or "
                            "'utils.export_to_video').")
            _ensure("diffusers")
            obj = _resolve_target(target)
            if not callable(obj):
                # Plain attribute: report its (truncated) string form.
                return _ok(f"📋 {target} = {str(obj)[:500]}", data=str(obj)[:2000])
            coerced = _coerce_kwargs(params)
            # The special "**" key unpacks a mapping into kwargs; explicitly
            # named kwargs take precedence over the unpacked ones.
            unpacked = coerced.pop("**", None)
            if unpacked is not None:
                try:
                    coerced = {**dict(unpacked), **coerced}
                except (TypeError, ValueError) as ue:
                    return _err(f"❌ '**' value is not a mapping: {ue}")
            result = obj(**coerced)
            if cache_key:
                # Keep the live object for later "cached:<key>" references
                # instead of serializing it.
                engine._CACHE[cache_key] = result
                return _ok(f"✅ {target}() → cached as '{cache_key}' "
                           f"({type(result).__name__})",
                           data={"cached": cache_key, "type": type(result).__name__})
            out = io.serialize_output(result, save_artifacts=save_artifacts, fps=fps)
            preview = json.dumps(out.get("result"), indent=2, default=str)
            if len(preview) > 2000:
                preview = preview[:2000] + " …"
            arts = out.get("artifacts", [])
            head = f"✅ {target}() → {type(result).__name__}"
            if arts:
                head += "\n📎 artifacts:\n" + "\n".join(f" • {a}" for a in arts)
            return _ok(f"{head}\n{preview}", data=out.get("result"), artifacts=arts)

        return _err(f"Unknown action '{action}'. Try: pipelines, models, schedulers, "
                    f"tasks, modalities, wfm, pipeline_info, inspect, run, call, "
                    f"visualize, cache, clear_cache.")

    except TypeError as e:
        # Usually a wrong/missing kwarg: attach the expected signature when it
        # can be looked up. Building the hint is best-effort and must never
        # mask the original error.
        hint = ""
        try:
            if target:
                hint = "\n\nExpected:\n" + json.dumps(
                    registry.describe(_resolve_target(target)), indent=2, default=str)
            elif pipeline:
                hint = "\n\nExpected:\n" + json.dumps(
                    registry.pipeline_info(pipeline), indent=2, default=str)
        except Exception:
            pass
        return _err(f"❌ TypeError: {e}{hint}")
    except Exception as e:
        # Tool boundary: report the failure as a result instead of raising.
        logger.error("use_diffusers(%s) failed: %s", action, e, exc_info=True)
        return _err(f"❌ {type(e).__name__}: {e}\n\n{traceback.format_exc()[-800:]}")
348
+
349
+
350
def _resolve_target(target: str) -> Any:
    """Turn a target string into the object it names.

    ``"cached:<key>[.attr[.attr...]]"`` is looked up in the engine cache and the
    dotted attribute path, if any, is then followed with ``getattr``. Every
    other target is delegated to ``registry.resolve_attr``.

    Raises:
        ValueError: the cache has no (non-None) entry for ``<key>``.
        AttributeError: an attribute on the dotted path does not exist.
    """
    prefix = "cached:"
    if not target.startswith(prefix):
        return registry.resolve_attr(target)

    key, _sep, attr_path = target[len(prefix):].partition(".")
    resolved = engine.cache_get(key)
    if resolved is None:
        raise ValueError(f"No cached object '{key}'. Use action='cache' to list.")
    for name in attr_path.split("."):
        if name:  # skip empty segments (no path, or stray dots)
            resolved = getattr(resolved, name)
    return resolved
362
+
363
+
364
# Keyword names whose values are passed through by `_coerce_kwargs` unchanged
# rather than being run through input coercion.
_OUTPUT_PATH_KEYS = ("output_path", "output_video_path", "output_obj_path",
                     "output_ply_path", "save_path", "out_path")
366
+
367
+
368
def _coerce_kwargs(params: dict) -> dict:
    """Return a new dict with every value coerced except output-path entries.

    Keys listed in ``_OUTPUT_PATH_KEYS`` are copied verbatim: coercing an
    existing output path would load it as media (a subtle idempotency bug).
    All other values go through ``_coerce_param``.
    """
    return {
        name: value if name in _OUTPUT_PATH_KEYS else _coerce_param(value)
        for name, value in params.items()
    }
378
+
379
+
380
+ def _coerce_param(value: Any) -> Any:
381
+ """Coerce a single parameter value.
382
+
383
+ Resolves "cached:key[.attr]" strings to live cached objects (so pipelines can
384
+ receive e.g. action=cached:cond or scheduler=cached:sched), then applies
385
+ multimodal input coercion (paths/URLs/base64 → PIL/video) to everything else.
386
+ """
387
+ if isinstance(value, str) and value.startswith("cached:"):
388
+ return _resolve_target(value)
389
+ if isinstance(value, list):
390
+ return [_coerce_param(v) for v in value]
391
+ if isinstance(value, dict):
392
+ return {k: _coerce_param(v) for k, v in value.items()}
393
+ return io.coerce_input(value)
394
+
395
+
396
+ def _infer_sample_rate(pipe: Any, default: int = 16000) -> int:
397
+ """Best-effort audio sample-rate discovery for pipelines that emit sound."""
398
+ for attr in ("sound_tokenizer", "vocoder", "audio_encoder"):
399
+ comp = getattr(pipe, attr, None)
400
+ cfg = getattr(comp, "config", None)
401
+ sr = getattr(cfg, "sampling_rate", None) or getattr(cfg, "sample_rate", None)
402
+ if sr:
403
+ return int(sr)
404
+ cfg = getattr(pipe, "config", None)
405
+ sr = getattr(cfg, "sampling_rate", None)
406
+ return int(sr) if sr else default
407
+
408
+
409
+ def _summarize(pipeline: str, out: Dict[str, Any], key: str) -> str:
410
+ arts = out.get("artifacts", [])
411
+ head = f"✅ {pipeline} ({key})"
412
+ if arts:
413
+ head += "\n📎 artifacts:\n" + "\n".join(f" • {a}" for a in arts)
414
+ preview = json.dumps(out.get("result"), indent=2, default=str)
415
+ if len(preview) > 2000:
416
+ preview = preview[:2000] + " …"
417
+ return f"{head}\n{preview}"
418
+
419
+
420
+ __all__ = ["use_diffusers"]
@@ -0,0 +1,199 @@
1
+ Metadata-Version: 2.4
2
+ Name: strands-diffusers
3
+ Version: 0.1.0
4
+ Summary: The universal entrypoint to HuggingFace diffusers for Strands agents — 100% pipeline & modality coverage, zero hardcoding. Special focus on Physical-AI world-foundation models (Cosmos) with robot action outputs.
5
+ Author-email: Cagatay Cali <cagataycali@icloud.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/cagataycali/strands-diffusers
8
+ Project-URL: Repository, https://github.com/cagataycali/strands-diffusers
9
+ Project-URL: Issues, https://github.com/cagataycali/strands-diffusers/issues
10
+ Keywords: strands,diffusers,huggingface,ai,agents,diffusion,video,image,vla,wfm,world-foundation-model,cosmos,robotics,physical-ai
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ Requires-Dist: strands-agents
23
+ Requires-Dist: diffusers>=0.30
24
+ Requires-Dist: transformers>=4.40
25
+ Requires-Dist: torch
26
+ Requires-Dist: pillow
27
+ Requires-Dist: numpy
28
+ Requires-Dist: accelerate
29
+ Provides-Extra: video
30
+ Requires-Dist: imageio[ffmpeg]; extra == "video"
31
+ Requires-Dist: opencv-python; extra == "video"
32
+ Requires-Dist: av; extra == "video"
33
+ Provides-Extra: audio
34
+ Requires-Dist: soundfile; extra == "audio"
35
+ Requires-Dist: librosa; extra == "audio"
36
+ Provides-Extra: cosmos
37
+ Requires-Dist: cosmos_guardrail; extra == "cosmos"
38
+ Provides-Extra: dev
39
+ Requires-Dist: pytest>=7.0; extra == "dev"
40
+ Requires-Dist: black; extra == "dev"
41
+ Requires-Dist: ruff; extra == "dev"
42
+ Provides-Extra: all
43
+ Requires-Dist: strands-diffusers[audio,dev,video]; extra == "all"
44
+
45
+ # 🎨 strands-diffusers
46
+
47
+ **The universal entrypoint to HuggingFace `diffusers` for Strands agents — 100%
48
+ pipeline & modality coverage, zero hardcoding.**
49
+
50
+ Just like [`use_aws`](https://github.com/strands-agents) wraps boto3,
51
+ [`use_lerobot`](https://github.com/cagataycali) wraps lerobot, and
52
+ [`use_transformers`](https://github.com/cagataycali/strands-transformers) wraps the
53
+ transformers task taxonomy, **`use_diffusers`** wraps the *entire* diffusers
54
+ library behind a single tool. Discover, don't hardcode: the registry is built at
55
+ runtime from `diffusers._import_structure`, so when diffusers ships a new pipeline
56
+ (say, a fresh Cosmos world-foundation model), strands-diffusers supports it
57
+ **automatically — no code change required**.
58
+
59
+ ```
60
+ text / image / video / robot-state IN
61
+ image / video / audio / ACTIONS OUT — natively.
62
+ ```
63
+
64
+ ## 🌍 Physical-AI focus: world-foundation models with action outputs
65
+
66
+ The headline use-case is **NVIDIA Cosmos** and other world-foundation models
67
+ (WFMs). A Cosmos 3 *action-policy* rollout doesn't just generate a plausible
68
+ future video — it predicts the **robot action chunk** that produces it. A single
69
+ `use_diffusers(action="run", ...)` call returns BOTH:
70
+
71
+ - a playable world **video** (`.mp4`)
72
+ - the predicted **action** chunk in model-normalized action space (`.json`,
73
+ shape `[num_chunks, T, action_dim]`)
74
+ - (optionally) synchronized **sound** (`.wav`)
75
+
76
+ — all surfaced as artifact paths, ready to hand to a robot controller or the user.
77
+
78
+ > **Verified end-to-end** on NVIDIA Thor (diffusers `0.39.0.dev0`, `nvidia/Cosmos3-Nano`,
79
+ > bf16/cuda): one `use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", ...)`
80
+ > call produced a world video `(17, 480, 640, 3)` **and** a robot action chunk
81
+ > `(1, 16, 10)` = `(num_chunks, T, action_dim)`, normalized to `[-1, 1]`.
82
+ > See [`examples/cosmos_action_policy.py`](examples/cosmos_action_policy.py) and
83
+ > [`examples/SETUP_COSMOS.md`](examples/SETUP_COSMOS.md).
84
+
85
+ ## Install
86
+
87
+ ```bash
88
+ pip install -e .
89
+ # optional extras:
90
+ pip install -e ".[video,audio]" # mp4 export, wav I/O
91
+ ```
92
+
93
+ ## Quick start
94
+
95
+ ```python
96
+ from strands import Agent
97
+ from strands_diffusers import use_diffusers
98
+
99
+ agent = Agent(tools=[use_diffusers])
100
+ agent("Generate an image of a robot arm in a kitchen")
101
+ agent("Run a Cosmos action-policy rollout on robot.mp4 and give me the actions")
102
+ ```
103
+
104
+ Or drive it directly:
105
+
106
+ ```python
107
+ from strands_diffusers import use_diffusers
108
+
109
+ # text → image
110
+ use_diffusers(
111
+ action="run",
112
+ pipeline="StableDiffusionPipeline",
113
+ model="stabilityai/stable-diffusion-2-1",
114
+ parameters={"prompt": "a robot arm in a kitchen", "num_inference_steps": 25},
115
+ )
116
+ ```
117
+
118
+ ## Two layers
119
+
120
+ ### 1. `run` — high-level pipeline runner
121
+
122
+ Loads a pipeline class via `from_pretrained` and calls it. Inputs are coerced
123
+ (paths / URLs / base64 → PIL / video); outputs (image / video / audio / action)
124
+ are auto-saved and returned by path.
125
+
126
+ ```python
127
+ use_diffusers(action="run", pipeline="WanPipeline", model="...",
128
+ parameters={"prompt": "...", "num_frames": 81}, fps=16)
129
+ ```
130
+
131
+ ### 2. `call` — low-level dynamic dispatch
132
+
133
+ Resolve & call *any* diffusers class / function / method — schedulers, VAEs,
134
+ `CosmosActionCondition`, `utils.export_to_video`, or a cached pipeline's method.
135
+ `cached:key` references resolve to live objects; the `"**"` key unpacks a cached
136
+ mapping into kwargs (the `pipe(**inputs)` pattern).
137
+
138
+ ```python
139
+ # Build a Cosmos action condition, cache it, then run an action-policy rollout.
140
+ use_diffusers(action="call", target="CosmosActionCondition",
141
+ parameters={"mode": "policy", "chunk_size": 16,
142
+ "domain_name": "bridge_orig_lerobot",
143
+ "resolution_tier": 480, "video": "robot.mp4",
144
+ "view_point": "ego_view"},
145
+ cache_key="act_cond")
146
+
147
+ use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", model="nvidia/Cosmos3-Nano",
148
+ parameters={"prompt": "Put the pot to the left of the purple item.",
149
+ "action": "cached:act_cond", "fps": 5,
150
+ "num_inference_steps": 30, "guidance_scale": 1.0,
151
+ "use_system_prompt": False},
152
+ dtype="bfloat16", device="cuda")
153
+ # → artifacts: cosmos_world.mp4 + action chunk .json ([1, 16, action_dim])
154
+ ```
155
+
156
+ ## Discovery (the agent never guesses)
157
+
158
+ | action | what it returns |
159
+ |---|---|
160
+ | `pipelines` | all 300+ pipeline classes + derived modality |
161
+ | `models` | every model class (VAEs, transformers, controlnets) |
162
+ | `schedulers` | every scheduler class |
163
+ | `tasks` | diffusers' `AutoPipeline` task → `{family: class}` maps |
164
+ | `modalities` | pipelines grouped by modality (image / video / world / audio / **3d** mesh) |
165
+ | `wfm` | world-foundation / action-capable pipelines (Cosmos, Wan, Hunyuan) |
166
+ | `pipeline_info` | modality + `__call__` signature for one pipeline class |
167
+ | `inspect` | signature + docstring of any target |
168
+ | `visualize` | render a robot ACTION chunk → time-series + 3D trajectory + animation (mp4/gif) |
169
+ | `cache` / `clear_cache` | manage loaded pipelines (free GPU memory) |
170
+
171
+ ## Architecture
172
+
173
+ ```
174
+ strands_diffusers/
175
+ ├── core/
176
+ │ ├── registry.py # zero-hardcode taxonomy from diffusers._import_structure
177
+ │ ├── engine.py # load/cache pipelines, auto device+dtype
178
+ │ └── io.py # coerce inputs; serialize video/image/audio/ACTION outputs
179
+ └── tools/
180
+ └── use_diffusers.py # the single @tool: run + call + discovery
181
+ ```
182
+
183
+ ## Testing
184
+
185
+ ```bash
186
+ pip install -e ".[video,audio,dev]"
187
+ pytest tests/ -q # 26 unit tests — no GPU, no model downloads
188
+ python examples/smoke.py # E2E gate on tiny HF fixtures
189
+ ```
190
+
191
+ `tests/` covers the registry classifier (golden modalities + a guard that no
192
+ video/WFM pipeline is ever mislabeled as a still image), and the multimodal I/O
193
+ serializers — image, video (incl. `list[ndarray]`), **stereo audio** (channels-
194
+ first *and* channels-last), the robot **action** chunk, and **3D mesh** output
195
+ (ShapE → `.ply`/`.obj`/`.npz`). CI runs both on py3.10 + py3.12.
196
+
197
+ ## License
198
+
199
+ MIT
@@ -0,0 +1,13 @@
1
+ strands_diffusers/__init__.py,sha256=Q4NNtYiKM6vfSWNT6bELwT4jHXKLURfU3EODyg91Er0,1626
2
+ strands_diffusers/_version.py,sha256=n_5vdJsPNu7wZ57LGuRL585uvll-hiuvZUBWzdG0RQU,520
3
+ strands_diffusers/core/__init__.py,sha256=JkPoeqWE7chElrH2vX7nQLudvtEYCJEI_vm-3zfEi0Q,155
4
+ strands_diffusers/core/engine.py,sha256=GNivOysffOuwcqI3eTIiO-upyCWhSRpLqe3O8UekPV4,5288
5
+ strands_diffusers/core/io.py,sha256=sDfGm7AjIoVv1r-tkiCDtdQp-ZwRVz8dCcfD8B7pX5I,19606
6
+ strands_diffusers/core/registry.py,sha256=2Rur8lZ2vM6xSoJn2Y_yOoGtvRJWiBa97wvBuYMlGeE,13911
7
+ strands_diffusers/core/viz.py,sha256=rVHm6zP7spXdmbWIAJmuRO8wyuBA5AJKSYrrPp28d1I,10228
8
+ strands_diffusers/tools/__init__.py,sha256=JLlbY6iyRf_pZaLZfuMYL67H81iKVyHSSqM9YB_nzjE,128
9
+ strands_diffusers/tools/use_diffusers.py,sha256=my0Ht2LYykEO5KMoTDKV9IFgpfNMH9ZDx6qVivEZ9aQ,19742
10
+ strands_diffusers-0.1.0.dist-info/METADATA,sha256=aG4auqNO2W3-yVr2DUnJ5GvAoXF4KFDDeBisXyLijro,8115
11
+ strands_diffusers-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
12
+ strands_diffusers-0.1.0.dist-info/top_level.txt,sha256=MekZfW7p0w1JaIoXVYGwojdd2guaVG54KVuZKmjosH4,18
13
+ strands_diffusers-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ strands_diffusers