strands-diffusers 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- strands_diffusers-0.1.0/.github/workflows/auto-release.yml +90 -0
- strands_diffusers-0.1.0/.github/workflows/ci.yml +28 -0
- strands_diffusers-0.1.0/.gitignore +21 -0
- strands_diffusers-0.1.0/PKG-INFO +199 -0
- strands_diffusers-0.1.0/README.md +155 -0
- strands_diffusers-0.1.0/examples/README.md +52 -0
- strands_diffusers-0.1.0/examples/SETUP_COSMOS.md +56 -0
- strands_diffusers-0.1.0/examples/cosmos_action_policy.py +107 -0
- strands_diffusers-0.1.0/examples/gallery_20.py +105 -0
- strands_diffusers-0.1.0/examples/smoke.py +94 -0
- strands_diffusers-0.1.0/examples/text_to_image.py +36 -0
- strands_diffusers-0.1.0/examples/text_to_video.py +39 -0
- strands_diffusers-0.1.0/examples/visualize_actions.py +66 -0
- strands_diffusers-0.1.0/pyproject.toml +69 -0
- strands_diffusers-0.1.0/requirements.txt +7 -0
- strands_diffusers-0.1.0/setup.cfg +4 -0
- strands_diffusers-0.1.0/setup.py +5 -0
- strands_diffusers-0.1.0/strands_diffusers/__init__.py +41 -0
- strands_diffusers-0.1.0/strands_diffusers/_version.py +24 -0
- strands_diffusers-0.1.0/strands_diffusers/core/__init__.py +4 -0
- strands_diffusers-0.1.0/strands_diffusers/core/engine.py +163 -0
- strands_diffusers-0.1.0/strands_diffusers/core/io.py +552 -0
- strands_diffusers-0.1.0/strands_diffusers/core/registry.py +349 -0
- strands_diffusers-0.1.0/strands_diffusers/core/viz.py +256 -0
- strands_diffusers-0.1.0/strands_diffusers/tools/__init__.py +4 -0
- strands_diffusers-0.1.0/strands_diffusers/tools/use_diffusers.py +420 -0
- strands_diffusers-0.1.0/strands_diffusers.egg-info/PKG-INFO +199 -0
- strands_diffusers-0.1.0/strands_diffusers.egg-info/SOURCES.txt +31 -0
- strands_diffusers-0.1.0/strands_diffusers.egg-info/dependency_links.txt +1 -0
- strands_diffusers-0.1.0/strands_diffusers.egg-info/requires.txt +27 -0
- strands_diffusers-0.1.0/strands_diffusers.egg-info/top_level.txt +1 -0
- strands_diffusers-0.1.0/tests/test_action_io.py +197 -0
- strands_diffusers-0.1.0/tests/test_registry.py +77 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
name: Auto Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- 'v*.*.*'
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: write
|
|
10
|
+
id-token: write
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
auto-release:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- name: Checkout code
|
|
18
|
+
uses: actions/checkout@v4
|
|
19
|
+
with:
|
|
20
|
+
fetch-depth: 0 # full history + tags for setuptools-scm
|
|
21
|
+
token: ${{ secrets.GITHUB_TOKEN }}
|
|
22
|
+
|
|
23
|
+
- name: Set up Python
|
|
24
|
+
uses: actions/setup-python@v5
|
|
25
|
+
with:
|
|
26
|
+
python-version: "3.11"
|
|
27
|
+
|
|
28
|
+
- name: Install build tooling
|
|
29
|
+
run: |
|
|
30
|
+
python -m pip install --upgrade pip
|
|
31
|
+
pip install build twine
|
|
32
|
+
|
|
33
|
+
- name: Extract version from tag
|
|
34
|
+
id: get_version
|
|
35
|
+
run: |
|
|
36
|
+
VERSION=${GITHUB_REF#refs/tags/v}
|
|
37
|
+
echo "version=$VERSION" >> $GITHUB_OUTPUT
|
|
38
|
+
echo "Version: $VERSION"
|
|
39
|
+
|
|
40
|
+
- name: Build package (version derived from git tag via setuptools-scm)
|
|
41
|
+
run: python -m build
|
|
42
|
+
|
|
43
|
+
- name: Verify built version matches tag
|
|
44
|
+
run: |
|
|
45
|
+
ls -l dist/
|
|
46
|
+
if ! ls dist/ | grep -q "${{ steps.get_version.outputs.version }}"; then
|
|
47
|
+
echo "::error::Built artifact does not match tag version ${{ steps.get_version.outputs.version }}"
|
|
48
|
+
ls dist/
|
|
49
|
+
exit 1
|
|
50
|
+
fi
|
|
51
|
+
|
|
52
|
+
- name: Publish to PyPI
|
|
53
|
+
env:
|
|
54
|
+
TWINE_USERNAME: __token__
|
|
55
|
+
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
|
|
56
|
+
run: twine upload dist/*
|
|
57
|
+
|
|
58
|
+
- name: Create GitHub Release
|
|
59
|
+
uses: softprops/action-gh-release@v2
|
|
60
|
+
with:
|
|
61
|
+
tag_name: ${{ github.ref_name }}
|
|
62
|
+
name: strands-diffusers v${{ steps.get_version.outputs.version }}
|
|
63
|
+
body: |
|
|
64
|
+
## π¨ strands-diffusers v${{ steps.get_version.outputs.version }}
|
|
65
|
+
|
|
66
|
+
The universal entrypoint to HuggingFace `diffusers` for Strands agents β
|
|
67
|
+
100% pipeline & modality coverage, zero hardcoding.
|
|
68
|
+
|
|
69
|
+
### π¦ Installation
|
|
70
|
+
```bash
|
|
71
|
+
pip install strands-diffusers==${{ steps.get_version.outputs.version }}
|
|
72
|
+
# extras: pip install "strands-diffusers[video,audio]"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### π Upgrade
|
|
76
|
+
```bash
|
|
77
|
+
pip install --upgrade strands-diffusers
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
**Full Changelog**: https://github.com/cagataycali/strands-diffusers/releases
|
|
81
|
+
draft: false
|
|
82
|
+
prerelease: false
|
|
83
|
+
|
|
84
|
+
- name: Summary
|
|
85
|
+
run: |
|
|
86
|
+
echo "## π Release Summary" >> $GITHUB_STEP_SUMMARY
|
|
87
|
+
echo "" >> $GITHUB_STEP_SUMMARY
|
|
88
|
+
echo "- **Version:** v${{ steps.get_version.outputs.version }}" >> $GITHUB_STEP_SUMMARY
|
|
89
|
+
echo "- **PyPI:** https://pypi.org/project/strands-diffusers/${{ steps.get_version.outputs.version }}/" >> $GITHUB_STEP_SUMMARY
|
|
90
|
+
echo "- **GitHub Release:** https://github.com/cagataycali/strands-diffusers/releases/tag/v${{ steps.get_version.outputs.version }}" >> $GITHUB_STEP_SUMMARY
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
branches: [main]
|
|
5
|
+
pull_request:
|
|
6
|
+
jobs:
|
|
7
|
+
test:
|
|
8
|
+
runs-on: ubuntu-latest
|
|
9
|
+
strategy:
|
|
10
|
+
matrix:
|
|
11
|
+
python-version: ["3.10", "3.12"]
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
with:
|
|
15
|
+
fetch-depth: 0 # full history+tags so setuptools-scm can derive version
|
|
16
|
+
- uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: ${{ matrix.python-version }}
|
|
19
|
+
- name: Install
|
|
20
|
+
run: |
|
|
21
|
+
python -m pip install --upgrade pip
|
|
22
|
+
pip install -e ".[video,audio,dev]"
|
|
23
|
+
- name: Unit tests (no GPU, no model downloads)
|
|
24
|
+
run: pytest tests/ -q
|
|
25
|
+
- name: Smoke E2E (tiny fixtures)
|
|
26
|
+
env:
|
|
27
|
+
TOKENIZERS_PARALLELISM: "false"
|
|
28
|
+
run: python examples/smoke.py
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.pyc
|
|
3
|
+
*.egg-info/
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
.venv/
|
|
7
|
+
*.mp4
|
|
8
|
+
*.png
|
|
9
|
+
*.jpg
|
|
10
|
+
*.wav
|
|
11
|
+
*.json.tmp
|
|
12
|
+
assets/
|
|
13
|
+
.coverage
|
|
14
|
+
system_prompt.prompt
|
|
15
|
+
__pycache__/
|
|
16
|
+
*.pyc
|
|
17
|
+
.pytest_cache/
|
|
18
|
+
strands_diffusers/_version.py
|
|
19
|
+
build/
|
|
20
|
+
.pytest_cache/
|
|
21
|
+
__pycache__/
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: strands-diffusers
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: The universal entrypoint to HuggingFace diffusers for Strands agents — 100% pipeline & modality coverage, zero hardcoding. Special focus on Physical-AI world-foundation models (Cosmos) with robot action outputs.
|
|
5
|
+
Author-email: Cagatay Cali <cagataycali@icloud.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/cagataycali/strands-diffusers
|
|
8
|
+
Project-URL: Repository, https://github.com/cagataycali/strands-diffusers
|
|
9
|
+
Project-URL: Issues, https://github.com/cagataycali/strands-diffusers/issues
|
|
10
|
+
Keywords: strands,diffusers,huggingface,ai,agents,diffusion,video,image,vla,wfm,world-foundation-model,cosmos,robotics,physical-ai
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: strands-agents
|
|
23
|
+
Requires-Dist: diffusers>=0.30
|
|
24
|
+
Requires-Dist: transformers>=4.40
|
|
25
|
+
Requires-Dist: torch
|
|
26
|
+
Requires-Dist: pillow
|
|
27
|
+
Requires-Dist: numpy
|
|
28
|
+
Requires-Dist: accelerate
|
|
29
|
+
Provides-Extra: video
|
|
30
|
+
Requires-Dist: imageio[ffmpeg]; extra == "video"
|
|
31
|
+
Requires-Dist: opencv-python; extra == "video"
|
|
32
|
+
Requires-Dist: av; extra == "video"
|
|
33
|
+
Provides-Extra: audio
|
|
34
|
+
Requires-Dist: soundfile; extra == "audio"
|
|
35
|
+
Requires-Dist: librosa; extra == "audio"
|
|
36
|
+
Provides-Extra: cosmos
|
|
37
|
+
Requires-Dist: cosmos_guardrail; extra == "cosmos"
|
|
38
|
+
Provides-Extra: dev
|
|
39
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
40
|
+
Requires-Dist: black; extra == "dev"
|
|
41
|
+
Requires-Dist: ruff; extra == "dev"
|
|
42
|
+
Provides-Extra: all
|
|
43
|
+
Requires-Dist: strands-diffusers[audio,dev,video]; extra == "all"
|
|
44
|
+
|
|
45
|
+
# 🎨 strands-diffusers
|
|
46
|
+
|
|
47
|
+
**The universal entrypoint to HuggingFace `diffusers` for Strands agents — 100%
|
|
48
|
+
pipeline & modality coverage, zero hardcoding.**
|
|
49
|
+
|
|
50
|
+
Just like [`use_aws`](https://github.com/strands-agents) wraps boto3,
|
|
51
|
+
[`use_lerobot`](https://github.com/cagataycali) wraps lerobot, and
|
|
52
|
+
[`use_transformers`](https://github.com/cagataycali/strands-transformers) wraps the
|
|
53
|
+
transformers task taxonomy, **`use_diffusers`** wraps the *entire* diffusers
|
|
54
|
+
library behind a single tool. Discover, don't hardcode: the registry is built at
|
|
55
|
+
runtime from `diffusers._import_structure`, so when diffusers ships a new pipeline
|
|
56
|
+
(say, a fresh Cosmos world-foundation model), strands-diffusers supports it
|
|
57
|
+
**automatically — no code change required**.
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
text / image / video / robot-state IN
|
|
61
|
+
image / video / audio / ACTIONS OUT — natively.
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## 🌍 Physical-AI focus: world-foundation models with action outputs
|
|
65
|
+
|
|
66
|
+
The headline use-case is **NVIDIA Cosmos** and other world-foundation models
|
|
67
|
+
(WFMs). A Cosmos 3 *action-policy* rollout doesn't just generate a plausible
|
|
68
|
+
future video — it predicts the **robot action chunk** that produces it. A single
|
|
69
|
+
`use_diffusers(action="run", ...)` call returns BOTH:
|
|
70
|
+
|
|
71
|
+
- a playable world **video** (`.mp4`)
|
|
72
|
+
- the predicted **action** chunk in model-normalized action space (`.json`,
|
|
73
|
+
shape `[num_chunks, T, action_dim]`)
|
|
74
|
+
- (optionally) synchronized **sound** (`.wav`)
|
|
75
|
+
|
|
76
|
+
— all surfaced as artifact paths, ready to hand to a robot controller or the user.
|
|
77
|
+
|
|
78
|
+
> **Verified end-to-end** on NVIDIA Thor (diffusers `0.39.0.dev0`, `nvidia/Cosmos3-Nano`,
|
|
79
|
+
> bf16/cuda): one `use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", ...)`
|
|
80
|
+
> call produced a world video `(17, 480, 640, 3)` **and** a robot action chunk
|
|
81
|
+
> `(1, 16, 10)` = `(num_chunks, T, action_dim)`, normalized to `[-1, 1]`.
|
|
82
|
+
> See [`examples/cosmos_action_policy.py`](examples/cosmos_action_policy.py) and
|
|
83
|
+
> [`examples/SETUP_COSMOS.md`](examples/SETUP_COSMOS.md).
|
|
84
|
+
|
|
85
|
+
## Install
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pip install -e .
|
|
89
|
+
# optional extras:
|
|
90
|
+
pip install -e ".[video,audio]" # mp4 export, wav I/O
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Quick start
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from strands import Agent
|
|
97
|
+
from strands_diffusers import use_diffusers
|
|
98
|
+
|
|
99
|
+
agent = Agent(tools=[use_diffusers])
|
|
100
|
+
agent("Generate an image of a robot arm in a kitchen")
|
|
101
|
+
agent("Run a Cosmos action-policy rollout on robot.mp4 and give me the actions")
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Or drive it directly:
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from strands_diffusers import use_diffusers
|
|
108
|
+
|
|
109
|
+
# text → image
|
|
110
|
+
use_diffusers(
|
|
111
|
+
action="run",
|
|
112
|
+
pipeline="StableDiffusionPipeline",
|
|
113
|
+
model="stabilityai/stable-diffusion-2-1",
|
|
114
|
+
parameters={"prompt": "a robot arm in a kitchen", "num_inference_steps": 25},
|
|
115
|
+
)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Two layers
|
|
119
|
+
|
|
120
|
+
### 1. `run` — high-level pipeline runner
|
|
121
|
+
|
|
122
|
+
Loads a pipeline class via `from_pretrained` and calls it. Inputs are coerced
|
|
123
|
+
(paths / URLs / base64 → PIL / video); outputs (image / video / audio / action)
|
|
124
|
+
are auto-saved and returned by path.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
use_diffusers(action="run", pipeline="WanPipeline", model="...",
|
|
128
|
+
parameters={"prompt": "...", "num_frames": 81}, fps=16)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### 2. `call` — low-level dynamic dispatch
|
|
132
|
+
|
|
133
|
+
Resolve & call *any* diffusers class / function / method — schedulers, VAEs,
|
|
134
|
+
`CosmosActionCondition`, `utils.export_to_video`, or a cached pipeline's method.
|
|
135
|
+
`cached:key` references resolve to live objects; the `"**"` key unpacks a cached
|
|
136
|
+
mapping into kwargs (the `pipe(**inputs)` pattern).
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
# Build a Cosmos action condition, cache it, then run an action-policy rollout.
|
|
140
|
+
use_diffusers(action="call", target="CosmosActionCondition",
|
|
141
|
+
parameters={"mode": "policy", "chunk_size": 16,
|
|
142
|
+
"domain_name": "bridge_orig_lerobot",
|
|
143
|
+
"resolution_tier": 480, "video": "robot.mp4",
|
|
144
|
+
"view_point": "ego_view"},
|
|
145
|
+
cache_key="act_cond")
|
|
146
|
+
|
|
147
|
+
use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", model="nvidia/Cosmos3-Nano",
|
|
148
|
+
parameters={"prompt": "Put the pot to the left of the purple item.",
|
|
149
|
+
"action": "cached:act_cond", "fps": 5,
|
|
150
|
+
"num_inference_steps": 30, "guidance_scale": 1.0,
|
|
151
|
+
"use_system_prompt": False},
|
|
152
|
+
dtype="bfloat16", device="cuda")
|
|
153
|
+
# → artifacts: cosmos_world.mp4 + action chunk .json ([1, 16, action_dim])
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## Discovery (the agent never guesses)
|
|
157
|
+
|
|
158
|
+
| action | what it returns |
|
|
159
|
+
|---|---|
|
|
160
|
+
| `pipelines` | all 300+ pipeline classes + derived modality |
|
|
161
|
+
| `models` | every model class (VAEs, transformers, controlnets) |
|
|
162
|
+
| `schedulers` | every scheduler class |
|
|
163
|
+
| `tasks` | diffusers' `AutoPipeline` task → `{family: class}` maps |
|
|
164
|
+
| `modalities` | pipelines grouped by modality (image / video / world / audio / **3d** mesh) |
|
|
165
|
+
| `wfm` | world-foundation / action-capable pipelines (Cosmos, Wan, Hunyuan) |
|
|
166
|
+
| `pipeline_info` | modality + `__call__` signature for one pipeline class |
|
|
167
|
+
| `inspect` | signature + docstring of any target |
|
|
168
|
+
| `visualize` | render a robot ACTION chunk → time-series + 3D trajectory + animation (mp4/gif) |
|
|
169
|
+
| `cache` / `clear_cache` | manage loaded pipelines (free GPU memory) |
|
|
170
|
+
|
|
171
|
+
## Architecture
|
|
172
|
+
|
|
173
|
+
```
|
|
174
|
+
strands_diffusers/
|
|
175
|
+
├── core/
|
|
176
|
+
│   ├── registry.py # zero-hardcode taxonomy from diffusers._import_structure
|
|
177
|
+
│   ├── engine.py # load/cache pipelines, auto device+dtype
|
|
178
|
+
│   └── io.py # coerce inputs; serialize video/image/audio/ACTION outputs
|
|
179
|
+
└── tools/
|
|
180
|
+
    └── use_diffusers.py # the single @tool: run + call + discovery
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Testing
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
pip install -e ".[video,audio,dev]"
|
|
187
|
+
pytest tests/ -q # 26 unit tests — no GPU, no model downloads
|
|
188
|
+
python examples/smoke.py # E2E gate on tiny HF fixtures
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
`tests/` covers the registry classifier (golden modalities + a guard that no
|
|
192
|
+
video/WFM pipeline is ever mislabeled as a still image), and the multimodal I/O
|
|
193
|
+
serializers — image, video (incl. `list[ndarray]`), **stereo audio** (channels-
|
|
194
|
+
first *and* channels-last), the robot **action** chunk, and **3D mesh** output
|
|
195
|
+
(ShapE → `.ply`/`.obj`/`.npz`). CI runs both on py3.10 + py3.12.
|
|
196
|
+
|
|
197
|
+
## License
|
|
198
|
+
|
|
199
|
+
MIT
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# 🎨 strands-diffusers
|
|
2
|
+
|
|
3
|
+
**The universal entrypoint to HuggingFace `diffusers` for Strands agents — 100%
|
|
4
|
+
pipeline & modality coverage, zero hardcoding.**
|
|
5
|
+
|
|
6
|
+
Just like [`use_aws`](https://github.com/strands-agents) wraps boto3,
|
|
7
|
+
[`use_lerobot`](https://github.com/cagataycali) wraps lerobot, and
|
|
8
|
+
[`use_transformers`](https://github.com/cagataycali/strands-transformers) wraps the
|
|
9
|
+
transformers task taxonomy, **`use_diffusers`** wraps the *entire* diffusers
|
|
10
|
+
library behind a single tool. Discover, don't hardcode: the registry is built at
|
|
11
|
+
runtime from `diffusers._import_structure`, so when diffusers ships a new pipeline
|
|
12
|
+
(say, a fresh Cosmos world-foundation model), strands-diffusers supports it
|
|
13
|
+
**automatically — no code change required**.
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
text / image / video / robot-state IN
|
|
17
|
+
image / video / audio / ACTIONS OUT — natively.
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## 🌍 Physical-AI focus: world-foundation models with action outputs
|
|
21
|
+
|
|
22
|
+
The headline use-case is **NVIDIA Cosmos** and other world-foundation models
|
|
23
|
+
(WFMs). A Cosmos 3 *action-policy* rollout doesn't just generate a plausible
|
|
24
|
+
future video — it predicts the **robot action chunk** that produces it. A single
|
|
25
|
+
`use_diffusers(action="run", ...)` call returns BOTH:
|
|
26
|
+
|
|
27
|
+
- a playable world **video** (`.mp4`)
|
|
28
|
+
- the predicted **action** chunk in model-normalized action space (`.json`,
|
|
29
|
+
shape `[num_chunks, T, action_dim]`)
|
|
30
|
+
- (optionally) synchronized **sound** (`.wav`)
|
|
31
|
+
|
|
32
|
+
— all surfaced as artifact paths, ready to hand to a robot controller or the user.
|
|
33
|
+
|
|
34
|
+
> **Verified end-to-end** on NVIDIA Thor (diffusers `0.39.0.dev0`, `nvidia/Cosmos3-Nano`,
|
|
35
|
+
> bf16/cuda): one `use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", ...)`
|
|
36
|
+
> call produced a world video `(17, 480, 640, 3)` **and** a robot action chunk
|
|
37
|
+
> `(1, 16, 10)` = `(num_chunks, T, action_dim)`, normalized to `[-1, 1]`.
|
|
38
|
+
> See [`examples/cosmos_action_policy.py`](examples/cosmos_action_policy.py) and
|
|
39
|
+
> [`examples/SETUP_COSMOS.md`](examples/SETUP_COSMOS.md).
|
|
40
|
+
|
|
41
|
+
## Install
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install -e .
|
|
45
|
+
# optional extras:
|
|
46
|
+
pip install -e ".[video,audio]" # mp4 export, wav I/O
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Quick start
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from strands import Agent
|
|
53
|
+
from strands_diffusers import use_diffusers
|
|
54
|
+
|
|
55
|
+
agent = Agent(tools=[use_diffusers])
|
|
56
|
+
agent("Generate an image of a robot arm in a kitchen")
|
|
57
|
+
agent("Run a Cosmos action-policy rollout on robot.mp4 and give me the actions")
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Or drive it directly:
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from strands_diffusers import use_diffusers
|
|
64
|
+
|
|
65
|
+
# text → image
|
|
66
|
+
use_diffusers(
|
|
67
|
+
action="run",
|
|
68
|
+
pipeline="StableDiffusionPipeline",
|
|
69
|
+
model="stabilityai/stable-diffusion-2-1",
|
|
70
|
+
parameters={"prompt": "a robot arm in a kitchen", "num_inference_steps": 25},
|
|
71
|
+
)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Two layers
|
|
75
|
+
|
|
76
|
+
### 1. `run` — high-level pipeline runner
|
|
77
|
+
|
|
78
|
+
Loads a pipeline class via `from_pretrained` and calls it. Inputs are coerced
|
|
79
|
+
(paths / URLs / base64 → PIL / video); outputs (image / video / audio / action)
|
|
80
|
+
are auto-saved and returned by path.
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
use_diffusers(action="run", pipeline="WanPipeline", model="...",
|
|
84
|
+
parameters={"prompt": "...", "num_frames": 81}, fps=16)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### 2. `call` — low-level dynamic dispatch
|
|
88
|
+
|
|
89
|
+
Resolve & call *any* diffusers class / function / method — schedulers, VAEs,
|
|
90
|
+
`CosmosActionCondition`, `utils.export_to_video`, or a cached pipeline's method.
|
|
91
|
+
`cached:key` references resolve to live objects; the `"**"` key unpacks a cached
|
|
92
|
+
mapping into kwargs (the `pipe(**inputs)` pattern).
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
# Build a Cosmos action condition, cache it, then run an action-policy rollout.
|
|
96
|
+
use_diffusers(action="call", target="CosmosActionCondition",
|
|
97
|
+
parameters={"mode": "policy", "chunk_size": 16,
|
|
98
|
+
"domain_name": "bridge_orig_lerobot",
|
|
99
|
+
"resolution_tier": 480, "video": "robot.mp4",
|
|
100
|
+
"view_point": "ego_view"},
|
|
101
|
+
cache_key="act_cond")
|
|
102
|
+
|
|
103
|
+
use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", model="nvidia/Cosmos3-Nano",
|
|
104
|
+
parameters={"prompt": "Put the pot to the left of the purple item.",
|
|
105
|
+
"action": "cached:act_cond", "fps": 5,
|
|
106
|
+
"num_inference_steps": 30, "guidance_scale": 1.0,
|
|
107
|
+
"use_system_prompt": False},
|
|
108
|
+
dtype="bfloat16", device="cuda")
|
|
109
|
+
# → artifacts: cosmos_world.mp4 + action chunk .json ([1, 16, action_dim])
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Discovery (the agent never guesses)
|
|
113
|
+
|
|
114
|
+
| action | what it returns |
|
|
115
|
+
|---|---|
|
|
116
|
+
| `pipelines` | all 300+ pipeline classes + derived modality |
|
|
117
|
+
| `models` | every model class (VAEs, transformers, controlnets) |
|
|
118
|
+
| `schedulers` | every scheduler class |
|
|
119
|
+
| `tasks` | diffusers' `AutoPipeline` task → `{family: class}` maps |
|
|
120
|
+
| `modalities` | pipelines grouped by modality (image / video / world / audio / **3d** mesh) |
|
|
121
|
+
| `wfm` | world-foundation / action-capable pipelines (Cosmos, Wan, Hunyuan) |
|
|
122
|
+
| `pipeline_info` | modality + `__call__` signature for one pipeline class |
|
|
123
|
+
| `inspect` | signature + docstring of any target |
|
|
124
|
+
| `visualize` | render a robot ACTION chunk → time-series + 3D trajectory + animation (mp4/gif) |
|
|
125
|
+
| `cache` / `clear_cache` | manage loaded pipelines (free GPU memory) |
|
|
126
|
+
|
|
127
|
+
## Architecture
|
|
128
|
+
|
|
129
|
+
```
|
|
130
|
+
strands_diffusers/
|
|
131
|
+
├── core/
|
|
132
|
+
│   ├── registry.py # zero-hardcode taxonomy from diffusers._import_structure
|
|
133
|
+
│   ├── engine.py # load/cache pipelines, auto device+dtype
|
|
134
|
+
│   └── io.py # coerce inputs; serialize video/image/audio/ACTION outputs
|
|
135
|
+
└── tools/
|
|
136
|
+
    └── use_diffusers.py # the single @tool: run + call + discovery
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## Testing
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
pip install -e ".[video,audio,dev]"
|
|
143
|
+
pytest tests/ -q # 26 unit tests — no GPU, no model downloads
|
|
144
|
+
python examples/smoke.py # E2E gate on tiny HF fixtures
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
`tests/` covers the registry classifier (golden modalities + a guard that no
|
|
148
|
+
video/WFM pipeline is ever mislabeled as a still image), and the multimodal I/O
|
|
149
|
+
serializers — image, video (incl. `list[ndarray]`), **stereo audio** (channels-
|
|
150
|
+
first *and* channels-last), the robot **action** chunk, and **3D mesh** output
|
|
151
|
+
(ShapE → `.ply`/`.obj`/`.npz`). CI runs both on py3.10 + py3.12.
|
|
152
|
+
|
|
153
|
+
## License
|
|
154
|
+
|
|
155
|
+
MIT
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# strands-diffusers examples
|
|
2
|
+
|
|
3
|
+
All examples import the real `use_diffusers` tool and run **real diffusion
|
|
4
|
+
inference** (no mocks). The image/video examples use tiny HF test fixtures so they
|
|
5
|
+
run fast on any machine — swap `model` for a full checkpoint to get real quality.
|
|
6
|
+
|
|
7
|
+
| example | what it shows | path used | model |
|
|
8
|
+
|---|---|---|---|
|
|
9
|
+
| `text_to_image.py` | text → image, image→png artifact | `run` | tiny-stable-diffusion-pipe |
|
|
10
|
+
| `text_to_video.py` | text → video, video→mp4 artifact | `run` | tiny-random-ltx-video |
|
|
11
|
+
| `cosmos_action_policy.py` | **WFM action-policy: video + robot ACTION out** | `call` + `run` | nvidia/Cosmos3-Nano |
|
|
12
|
+
| `smoke.py` | fast E2E gate (discovery + img + video + action serializer) | all | tiny fixtures |
|
|
13
|
+
|
|
14
|
+
## run vs call
|
|
15
|
+
|
|
16
|
+
- **`run`** — high-level. Give it a `pipeline` class name + `model` repo +
|
|
17
|
+
`parameters`. It loads (and caches) the pipeline, coerces inputs, runs it, and
|
|
18
|
+
serializes every output (image/video/audio/**action**) to an artifact path.
|
|
19
|
+
|
|
20
|
+
- **`call`** — low-level dynamic dispatch. Resolve & call *any* diffusers class,
|
|
21
|
+
function, or method: schedulers, VAEs, `CosmosActionCondition`,
|
|
22
|
+
`utils.export_to_video`, or a cached pipeline's method. Use `cache_key` to stash
|
|
23
|
+
a constructed object and `cached:key` (or `{"**": "cached:key"}`) to feed it
|
|
24
|
+
back into a later call. This is how the Cosmos example builds an action
|
|
25
|
+
condition and threads it into the pipeline run.
|
|
26
|
+
|
|
27
|
+
## The action payload (why this library exists)
|
|
28
|
+
|
|
29
|
+
World-foundation models like NVIDIA Cosmos 3 emit a `Cosmos3OmniPipelineOutput`
|
|
30
|
+
with `video`, optional `sound`, and **`action`** (a `list[torch.Tensor]`, each a
|
|
31
|
+
normalized action chunk `[T, action_dim]`). `core/io.py` serializes:
|
|
32
|
+
|
|
33
|
+
- `video` → `.mp4` (via `diffusers.utils.export_to_video`, imageio fallback, gif last resort)
|
|
34
|
+
- `sound` → `.wav` (soundfile or stdlib `wave`)
|
|
35
|
+
- `action` → `.json` (full nested list + `chunk_shape` / `num_chunks` metadata)
|
|
36
|
+
|
|
37
|
+
So one `use_diffusers(action="run", ...)` hands the agent both a playable world and
|
|
38
|
+
a robot-ready action vector.
|
|
39
|
+
|
|
40
|
+
## Cosmos3OmniPipeline availability
|
|
41
|
+
|
|
42
|
+
`Cosmos3OmniPipeline` ships in **diffusers from source** (>0.38). `use_diffusers`
|
|
43
|
+
resolves pipeline classes dynamically, so the moment your diffusers has it, the
|
|
44
|
+
example works unchanged:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install 'git+https://github.com/huggingface/diffusers'
|
|
48
|
+
python examples/cosmos_action_policy.py
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
On older diffusers the example degrades gracefully and still lists the
|
|
52
|
+
action-capable WFM pipelines available now (Cosmos2*, CosmosVideoToWorld, Wan, …).
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# Running real Cosmos 3 (action-policy) with strands-diffusers
|
|
2
|
+
|
|
3
|
+
`Cosmos3OmniPipeline` + `CosmosActionCondition` ship in **diffusers from source**
|
|
4
|
+
(>0.38). `use_diffusers` resolves pipeline classes dynamically, so no code change
|
|
5
|
+
is needed once they're importable.
|
|
6
|
+
|
|
7
|
+
## Option A — install from source into your env
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install 'git+https://github.com/huggingface/diffusers'
|
|
11
|
+
python examples/cosmos_action_policy.py
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Option B — side-load from source (don't disturb a pinned diffusers)
|
|
15
|
+
|
|
16
|
+
Install only the diffusers source tree (no deps) and prepend it to `PYTHONPATH`,
|
|
17
|
+
reusing your existing torch / transformers:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install 'git+https://github.com/huggingface/diffusers' --no-deps --target /tmp/dmain
|
|
21
|
+
PYTHONPATH=/tmp/dmain python examples/cosmos_action_policy.py
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
This is exactly how the example above was verified end-to-end.
|
|
25
|
+
|
|
26
|
+
## What you get
|
|
27
|
+
|
|
28
|
+
A single `use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", ...)` returns
|
|
29
|
+
a `Cosmos3OmniPipelineOutput` that `core/io.py` serializes to artifacts:
|
|
30
|
+
|
|
31
|
+
```
|
|
32
|
+
📁 artifacts:
|
|
33
|
+
• /tmp/strands_diffusers/video_*.mp4 # world rollout (17, 480, 640, 3)
|
|
34
|
+
• /tmp/strands_diffusers/action_*.json # robot actions (1, 16, 10)
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
The action JSON is the model-normalized action chunk `[num_chunks, T, action_dim]`
|
|
38
|
+
(values in `[-1, 1]`) — feed it straight to your embodiment's un-normalizer /
|
|
39
|
+
controller. Pick `domain_name` to match your robot (e.g. `bridge_orig_lerobot`).
|
|
40
|
+
|
|
41
|
+
## Action modes (see the Cosmos 3 docs)
|
|
42
|
+
|
|
43
|
+
- `policy` — predict future video **and** actions from the first frame + task.
|
|
44
|
+
- `forward_dynamics` — roll out video from a first frame + a given `raw_actions` seq.
|
|
45
|
+
- `inverse_dynamics` — infer the actions connecting the frames of a conditioning video.
|
|
46
|
+
|
|
47
|
+
Build the condition with `use_diffusers(action="call",
|
|
48
|
+
target="CosmosActionCondition", parameters={...}, cache_key="cond")` then pass
|
|
49
|
+
`parameters={"action": "cached:cond"}` to the run.
|
|
50
|
+
|
|
51
|
+
## Notes
|
|
52
|
+
|
|
53
|
+
- ~33 GB of weights for `nvidia/Cosmos3-Nano`; needs a CUDA GPU.
|
|
54
|
+
- The NVIDIA guardrail (`cosmos_guardrail`) is enabled by default under the model license.
|
|
55
|
+
These demos pass `enable_safety_checker=False` for development; keep it enabled
|
|
56
|
+
for anything public-facing.
|