strands-diffusers 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. strands_diffusers-0.1.0/.github/workflows/auto-release.yml +90 -0
  2. strands_diffusers-0.1.0/.github/workflows/ci.yml +28 -0
  3. strands_diffusers-0.1.0/.gitignore +21 -0
  4. strands_diffusers-0.1.0/PKG-INFO +199 -0
  5. strands_diffusers-0.1.0/README.md +155 -0
  6. strands_diffusers-0.1.0/examples/README.md +52 -0
  7. strands_diffusers-0.1.0/examples/SETUP_COSMOS.md +56 -0
  8. strands_diffusers-0.1.0/examples/cosmos_action_policy.py +107 -0
  9. strands_diffusers-0.1.0/examples/gallery_20.py +105 -0
  10. strands_diffusers-0.1.0/examples/smoke.py +94 -0
  11. strands_diffusers-0.1.0/examples/text_to_image.py +36 -0
  12. strands_diffusers-0.1.0/examples/text_to_video.py +39 -0
  13. strands_diffusers-0.1.0/examples/visualize_actions.py +66 -0
  14. strands_diffusers-0.1.0/pyproject.toml +69 -0
  15. strands_diffusers-0.1.0/requirements.txt +7 -0
  16. strands_diffusers-0.1.0/setup.cfg +4 -0
  17. strands_diffusers-0.1.0/setup.py +5 -0
  18. strands_diffusers-0.1.0/strands_diffusers/__init__.py +41 -0
  19. strands_diffusers-0.1.0/strands_diffusers/_version.py +24 -0
  20. strands_diffusers-0.1.0/strands_diffusers/core/__init__.py +4 -0
  21. strands_diffusers-0.1.0/strands_diffusers/core/engine.py +163 -0
  22. strands_diffusers-0.1.0/strands_diffusers/core/io.py +552 -0
  23. strands_diffusers-0.1.0/strands_diffusers/core/registry.py +349 -0
  24. strands_diffusers-0.1.0/strands_diffusers/core/viz.py +256 -0
  25. strands_diffusers-0.1.0/strands_diffusers/tools/__init__.py +4 -0
  26. strands_diffusers-0.1.0/strands_diffusers/tools/use_diffusers.py +420 -0
  27. strands_diffusers-0.1.0/strands_diffusers.egg-info/PKG-INFO +199 -0
  28. strands_diffusers-0.1.0/strands_diffusers.egg-info/SOURCES.txt +31 -0
  29. strands_diffusers-0.1.0/strands_diffusers.egg-info/dependency_links.txt +1 -0
  30. strands_diffusers-0.1.0/strands_diffusers.egg-info/requires.txt +27 -0
  31. strands_diffusers-0.1.0/strands_diffusers.egg-info/top_level.txt +1 -0
  32. strands_diffusers-0.1.0/tests/test_action_io.py +197 -0
  33. strands_diffusers-0.1.0/tests/test_registry.py +77 -0
@@ -0,0 +1,90 @@
1
+ name: Auto Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*.*.*'
7
+
8
+ permissions:
9
+ contents: write
10
+ id-token: write
11
+
12
+ jobs:
13
+ auto-release:
14
+ runs-on: ubuntu-latest
15
+
16
+ steps:
17
+ - name: Checkout code
18
+ uses: actions/checkout@v4
19
+ with:
20
+ fetch-depth: 0 # full history + tags for setuptools-scm
21
+ token: ${{ secrets.GITHUB_TOKEN }}
22
+
23
+ - name: Set up Python
24
+ uses: actions/setup-python@v5
25
+ with:
26
+ python-version: "3.11"
27
+
28
+ - name: Install build tooling
29
+ run: |
30
+ python -m pip install --upgrade pip
31
+ pip install build twine
32
+
33
+ - name: Extract version from tag
34
+ id: get_version
35
+ run: |
36
+ VERSION=${GITHUB_REF#refs/tags/v}
37
+ echo "version=$VERSION" >> $GITHUB_OUTPUT
38
+ echo "Version: $VERSION"
39
+
40
+ - name: Build package (version derived from git tag via setuptools-scm)
41
+ run: python -m build
42
+
43
+ - name: Verify built version matches tag
44
+ run: |
45
+ ls -l dist/
46
+ if ! ls dist/ | grep -q "${{ steps.get_version.outputs.version }}"; then
47
+ echo "::error::Built artifact does not match tag version ${{ steps.get_version.outputs.version }}"
48
+ ls dist/
49
+ exit 1
50
+ fi
51
+
52
+ - name: Publish to PyPI
53
+ env:
54
+ TWINE_USERNAME: __token__
55
+ TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
56
+ run: twine upload dist/*
57
+
58
+ - name: Create GitHub Release
59
+ uses: softprops/action-gh-release@v2
60
+ with:
61
+ tag_name: ${{ github.ref_name }}
62
+ name: strands-diffusers v${{ steps.get_version.outputs.version }}
63
+ body: |
64
+ ## 🎨 strands-diffusers v${{ steps.get_version.outputs.version }}
65
+
66
+ The universal entrypoint to HuggingFace `diffusers` for Strands agents —
67
+ 100% pipeline & modality coverage, zero hardcoding.
68
+
69
+ ### 📦 Installation
70
+ ```bash
71
+ pip install strands-diffusers==${{ steps.get_version.outputs.version }}
72
+ # extras: pip install "strands-diffusers[video,audio]"
73
+ ```
74
+
75
+ ### 🔄 Upgrade
76
+ ```bash
77
+ pip install --upgrade strands-diffusers
78
+ ```
79
+
80
+ **Full Changelog**: https://github.com/cagataycali/strands-diffusers/releases
81
+ draft: false
82
+ prerelease: false
83
+
84
+ - name: Summary
85
+ run: |
86
+ echo "## πŸŽ‰ Release Summary" >> $GITHUB_STEP_SUMMARY
87
+ echo "" >> $GITHUB_STEP_SUMMARY
88
+ echo "- **Version:** v${{ steps.get_version.outputs.version }}" >> $GITHUB_STEP_SUMMARY
89
+ echo "- **PyPI:** https://pypi.org/project/strands-diffusers/${{ steps.get_version.outputs.version }}/" >> $GITHUB_STEP_SUMMARY
90
+ echo "- **GitHub Release:** https://github.com/cagataycali/strands-diffusers/releases/tag/v${{ steps.get_version.outputs.version }}" >> $GITHUB_STEP_SUMMARY
@@ -0,0 +1,28 @@
1
+ name: CI
2
+ on:
3
+ push:
4
+ branches: [main]
5
+ pull_request:
6
+ jobs:
7
+ test:
8
+ runs-on: ubuntu-latest
9
+ strategy:
10
+ matrix:
11
+ python-version: ["3.10", "3.12"]
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ with:
15
+ fetch-depth: 0 # full history+tags so setuptools-scm can derive version
16
+ - uses: actions/setup-python@v5
17
+ with:
18
+ python-version: ${{ matrix.python-version }}
19
+ - name: Install
20
+ run: |
21
+ python -m pip install --upgrade pip
22
+ pip install -e ".[video,audio,dev]"
23
+ - name: Unit tests (no GPU, no model downloads)
24
+ run: pytest tests/ -q
25
+ - name: Smoke E2E (tiny fixtures)
26
+ env:
27
+ TOKENIZERS_PARALLELISM: "false"
28
+ run: python examples/smoke.py
@@ -0,0 +1,21 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.egg-info/
4
+ build/
5
+ dist/
6
+ .venv/
7
+ *.mp4
8
+ *.png
9
+ *.jpg
10
+ *.wav
11
+ *.json.tmp
12
+ assets/
13
+ .coverage
14
+ system_prompt.prompt
15
+ __pycache__/
16
+ *.pyc
17
+ .pytest_cache/
18
+ strands_diffusers/_version.py
19
+ build/
20
+ .pytest_cache/
21
+ __pycache__/
@@ -0,0 +1,199 @@
1
+ Metadata-Version: 2.4
2
+ Name: strands-diffusers
3
+ Version: 0.1.0
4
+ Summary: The universal entrypoint to HuggingFace diffusers for Strands agents — 100% pipeline & modality coverage, zero hardcoding. Special focus on Physical-AI world-foundation models (Cosmos) with robot action outputs.
5
+ Author-email: Cagatay Cali <cagataycali@icloud.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/cagataycali/strands-diffusers
8
+ Project-URL: Repository, https://github.com/cagataycali/strands-diffusers
9
+ Project-URL: Issues, https://github.com/cagataycali/strands-diffusers/issues
10
+ Keywords: strands,diffusers,huggingface,ai,agents,diffusion,video,image,vla,wfm,world-foundation-model,cosmos,robotics,physical-ai
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown
22
+ Requires-Dist: strands-agents
23
+ Requires-Dist: diffusers>=0.30
24
+ Requires-Dist: transformers>=4.40
25
+ Requires-Dist: torch
26
+ Requires-Dist: pillow
27
+ Requires-Dist: numpy
28
+ Requires-Dist: accelerate
29
+ Provides-Extra: video
30
+ Requires-Dist: imageio[ffmpeg]; extra == "video"
31
+ Requires-Dist: opencv-python; extra == "video"
32
+ Requires-Dist: av; extra == "video"
33
+ Provides-Extra: audio
34
+ Requires-Dist: soundfile; extra == "audio"
35
+ Requires-Dist: librosa; extra == "audio"
36
+ Provides-Extra: cosmos
37
+ Requires-Dist: cosmos_guardrail; extra == "cosmos"
38
+ Provides-Extra: dev
39
+ Requires-Dist: pytest>=7.0; extra == "dev"
40
+ Requires-Dist: black; extra == "dev"
41
+ Requires-Dist: ruff; extra == "dev"
42
+ Provides-Extra: all
43
+ Requires-Dist: strands-diffusers[audio,dev,video]; extra == "all"
44
+
45
+ # 🎨 strands-diffusers
46
+
47
+ **The universal entrypoint to HuggingFace `diffusers` for Strands agents — 100%
48
+ pipeline & modality coverage, zero hardcoding.**
49
+
50
+ Just like [`use_aws`](https://github.com/strands-agents) wraps boto3,
51
+ [`use_lerobot`](https://github.com/cagataycali) wraps lerobot, and
52
+ [`use_transformers`](https://github.com/cagataycali/strands-transformers) wraps the
53
+ transformers task taxonomy, **`use_diffusers`** wraps the *entire* diffusers
54
+ library behind a single tool. Discover, don't hardcode: the registry is built at
55
+ runtime from `diffusers._import_structure`, so when diffusers ships a new pipeline
56
+ (say, a fresh Cosmos world-foundation model), strands-diffusers supports it
57
+ **automatically — no code change required**.
58
+
59
+ ```
60
+ text / image / video / robot-state IN
61
+ image / video / audio / ACTIONS OUT — natively.
62
+ ```
63
+
64
+ ## 🌍 Physical-AI focus: world-foundation models with action outputs
65
+
66
+ The headline use-case is **NVIDIA Cosmos** and other world-foundation models
67
+ (WFMs). A Cosmos 3 *action-policy* rollout doesn't just generate a plausible
68
+ future video — it predicts the **robot action chunk** that produces it. A single
69
+ `use_diffusers(action="run", ...)` call returns BOTH:
70
+
71
+ - a playable world **video** (`.mp4`)
72
+ - the predicted **action** chunk in model-normalized action space (`.json`,
73
+ shape `[num_chunks, T, action_dim]`)
74
+ - (optionally) synchronized **sound** (`.wav`)
75
+
76
+ — all surfaced as artifact paths, ready to hand to a robot controller or the user.
77
+
78
+ > **Verified end-to-end** on NVIDIA Thor (diffusers `0.39.0.dev0`, `nvidia/Cosmos3-Nano`,
79
+ > bf16/cuda): one `use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", ...)`
80
+ > call produced a world video `(17, 480, 640, 3)` **and** a robot action chunk
81
+ > `(1, 16, 10)` = `(num_chunks, T, action_dim)`, normalized to `[-1, 1]`.
82
+ > See [`examples/cosmos_action_policy.py`](examples/cosmos_action_policy.py) and
83
+ > [`examples/SETUP_COSMOS.md`](examples/SETUP_COSMOS.md).
84
+
85
+ ## Install
86
+
87
+ ```bash
88
+ pip install -e .
89
+ # optional extras:
90
+ pip install -e ".[video,audio]" # mp4 export, wav I/O
91
+ ```
92
+
93
+ ## Quick start
94
+
95
+ ```python
96
+ from strands import Agent
97
+ from strands_diffusers import use_diffusers
98
+
99
+ agent = Agent(tools=[use_diffusers])
100
+ agent("Generate an image of a robot arm in a kitchen")
101
+ agent("Run a Cosmos action-policy rollout on robot.mp4 and give me the actions")
102
+ ```
103
+
104
+ Or drive it directly:
105
+
106
+ ```python
107
+ from strands_diffusers import use_diffusers
108
+
109
+ # text → image
110
+ use_diffusers(
111
+ action="run",
112
+ pipeline="StableDiffusionPipeline",
113
+ model="stabilityai/stable-diffusion-2-1",
114
+ parameters={"prompt": "a robot arm in a kitchen", "num_inference_steps": 25},
115
+ )
116
+ ```
117
+
118
+ ## Two layers
119
+
120
+ ### 1. `run` — high-level pipeline runner
121
+
122
+ Loads a pipeline class via `from_pretrained` and calls it. Inputs are coerced
123
+ (paths / URLs / base64 → PIL / video); outputs (image / video / audio / action)
124
+ are auto-saved and returned by path.
125
+
126
+ ```python
127
+ use_diffusers(action="run", pipeline="WanPipeline", model="...",
128
+ parameters={"prompt": "...", "num_frames": 81}, fps=16)
129
+ ```
130
+
131
+ ### 2. `call` — low-level dynamic dispatch
132
+
133
+ Resolve & call *any* diffusers class / function / method — schedulers, VAEs,
134
+ `CosmosActionCondition`, `utils.export_to_video`, or a cached pipeline's method.
135
+ `cached:key` references resolve to live objects; the `"**"` key unpacks a cached
136
+ mapping into kwargs (the `pipe(**inputs)` pattern).
137
+
138
+ ```python
139
+ # Build a Cosmos action condition, cache it, then run an action-policy rollout.
140
+ use_diffusers(action="call", target="CosmosActionCondition",
141
+ parameters={"mode": "policy", "chunk_size": 16,
142
+ "domain_name": "bridge_orig_lerobot",
143
+ "resolution_tier": 480, "video": "robot.mp4",
144
+ "view_point": "ego_view"},
145
+ cache_key="act_cond")
146
+
147
+ use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", model="nvidia/Cosmos3-Nano",
148
+ parameters={"prompt": "Put the pot to the left of the purple item.",
149
+ "action": "cached:act_cond", "fps": 5,
150
+ "num_inference_steps": 30, "guidance_scale": 1.0,
151
+ "use_system_prompt": False},
152
+ dtype="bfloat16", device="cuda")
153
+ # → artifacts: cosmos_world.mp4 + action chunk .json ([1, 16, action_dim])
154
+ ```
155
+
156
+ ## Discovery (the agent never guesses)
157
+
158
+ | action | what it returns |
159
+ |---|---|
160
+ | `pipelines` | all 300+ pipeline classes + derived modality |
161
+ | `models` | every model class (VAEs, transformers, controlnets) |
162
+ | `schedulers` | every scheduler class |
163
+ | `tasks` | diffusers' `AutoPipeline` task → `{family: class}` maps |
164
+ | `modalities` | pipelines grouped by modality (image / video / world / audio / **3d** mesh) |
165
+ | `wfm` | world-foundation / action-capable pipelines (Cosmos, Wan, Hunyuan) |
166
+ | `pipeline_info` | modality + `__call__` signature for one pipeline class |
167
+ | `inspect` | signature + docstring of any target |
168
+ | `visualize` | render a robot ACTION chunk → time-series + 3D trajectory + animation (mp4/gif) |
169
+ | `cache` / `clear_cache` | manage loaded pipelines (free GPU memory) |
170
+
171
+ ## Architecture
172
+
173
+ ```
174
+ strands_diffusers/
175
+ ├── core/
176
+ │ ├── registry.py # zero-hardcode taxonomy from diffusers._import_structure
177
+ │ ├── engine.py # load/cache pipelines, auto device+dtype
178
+ │ └── io.py # coerce inputs; serialize video/image/audio/ACTION outputs
179
+ └── tools/
180
+ └── use_diffusers.py # the single @tool: run + call + discovery
181
+ ```
182
+
183
+ ## Testing
184
+
185
+ ```bash
186
+ pip install -e ".[video,audio,dev]"
187
+ pytest tests/ -q # 26 unit tests — no GPU, no model downloads
188
+ python examples/smoke.py # E2E gate on tiny HF fixtures
189
+ ```
190
+
191
+ `tests/` covers the registry classifier (golden modalities + a guard that no
192
+ video/WFM pipeline is ever mislabeled as a still image), and the multimodal I/O
193
+ serializers — image, video (incl. `list[ndarray]`), **stereo audio** (channels-
194
+ first *and* channels-last), the robot **action** chunk, and **3D mesh** output
195
+ (ShapE → `.ply`/`.obj`/`.npz`). CI runs both on py3.10 + py3.12.
196
+
197
+ ## License
198
+
199
+ MIT
@@ -0,0 +1,155 @@
1
+ # 🎨 strands-diffusers
2
+
3
+ **The universal entrypoint to HuggingFace `diffusers` for Strands agents — 100%
4
+ pipeline & modality coverage, zero hardcoding.**
5
+
6
+ Just like [`use_aws`](https://github.com/strands-agents) wraps boto3,
7
+ [`use_lerobot`](https://github.com/cagataycali) wraps lerobot, and
8
+ [`use_transformers`](https://github.com/cagataycali/strands-transformers) wraps the
9
+ transformers task taxonomy, **`use_diffusers`** wraps the *entire* diffusers
10
+ library behind a single tool. Discover, don't hardcode: the registry is built at
11
+ runtime from `diffusers._import_structure`, so when diffusers ships a new pipeline
12
+ (say, a fresh Cosmos world-foundation model), strands-diffusers supports it
13
+ **automatically — no code change required**.
14
+
15
+ ```
16
+ text / image / video / robot-state IN
17
+ image / video / audio / ACTIONS OUT — natively.
18
+ ```
19
+
20
+ ## 🌍 Physical-AI focus: world-foundation models with action outputs
21
+
22
+ The headline use-case is **NVIDIA Cosmos** and other world-foundation models
23
+ (WFMs). A Cosmos 3 *action-policy* rollout doesn't just generate a plausible
24
+ future video — it predicts the **robot action chunk** that produces it. A single
25
+ `use_diffusers(action="run", ...)` call returns BOTH:
26
+
27
+ - a playable world **video** (`.mp4`)
28
+ - the predicted **action** chunk in model-normalized action space (`.json`,
29
+ shape `[num_chunks, T, action_dim]`)
30
+ - (optionally) synchronized **sound** (`.wav`)
31
+
32
+ — all surfaced as artifact paths, ready to hand to a robot controller or the user.
33
+
34
+ > **Verified end-to-end** on NVIDIA Thor (diffusers `0.39.0.dev0`, `nvidia/Cosmos3-Nano`,
35
+ > bf16/cuda): one `use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", ...)`
36
+ > call produced a world video `(17, 480, 640, 3)` **and** a robot action chunk
37
+ > `(1, 16, 10)` = `(num_chunks, T, action_dim)`, normalized to `[-1, 1]`.
38
+ > See [`examples/cosmos_action_policy.py`](examples/cosmos_action_policy.py) and
39
+ > [`examples/SETUP_COSMOS.md`](examples/SETUP_COSMOS.md).
40
+
41
+ ## Install
42
+
43
+ ```bash
44
+ pip install -e .
45
+ # optional extras:
46
+ pip install -e ".[video,audio]" # mp4 export, wav I/O
47
+ ```
48
+
49
+ ## Quick start
50
+
51
+ ```python
52
+ from strands import Agent
53
+ from strands_diffusers import use_diffusers
54
+
55
+ agent = Agent(tools=[use_diffusers])
56
+ agent("Generate an image of a robot arm in a kitchen")
57
+ agent("Run a Cosmos action-policy rollout on robot.mp4 and give me the actions")
58
+ ```
59
+
60
+ Or drive it directly:
61
+
62
+ ```python
63
+ from strands_diffusers import use_diffusers
64
+
65
+ # text → image
66
+ use_diffusers(
67
+ action="run",
68
+ pipeline="StableDiffusionPipeline",
69
+ model="stabilityai/stable-diffusion-2-1",
70
+ parameters={"prompt": "a robot arm in a kitchen", "num_inference_steps": 25},
71
+ )
72
+ ```
73
+
74
+ ## Two layers
75
+
76
+ ### 1. `run` — high-level pipeline runner
77
+
78
+ Loads a pipeline class via `from_pretrained` and calls it. Inputs are coerced
79
+ (paths / URLs / base64 → PIL / video); outputs (image / video / audio / action)
80
+ are auto-saved and returned by path.
81
+
82
+ ```python
83
+ use_diffusers(action="run", pipeline="WanPipeline", model="...",
84
+ parameters={"prompt": "...", "num_frames": 81}, fps=16)
85
+ ```
86
+
87
+ ### 2. `call` — low-level dynamic dispatch
88
+
89
+ Resolve & call *any* diffusers class / function / method — schedulers, VAEs,
90
+ `CosmosActionCondition`, `utils.export_to_video`, or a cached pipeline's method.
91
+ `cached:key` references resolve to live objects; the `"**"` key unpacks a cached
92
+ mapping into kwargs (the `pipe(**inputs)` pattern).
93
+
94
+ ```python
95
+ # Build a Cosmos action condition, cache it, then run an action-policy rollout.
96
+ use_diffusers(action="call", target="CosmosActionCondition",
97
+ parameters={"mode": "policy", "chunk_size": 16,
98
+ "domain_name": "bridge_orig_lerobot",
99
+ "resolution_tier": 480, "video": "robot.mp4",
100
+ "view_point": "ego_view"},
101
+ cache_key="act_cond")
102
+
103
+ use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", model="nvidia/Cosmos3-Nano",
104
+ parameters={"prompt": "Put the pot to the left of the purple item.",
105
+ "action": "cached:act_cond", "fps": 5,
106
+ "num_inference_steps": 30, "guidance_scale": 1.0,
107
+ "use_system_prompt": False},
108
+ dtype="bfloat16", device="cuda")
109
+ # → artifacts: cosmos_world.mp4 + action chunk .json ([1, 16, action_dim])
110
+ ```
111
+
112
+ ## Discovery (the agent never guesses)
113
+
114
+ | action | what it returns |
115
+ |---|---|
116
+ | `pipelines` | all 300+ pipeline classes + derived modality |
117
+ | `models` | every model class (VAEs, transformers, controlnets) |
118
+ | `schedulers` | every scheduler class |
119
+ | `tasks` | diffusers' `AutoPipeline` task → `{family: class}` maps |
120
+ | `modalities` | pipelines grouped by modality (image / video / world / audio / **3d** mesh) |
121
+ | `wfm` | world-foundation / action-capable pipelines (Cosmos, Wan, Hunyuan) |
122
+ | `pipeline_info` | modality + `__call__` signature for one pipeline class |
123
+ | `inspect` | signature + docstring of any target |
124
+ | `visualize` | render a robot ACTION chunk → time-series + 3D trajectory + animation (mp4/gif) |
125
+ | `cache` / `clear_cache` | manage loaded pipelines (free GPU memory) |
126
+
127
+ ## Architecture
128
+
129
+ ```
130
+ strands_diffusers/
131
+ ├── core/
132
+ │ ├── registry.py # zero-hardcode taxonomy from diffusers._import_structure
133
+ │ ├── engine.py # load/cache pipelines, auto device+dtype
134
+ │ └── io.py # coerce inputs; serialize video/image/audio/ACTION outputs
135
+ └── tools/
136
+ └── use_diffusers.py # the single @tool: run + call + discovery
137
+ ```
138
+
139
+ ## Testing
140
+
141
+ ```bash
142
+ pip install -e ".[video,audio,dev]"
143
+ pytest tests/ -q # 26 unit tests — no GPU, no model downloads
144
+ python examples/smoke.py # E2E gate on tiny HF fixtures
145
+ ```
146
+
147
+ `tests/` covers the registry classifier (golden modalities + a guard that no
148
+ video/WFM pipeline is ever mislabeled as a still image), and the multimodal I/O
149
+ serializers — image, video (incl. `list[ndarray]`), **stereo audio** (channels-
150
+ first *and* channels-last), the robot **action** chunk, and **3D mesh** output
151
+ (ShapE → `.ply`/`.obj`/`.npz`). CI runs both on py3.10 + py3.12.
152
+
153
+ ## License
154
+
155
+ MIT
@@ -0,0 +1,52 @@
1
+ # strands-diffusers examples
2
+
3
+ All examples import the real `use_diffusers` tool and run **real diffusion
4
+ inference** (no mocks). The image/video examples use tiny HF test fixtures so they
5
+ run fast on any machine — swap `model` for a full checkpoint to get real quality.
6
+
7
+ | example | what it shows | path used | model |
8
+ |---|---|---|---|
9
+ | `text_to_image.py` | text → image, image→png artifact | `run` | tiny-stable-diffusion-pipe |
10
+ | `text_to_video.py` | text → video, video→mp4 artifact | `run` | tiny-random-ltx-video |
11
+ | `cosmos_action_policy.py` | **WFM action-policy: video + robot ACTION out** | `call` + `run` | nvidia/Cosmos3-Nano |
12
+ | `smoke.py` | fast E2E gate (discovery + img + video + action serializer) | all | tiny fixtures |
13
+
14
+ ## run vs call
15
+
16
+ - **`run`** — high-level. Give it a `pipeline` class name + `model` repo +
17
+ `parameters`. It loads (and caches) the pipeline, coerces inputs, runs it, and
18
+ serializes every output (image/video/audio/**action**) to an artifact path.
19
+
20
+ - **`call`** — low-level dynamic dispatch. Resolve & call *any* diffusers class,
21
+ function, or method: schedulers, VAEs, `CosmosActionCondition`,
22
+ `utils.export_to_video`, or a cached pipeline's method. Use `cache_key` to stash
23
+ a constructed object and `cached:key` (or `{"**": "cached:key"}`) to feed it
24
+ back into a later call. This is how the Cosmos example builds an action
25
+ condition and threads it into the pipeline run.
26
+
27
+ ## The action payload (why this library exists)
28
+
29
+ World-foundation models like NVIDIA Cosmos 3 emit a `Cosmos3OmniPipelineOutput`
30
+ with `video`, optional `sound`, and **`action`** (a `list[torch.Tensor]`, each a
31
+ normalized action chunk `[T, action_dim]`). `core/io.py` serializes:
32
+
33
+ - `video` → `.mp4` (via `diffusers.utils.export_to_video`, imageio fallback, gif last resort)
34
+ - `sound` → `.wav` (soundfile or stdlib `wave`)
35
+ - `action` → `.json` (full nested list + `chunk_shape` / `num_chunks` metadata)
36
+
37
+ So one `use_diffusers(action="run", ...)` hands the agent both a playable world and
38
+ a robot-ready action vector.
39
+
40
+ ## Cosmos3OmniPipeline availability
41
+
42
+ `Cosmos3OmniPipeline` ships in **diffusers from source** (>0.38). `use_diffusers`
43
+ resolves pipeline classes dynamically, so the moment your diffusers has it, the
44
+ example works unchanged:
45
+
46
+ ```bash
47
+ pip install 'git+https://github.com/huggingface/diffusers'
48
+ python examples/cosmos_action_policy.py
49
+ ```
50
+
51
+ On older diffusers the example degrades gracefully and still lists the
52
+ action-capable WFM pipelines available now (Cosmos2*, CosmosVideoToWorld, Wan, …).
@@ -0,0 +1,56 @@
1
+ # Running real Cosmos 3 (action-policy) with strands-diffusers
2
+
3
+ `Cosmos3OmniPipeline` + `CosmosActionCondition` ship in **diffusers from source**
4
+ (>0.38). `use_diffusers` resolves pipeline classes dynamically, so no code change
5
+ is needed once they're importable.
6
+
7
+ ## Option A — install from source into your env
8
+
9
+ ```bash
10
+ pip install 'git+https://github.com/huggingface/diffusers'
11
+ python examples/cosmos_action_policy.py
12
+ ```
13
+
14
+ ## Option B — side-load from source (don't disturb a pinned diffusers)
15
+
16
+ Install only the diffusers source tree (no deps) and prepend it to `PYTHONPATH`,
17
+ reusing your existing torch / transformers:
18
+
19
+ ```bash
20
+ pip install 'git+https://github.com/huggingface/diffusers' --no-deps --target /tmp/dmain
21
+ PYTHONPATH=/tmp/dmain python examples/cosmos_action_policy.py
22
+ ```
23
+
24
+ This is exactly how the example above was verified end-to-end.
25
+
26
+ ## What you get
27
+
28
+ A single `use_diffusers(action="run", pipeline="Cosmos3OmniPipeline", ...)` returns
29
+ a `Cosmos3OmniPipelineOutput` that `core/io.py` serializes to artifacts:
30
+
31
+ ```
32
+ 📎 artifacts:
33
+ • /tmp/strands_diffusers/video_*.mp4 # world rollout (17, 480, 640, 3)
34
+ • /tmp/strands_diffusers/action_*.json # robot actions (1, 16, 10)
35
+ ```
36
+
37
+ The action JSON is the model-normalized action chunk `[num_chunks, T, action_dim]`
38
+ (values in `[-1, 1]`) — feed it straight to your embodiment's un-normalizer /
39
+ controller. Pick `domain_name` to match your robot (e.g. `bridge_orig_lerobot`).
40
+
41
+ ## Action modes (see the Cosmos 3 docs)
42
+
43
+ - `policy` — predict future video **and** actions from the first frame + task.
44
+ - `forward_dynamics` — roll out video from a first frame + a given `raw_actions` seq.
45
+ - `inverse_dynamics` — infer the actions connecting the frames of a conditioning video.
46
+
47
+ Build the condition with `use_diffusers(action="call",
48
+ target="CosmosActionCondition", parameters={...}, cache_key="cond")` then pass
49
+ `parameters={"action": "cached:cond"}` to the run.
50
+
51
+ ## Notes
52
+
53
+ - ~33 GB of weights for `nvidia/Cosmos3-Nano`; needs a CUDA GPU.
54
+ - The default NVIDIA guardrail (`cosmos_guardrail`) is on under the model license.
55
+ These demos pass `enable_safety_checker=False` for development; keep it enabled
56
+ for anything public-facing.