@swarmclawai/swarmclaw 1.2.1 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. package/README.md +9 -0
  2. package/package.json +2 -2
  3. package/skills/coding-agent/SKILL.md +111 -0
  4. package/skills/github/SKILL.md +140 -0
  5. package/skills/nano-banana-pro/SKILL.md +62 -0
  6. package/skills/nano-banana-pro/scripts/generate_image.py +235 -0
  7. package/skills/nano-pdf/SKILL.md +53 -0
  8. package/skills/openai-image-gen/SKILL.md +78 -0
  9. package/skills/openai-image-gen/scripts/gen.py +328 -0
  10. package/skills/resourceful-problem-solving/SKILL.md +49 -0
  11. package/skills/skill-creator/SKILL.md +147 -0
  12. package/skills/skill-creator/scripts/init_skill.py +378 -0
  13. package/skills/skill-creator/scripts/quick_validate.py +159 -0
  14. package/skills/summarize/SKILL.md +77 -0
  15. package/src/app/api/auth/route.ts +20 -5
  16. package/src/app/api/chats/[id]/devserver/route.ts +13 -19
  17. package/src/app/api/chats/[id]/messages/route.ts +13 -15
  18. package/src/app/api/chats/[id]/route.ts +9 -10
  19. package/src/app/api/chats/[id]/stop/route.ts +5 -7
  20. package/src/app/api/chats/messages-route.test.ts +8 -6
  21. package/src/app/api/chats/route.ts +9 -10
  22. package/src/app/api/ip/route.ts +2 -2
  23. package/src/app/api/preview-server/route.ts +1 -1
  24. package/src/app/api/projects/[id]/route.ts +7 -46
  25. package/src/components/chat/chat-area.tsx +45 -23
  26. package/src/components/chat/message-bubble.test.ts +35 -0
  27. package/src/components/chat/message-bubble.tsx +19 -9
  28. package/src/components/chat/message-list.tsx +37 -3
  29. package/src/components/input/chat-input.tsx +34 -14
  30. package/src/instrumentation.ts +1 -1
  31. package/src/lib/chat/assistant-render-id.ts +3 -0
  32. package/src/lib/chat/chat-streaming-state.test.ts +42 -3
  33. package/src/lib/chat/chat-streaming-state.ts +20 -8
  34. package/src/lib/chat/queued-message-queue.test.ts +23 -1
  35. package/src/lib/chat/queued-message-queue.ts +11 -2
  36. package/src/lib/providers/cli-utils.test.ts +124 -0
  37. package/src/lib/server/activity/activity-log.ts +21 -0
  38. package/src/lib/server/agents/agent-availability.test.ts +10 -5
  39. package/src/lib/server/agents/agent-cascade.ts +79 -59
  40. package/src/lib/server/agents/agent-registry.ts +3 -1
  41. package/src/lib/server/agents/agent-repository.ts +90 -0
  42. package/src/lib/server/agents/delegation-job-repository.ts +53 -0
  43. package/src/lib/server/agents/delegation-jobs.ts +11 -4
  44. package/src/lib/server/agents/guardian-checkpoint-repository.ts +35 -0
  45. package/src/lib/server/agents/guardian.ts +2 -2
  46. package/src/lib/server/agents/main-agent-loop.ts +10 -3
  47. package/src/lib/server/agents/main-loop-state-repository.ts +38 -0
  48. package/src/lib/server/agents/subagent-runtime.ts +9 -6
  49. package/src/lib/server/agents/subagent-swarm.ts +3 -2
  50. package/src/lib/server/agents/task-session.ts +3 -4
  51. package/src/lib/server/approvals/approval-repository.ts +30 -0
  52. package/src/lib/server/autonomy/supervisor-incident-repository.ts +42 -0
  53. package/src/lib/server/chat-execution/chat-execution-types.ts +38 -0
  54. package/src/lib/server/chat-execution/chat-execution-utils.ts +1 -1
  55. package/src/lib/server/chat-execution/chat-execution.ts +84 -1926
  56. package/src/lib/server/chat-execution/chat-turn-finalization.ts +620 -0
  57. package/src/lib/server/chat-execution/chat-turn-partial-persistence.ts +221 -0
  58. package/src/lib/server/chat-execution/chat-turn-preflight.ts +133 -0
  59. package/src/lib/server/chat-execution/chat-turn-preparation.ts +817 -0
  60. package/src/lib/server/chat-execution/chat-turn-stream-execution.ts +296 -0
  61. package/src/lib/server/chat-execution/chat-turn-tool-routing.ts +5 -5
  62. package/src/lib/server/chat-execution/message-classifier.test.ts +329 -0
  63. package/src/lib/server/chat-execution/post-stream-finalization.ts +1 -1
  64. package/src/lib/server/chat-execution/prompt-builder.ts +11 -0
  65. package/src/lib/server/chat-execution/prompt-sections.ts +5 -6
  66. package/src/lib/server/chat-execution/situational-awareness.ts +12 -7
  67. package/src/lib/server/chat-execution/stream-agent-chat.ts +16 -13
  68. package/src/lib/server/chatrooms/chatroom-repository.ts +32 -0
  69. package/src/lib/server/connectors/connector-repository.ts +58 -0
  70. package/src/lib/server/connectors/runtime-state.test.ts +117 -0
  71. package/src/lib/server/credentials/credential-repository.ts +7 -0
  72. package/src/lib/server/gateways/gateway-profile-repository.ts +4 -0
  73. package/src/lib/server/memory/memory-abstract.test.ts +59 -0
  74. package/src/lib/server/missions/mission-repository.ts +74 -0
  75. package/src/lib/server/missions/mission-service/actions.ts +6 -0
  76. package/src/lib/server/missions/mission-service/bindings.ts +9 -0
  77. package/src/lib/server/missions/mission-service/context.ts +4 -0
  78. package/src/lib/server/missions/mission-service/core.ts +2269 -0
  79. package/src/lib/server/missions/mission-service/queries.ts +12 -0
  80. package/src/lib/server/missions/mission-service/recovery.ts +5 -0
  81. package/src/lib/server/missions/mission-service/ticks.ts +9 -0
  82. package/src/lib/server/missions/mission-service.test.ts +9 -2
  83. package/src/lib/server/missions/mission-service.ts +6 -2266
  84. package/src/lib/server/persistence/repository-utils.ts +154 -0
  85. package/src/lib/server/persistence/storage-context.ts +51 -0
  86. package/src/lib/server/persistence/transaction.ts +1 -0
  87. package/src/lib/server/projects/project-repository.ts +36 -0
  88. package/src/lib/server/projects/project-service.ts +79 -0
  89. package/src/lib/server/protocols/protocol-normalization.test.ts +6 -4
  90. package/src/lib/server/runtime/alert-dispatch.ts +1 -1
  91. package/src/lib/server/runtime/daemon-policy.ts +1 -1
  92. package/src/lib/server/runtime/daemon-state/core.ts +1570 -0
  93. package/src/lib/server/runtime/daemon-state/health.ts +6 -0
  94. package/src/lib/server/runtime/daemon-state/policy.ts +7 -0
  95. package/src/lib/server/runtime/daemon-state/supervisor.ts +6 -0
  96. package/src/lib/server/runtime/daemon-state.test.ts +48 -0
  97. package/src/lib/server/runtime/daemon-state.ts +3 -1470
  98. package/src/lib/server/runtime/estop-repository.ts +4 -0
  99. package/src/lib/server/runtime/estop.ts +3 -1
  100. package/src/lib/server/runtime/heartbeat-service.test.ts +2 -2
  101. package/src/lib/server/runtime/heartbeat-service.ts +55 -34
  102. package/src/lib/server/runtime/heartbeat-wake.ts +6 -4
  103. package/src/lib/server/runtime/idle-window.ts +2 -2
  104. package/src/lib/server/runtime/network.ts +11 -0
  105. package/src/lib/server/runtime/orchestrator-events.ts +2 -2
  106. package/src/lib/server/runtime/queue/claims.ts +4 -0
  107. package/src/lib/server/runtime/queue/core.ts +2079 -0
  108. package/src/lib/server/runtime/queue/execution.ts +7 -0
  109. package/src/lib/server/runtime/queue/followups.ts +4 -0
  110. package/src/lib/server/runtime/queue/queries.ts +12 -0
  111. package/src/lib/server/runtime/queue/recovery.ts +7 -0
  112. package/src/lib/server/runtime/queue-recovery.test.ts +48 -13
  113. package/src/lib/server/runtime/queue-repository.ts +17 -0
  114. package/src/lib/server/runtime/queue.ts +5 -2061
  115. package/src/lib/server/runtime/run-ledger.ts +6 -5
  116. package/src/lib/server/runtime/run-repository.ts +73 -0
  117. package/src/lib/server/runtime/runtime-lock-repository.ts +8 -0
  118. package/src/lib/server/runtime/runtime-settings.ts +1 -1
  119. package/src/lib/server/runtime/runtime-state.ts +99 -0
  120. package/src/lib/server/runtime/scheduler.ts +4 -2
  121. package/src/lib/server/runtime/session-run-manager/cancellation.ts +157 -0
  122. package/src/lib/server/runtime/session-run-manager/drain.ts +246 -0
  123. package/src/lib/server/runtime/session-run-manager/enqueue.ts +287 -0
  124. package/src/lib/server/runtime/session-run-manager/queries.ts +117 -0
  125. package/src/lib/server/runtime/session-run-manager/recovery.ts +238 -0
  126. package/src/lib/server/runtime/session-run-manager/state.ts +441 -0
  127. package/src/lib/server/runtime/session-run-manager/types.ts +74 -0
  128. package/src/lib/server/runtime/session-run-manager.ts +72 -1377
  129. package/src/lib/server/runtime/watch-job-repository.ts +35 -0
  130. package/src/lib/server/runtime/watch-jobs.ts +3 -1
  131. package/src/lib/server/schedules/schedule-repository.ts +42 -0
  132. package/src/lib/server/sessions/session-repository.ts +85 -0
  133. package/src/lib/server/settings/settings-repository.ts +25 -0
  134. package/src/lib/server/skills/skill-discovery.test.ts +2 -2
  135. package/src/lib/server/skills/skill-discovery.ts +2 -2
  136. package/src/lib/server/skills/skill-repository.ts +14 -0
  137. package/src/lib/server/storage.ts +13 -24
  138. package/src/lib/server/tasks/task-repository.ts +54 -0
  139. package/src/lib/server/usage/usage-repository.ts +30 -0
  140. package/src/lib/server/webhooks/webhook-repository.ts +10 -0
  141. package/src/lib/strip-internal-metadata.test.ts +42 -41
  142. package/src/stores/use-chat-store.test.ts +54 -0
  143. package/src/stores/use-chat-store.ts +21 -5
  144. /package/{bundled-skills → skills}/google-workspace/SKILL.md +0 -0
package/README.md CHANGED
@@ -190,6 +190,15 @@ The building blocks are the same: **agents, tools, memory, delegation, schedules
190
190
 
191
191
  ## Release Notes
192
192
 
193
+ ### v1.2.2 Highlights
194
+
195
+ - **Modular chat execution pipeline**: decomposed the monolithic chat-execution module into 6 focused modules (preflight, preparation, stream execution, partial persistence, finalization, and shared types) for maintainability and testability.
196
+ - **Repository pattern adoption**: extracted ~15 repository modules from `storage.ts`, giving each domain (agents, sessions, missions, credentials, tasks, etc.) its own data-access layer.
197
+ - **Runtime state encapsulation**: moved process-local state (active sessions, dev servers) from storage into `runtime-state.ts` with proper HMR singleton usage.
198
+ - **Streaming state improvements**: stable assistant render IDs, better live-row display logic, and smoother streaming phase transitions in the chat UI.
199
+ - **8 new skills**: coding-agent, github, nano-banana-pro, nano-pdf, openai-image-gen, resourceful-problem-solving, skill-creator, summarize.
200
+ - **Lint baseline improvements**: reduced lint violations from 414 to 396 (-18).
201
+
193
202
  ### v1.2.1 Highlights
194
203
 
195
204
  - **System health endpoint**: new `/api/system/status` route returns lightweight health summary for external monitoring and uptime checks.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@swarmclawai/swarmclaw",
3
- "version": "1.2.1",
3
+ "version": "1.2.2",
4
4
  "description": "Self-hosted AI runtime for OpenClaw, delegation, autonomy, runtime skills, crypto wallets, and chat platform connectors.",
5
5
  "license": "MIT",
6
6
  "publishConfig": {
@@ -30,7 +30,7 @@
30
30
  },
31
31
  "files": [
32
32
  "bin/",
33
- "bundled-skills/",
33
+ "skills/",
34
34
  "src/",
35
35
  "public/",
36
36
  "Dockerfile.sandbox-browser",
@@ -0,0 +1,111 @@
1
+ ---
2
+ name: coding-agent
3
+ description: 'Delegate coding tasks to external coding agents (Claude Code, Codex, Pi, OpenCode) via shell. Use when: (1) building new features or apps in a separate project, (2) reviewing PRs, (3) refactoring large codebases, (4) iterative coding that needs file exploration. NOT for: simple one-liner fixes (just edit directly), reading code (use read/file tools), or work inside the SwarmClaw workspace itself.'
4
+ metadata:
5
+ {
6
+ "openclaw": { "emoji": "🧩", "requires": { "anyBins": ["claude", "codex", "opencode", "pi"] } },
7
+ }
8
+ ---
9
+
10
+ # Coding Agent
11
+
12
+ Delegate coding tasks to external coding agents via shell tools.
13
+
14
+ ## Agent Execution Modes
15
+
16
+ ### Claude Code (recommended)
17
+
18
+ Use `--print --permission-mode bypassPermissions` for non-interactive execution:
19
+
20
+ ```bash
21
+ cd /path/to/project && claude --permission-mode bypassPermissions --print 'Your task here'
22
+ ```
23
+
24
+ For background execution, use the shell tool's background mode.
25
+
26
+ **Do NOT use PTY mode with Claude Code** — `--print` mode keeps full tool access and avoids interactive confirmation dialogs.
27
+
28
+ ### Codex
29
+
30
+ Codex requires a git repository and PTY mode:
31
+
32
+ ```bash
33
+ # Quick one-shot (auto-approves changes)
34
+ cd /path/to/project && codex exec --full-auto 'Build a dark mode toggle'
35
+
36
+ # Codex refuses to run outside a git directory. For scratch work:
37
+ SCRATCH=$(mktemp -d) && cd $SCRATCH && git init && codex exec "Your prompt"
38
+ ```
39
+
40
+ ### Pi Coding Agent
41
+
42
+ ```bash
43
+ # Install: npm install -g @mariozechner/pi-coding-agent
44
+ cd /path/to/project && pi 'Your task'
45
+
46
+ # Non-interactive mode
47
+ pi -p 'Summarize src/'
48
+
49
+ # Different provider/model
50
+ pi --provider openai --model gpt-4o-mini -p 'Your task'
51
+ ```
52
+
53
+ ### OpenCode
54
+
55
+ ```bash
56
+ cd /path/to/project && opencode run 'Your task'
57
+ ```
58
+
59
+ ## PR Reviews
60
+
61
+ Clone to a temp folder or use git worktree — never review PRs in the SwarmClaw project directory:
62
+
63
+ ```bash
64
+ # Clone to temp for safe review
65
+ REVIEW_DIR=$(mktemp -d)
66
+ git clone https://github.com/user/repo.git $REVIEW_DIR
67
+ cd $REVIEW_DIR && gh pr checkout 130
68
+ codex review --base origin/main
69
+
70
+ # Or use git worktree
71
+ git worktree add /tmp/pr-130-review pr-130-branch
72
+ cd /tmp/pr-130-review && codex review --base main
73
+ ```
74
+
75
+ ## Parallel Issue Fixing
76
+
77
+ Use git worktrees to fix multiple issues in parallel:
78
+
79
+ ```bash
80
+ # Create worktrees
81
+ git worktree add -b fix/issue-78 /tmp/issue-78 main
82
+ git worktree add -b fix/issue-99 /tmp/issue-99 main
83
+
84
+ # Launch agents (use background shell execution)
85
+ cd /tmp/issue-78 && codex --yolo 'Fix issue #78: <description>. Commit when done.'
86
+ cd /tmp/issue-99 && codex --yolo 'Fix issue #99: <description>. Commit when done.'
87
+
88
+ # Create PRs after
89
+ cd /tmp/issue-78 && git push -u origin fix/issue-78
90
+ gh pr create --repo user/repo --head fix/issue-78 --title "fix: ..." --body "..."
91
+
92
+ # Cleanup
93
+ git worktree remove /tmp/issue-78
94
+ git worktree remove /tmp/issue-99
95
+ ```
96
+
97
+ ## Rules
98
+
99
+ 1. **Use the right execution mode per agent**: Claude Code uses `--print` (no PTY); Codex/Pi/OpenCode may need an interactive terminal.
100
+ 2. **Respect tool choice** — if the user asks for Codex, use Codex. Don't silently switch agents.
101
+ 3. **Be patient** — don't kill sessions because they seem slow.
102
+ 4. **Monitor progress** — check output periodically without interfering.
103
+ 5. **Never run coding agents inside the SwarmClaw project directory** — use a separate project directory or temp folder.
104
+
105
+ ## Progress Updates
106
+
107
+ When spawning coding agents in the background:
108
+
109
+ - Send a short message when you start (what's running, where).
110
+ - Update only when something changes (milestone, error, completion).
111
+ - If you kill a session, say so immediately and explain why.
@@ -0,0 +1,140 @@
1
+ ---
2
+ name: github
3
+ description: "GitHub operations via `gh` CLI: issues, PRs, CI runs, code review, API queries. Use when: (1) checking PR status or CI, (2) creating/commenting on issues, (3) listing/filtering PRs or issues, (4) viewing run logs. NOT for: local git operations (use git directly), non-GitHub repos, or cloning (use git clone)."
4
+ metadata:
5
+ {
6
+ "openclaw":
7
+ {
8
+ "emoji": "🐙",
9
+ "requires": { "bins": ["gh"] },
10
+ "install":
11
+ [
12
+ {
13
+ "id": "brew",
14
+ "kind": "brew",
15
+ "formula": "gh",
16
+ "bins": ["gh"],
17
+ "label": "Install GitHub CLI (brew)",
18
+ },
19
+ {
20
+ "id": "apt",
21
+ "kind": "apt",
22
+ "package": "gh",
23
+ "bins": ["gh"],
24
+ "label": "Install GitHub CLI (apt)",
25
+ },
26
+ ],
27
+ },
28
+ }
29
+ ---
30
+
31
+ # GitHub Skill
32
+
33
+ Use the `gh` CLI to interact with GitHub repositories, issues, PRs, and CI.
34
+
35
+ ## Setup
36
+
37
+ ```bash
38
+ # Authenticate (one-time)
39
+ gh auth login
40
+
41
+ # Verify
42
+ gh auth status
43
+ ```
44
+
45
+ ## Common Commands
46
+
47
+ ### Pull Requests
48
+
49
+ ```bash
50
+ # List PRs
51
+ gh pr list --repo owner/repo
52
+
53
+ # Check CI status
54
+ gh pr checks 55 --repo owner/repo
55
+
56
+ # View PR details
57
+ gh pr view 55 --repo owner/repo
58
+
59
+ # Create PR
60
+ gh pr create --title "feat: add feature" --body "Description"
61
+
62
+ # Merge PR
63
+ gh pr merge 55 --squash --repo owner/repo
64
+ ```
65
+
66
+ ### Issues
67
+
68
+ ```bash
69
+ # List issues
70
+ gh issue list --repo owner/repo --state open
71
+
72
+ # Create issue
73
+ gh issue create --title "Bug: something broken" --body "Details..."
74
+
75
+ # Close issue
76
+ gh issue close 42 --repo owner/repo
77
+ ```
78
+
79
+ ### CI/Workflow Runs
80
+
81
+ ```bash
82
+ # List recent runs
83
+ gh run list --repo owner/repo --limit 10
84
+
85
+ # View specific run
86
+ gh run view <run-id> --repo owner/repo
87
+
88
+ # View failed step logs only
89
+ gh run view <run-id> --repo owner/repo --log-failed
90
+
91
+ # Re-run failed jobs
92
+ gh run rerun <run-id> --failed --repo owner/repo
93
+ ```
94
+
95
+ ### API Queries
96
+
97
+ ```bash
98
+ # Get PR with specific fields
99
+ gh api repos/owner/repo/pulls/55 --jq '.title, .state, .user.login'
100
+
101
+ # List all labels
102
+ gh api repos/owner/repo/labels --jq '.[].name'
103
+
104
+ # Get repo stats
105
+ gh api repos/owner/repo --jq '{stars: .stargazers_count, forks: .forks_count}'
106
+ ```
107
+
108
+ ## JSON Output
109
+
110
+ Most commands support `--json` for structured output with `--jq` filtering:
111
+
112
+ ```bash
113
+ gh issue list --repo owner/repo --json number,title --jq '.[] | "\(.number): \(.title)"'
114
+ gh pr list --json number,title,state,mergeable --jq '.[] | select(.mergeable == "MERGEABLE")'
115
+ ```
116
+
117
+ ## Templates
118
+
119
+ ### PR Review Summary
120
+
121
+ ```bash
122
+ PR=55 REPO=owner/repo
123
+ echo "## PR #$PR Summary"
124
+ gh pr view $PR --repo $REPO --json title,body,author,additions,deletions,changedFiles \
125
+ --jq '"**\(.title)** by @\(.author.login)\n\n\(.body)\n\n+\(.additions) -\(.deletions) across \(.changedFiles) files"'
126
+ gh pr checks $PR --repo $REPO
127
+ ```
128
+
129
+ ### Issue Triage
130
+
131
+ ```bash
132
+ gh issue list --repo owner/repo --state open --json number,title,labels,createdAt \
133
+ --jq '.[] | "[\(.number)] \(.title) - \([.labels[].name] | join(", ")) (\(.createdAt[:10]))"'
134
+ ```
135
+
136
+ ## Notes
137
+
138
+ - Always specify `--repo owner/repo` when not in a git directory.
139
+ - Use URLs directly: `gh pr view https://github.com/owner/repo/pull/55`
140
+ - Rate limits apply; use `gh api --cache 1h` for repeated queries.
@@ -0,0 +1,62 @@
1
+ ---
2
+ name: nano-banana-pro
3
+ description: Generate or edit images via Gemini 3 Pro Image (Nano Banana Pro). Use when asked to create, generate, or edit images and a Gemini API key is available. Supports text-to-image generation, single-image editing, and multi-image composition (up to 14 images).
4
+ metadata:
5
+ {
6
+ "openclaw":
7
+ {
8
+ "emoji": "🍌",
9
+ "requires": { "bins": ["uv"], "env": ["GEMINI_API_KEY"] },
10
+ "primaryEnv": "GEMINI_API_KEY",
11
+ "install":
12
+ [
13
+ {
14
+ "id": "uv-brew",
15
+ "kind": "brew",
16
+ "formula": "uv",
17
+ "bins": ["uv"],
18
+ "label": "Install uv (brew)",
19
+ },
20
+ ],
21
+ },
22
+ }
23
+ ---
24
+
25
+ # Nano Banana Pro (Gemini 3 Pro Image)
26
+
27
+ Use the bundled script to generate or edit images.
28
+
29
+ ## Generate
30
+
31
+ ```bash
32
+ uv run {baseDir}/scripts/generate_image.py --prompt "your image description" --filename "output.png" --resolution 1K
33
+ ```
34
+
35
+ ## Edit (Single Image)
36
+
37
+ ```bash
38
+ uv run {baseDir}/scripts/generate_image.py --prompt "edit instructions" --filename "output.png" -i "/path/in.png" --resolution 2K
39
+ ```
40
+
41
+ ## Multi-Image Composition (up to 14 images)
42
+
43
+ ```bash
44
+ uv run {baseDir}/scripts/generate_image.py --prompt "combine these into one scene" --filename "output.png" -i img1.png -i img2.png -i img3.png
45
+ ```
46
+
47
+ ## API Key
48
+
49
+ Set `GEMINI_API_KEY` as an environment variable, or pass `--api-key <KEY>` to the script.
50
+
51
+ ## Aspect Ratio (optional)
52
+
53
+ ```bash
54
+ uv run {baseDir}/scripts/generate_image.py --prompt "portrait photo" --filename "output.png" --aspect-ratio 9:16
55
+ ```
56
+
57
+ ## Notes
58
+
59
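+ - Resolutions: `1K`, `2K`, `4K`. Without `--resolution`, text-to-image defaults to `1K`; when input images are given, the script auto-detects from the largest input dimension (≥3000 px → `4K`, ≥1500 px → `2K`, otherwise `1K`).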
60
+ - Aspect ratios: `1:1`, `2:3`, `3:2`, `3:4`, `4:3`, `4:5`, `5:4`, `9:16`, `16:9`, `21:9`. Without `--aspect-ratio`, the model picks freely.
61
+ - Use timestamps in filenames for uniqueness: `yyyy-mm-dd-hh-mm-ss-name.png`.
62
+ - Do not read the image back into context; report the saved path only.
@@ -0,0 +1,235 @@
1
+ #!/usr/bin/env python3
2
+ # /// script
3
+ # requires-python = ">=3.10"
4
+ # dependencies = [
5
+ # "google-genai>=1.0.0",
6
+ # "pillow>=10.0.0",
7
+ # ]
8
+ # ///
9
+ """
10
+ Generate images using Google's Nano Banana Pro (Gemini 3 Pro Image) API.
11
+
12
+ Usage:
13
+ uv run generate_image.py --prompt "your image description" --filename "output.png" [--resolution 1K|2K|4K] [--aspect-ratio W:H] [--api-key KEY]
14
+
15
+ Multi-image editing (up to 14 images):
16
+ uv run generate_image.py --prompt "combine these images" --filename "output.png" -i img1.png -i img2.png -i img3.png
17
+ """
18
+
19
+ import argparse
20
+ import os
21
+ import sys
22
+ from pathlib import Path
23
+
24
# Aspect ratios accepted by --aspect-ratio, in the order they are shown in
# the CLI help text (square first, then progressively wider/taller pairs).
SUPPORTED_ASPECT_RATIOS = [
    "1:1",
    "2:3", "3:2",
    "3:4", "4:3",
    "4:5", "5:4",
    "9:16", "16:9",
    "21:9",
]
36
+
37
+
38
+ def get_api_key(provided_key: str | None) -> str | None:
39
+ """Get API key from argument first, then environment."""
40
+ if provided_key:
41
+ return provided_key
42
+ return os.environ.get("GEMINI_API_KEY")
43
+
44
+
45
def auto_detect_resolution(max_input_dim: int) -> str:
    """Map the largest input-image dimension to an output resolution label.

    Returns "4K" for dimensions of at least 3000, "2K" for at least 1500,
    and "1K" for anything smaller.
    """
    # Thresholds are checked from largest to smallest; first match wins.
    tiers = ((3000, "4K"), (1500, "2K"))
    for minimum, label in tiers:
        if max_input_dim >= minimum:
            return label
    return "1K"
52
+
53
+
54
+ def choose_output_resolution(
55
+ requested_resolution: str | None,
56
+ max_input_dim: int,
57
+ has_input_images: bool,
58
+ ) -> tuple[str, bool]:
59
+ """Choose final resolution and whether it was auto-detected.
60
+
61
+ Auto-detection is only applied when the user did not pass --resolution.
62
+ """
63
+ if requested_resolution is not None:
64
+ return requested_resolution, False
65
+
66
+ if has_input_images and max_input_dim > 0:
67
+ return auto_detect_resolution(max_input_dim), True
68
+
69
+ return "1K", False
70
+
71
+
72
def main():
    """Command-line entry point: generate or edit an image and save it as PNG.

    Steps, in order:

    1. Parse the CLI flags (``--prompt`` and ``--filename`` are required).
    2. Resolve the API key (``--api-key`` first, then ``GEMINI_API_KEY``).
    3. Create the output file's parent directory if needed.
    4. Load up to 14 optional input images and track their largest dimension.
    5. Choose the output resolution (explicit flag, auto-detected, or "1K").
    6. Send one ``generate_content`` request and write the returned image to
       ``--filename`` as an RGB PNG. If the response carries several image
       parts, each one overwrites the previous, so the last one is kept.
    7. Print the absolute saved path, then the same path as ``MEDIA:<path>``.

    Exits with status 1 when no API key is available, more than 14 input
    images are given, an input image cannot be loaded, the request raises,
    or the response contains no image part.
    """
    parser = argparse.ArgumentParser(
        description="Generate images using Nano Banana Pro (Gemini 3 Pro Image)"
    )
    parser.add_argument(
        "--prompt", "-p",
        required=True,
        help="Image description/prompt"
    )
    parser.add_argument(
        "--filename", "-f",
        required=True,
        help="Output filename (e.g., sunset-mountains.png)"
    )
    parser.add_argument(
        "--input-image", "-i",
        action="append",
        dest="input_images",
        metavar="IMAGE",
        help="Input image path(s) for editing/composition. Can be specified multiple times (up to 14 images)."
    )
    parser.add_argument(
        "--resolution", "-r",
        choices=["1K", "2K", "4K"],
        default=None,
        help="Output resolution: 1K, 2K, or 4K. If omitted with input images, auto-detect from largest image dimension."
    )
    parser.add_argument(
        "--aspect-ratio", "-a",
        choices=SUPPORTED_ASPECT_RATIOS,
        default=None,
        help=f"Output aspect ratio (default: model decides). Options: {', '.join(SUPPORTED_ASPECT_RATIOS)}"
    )
    parser.add_argument(
        "--api-key", "-k",
        help="Gemini API key (overrides GEMINI_API_KEY env var)"
    )

    args = parser.parse_args()

    # Fail fast on a missing key, before paying for the heavy imports below.
    api_key = get_api_key(args.api_key)
    if not api_key:
        print("Error: No API key provided.", file=sys.stderr)
        print("Please either:", file=sys.stderr)
        print("  1. Provide --api-key argument", file=sys.stderr)
        print("  2. Set GEMINI_API_KEY environment variable", file=sys.stderr)
        sys.exit(1)

    # Import here after checking API key to avoid slow import on error.
    import base64
    from io import BytesIO

    from google import genai
    from google.genai import types
    from PIL import Image as PILImage

    client = genai.Client(api_key=api_key)

    # Make sure the destination directory exists before doing any work.
    output_path = Path(args.filename)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Load input images if provided (up to 14 supported by Nano Banana Pro).
    input_images = []
    max_input_dim = 0
    if args.input_images:
        if len(args.input_images) > 14:
            print(f"Error: Too many input images ({len(args.input_images)}). Maximum is 14.", file=sys.stderr)
            sys.exit(1)

        for img_path in args.input_images:
            try:
                # Copy so the pixel data outlives the file handle closed by `with`.
                with PILImage.open(img_path) as img:
                    copied = img.copy()
                    width, height = copied.size
                input_images.append(copied)
                print(f"Loaded input image: {img_path}")

                # Track largest dimension for auto-resolution.
                max_input_dim = max(max_input_dim, width, height)
            except Exception as e:
                print(f"Error loading input image '{img_path}': {e}", file=sys.stderr)
                sys.exit(1)

    output_resolution, auto_detected = choose_output_resolution(
        requested_resolution=args.resolution,
        max_input_dim=max_input_dim,
        has_input_images=bool(input_images),
    )
    if auto_detected:
        print(
            f"Auto-detected resolution: {output_resolution} "
            f"(from max input dimension {max_input_dim})"
        )

    # Build contents (images first if editing, prompt only if generating).
    if input_images:
        contents = [*input_images, args.prompt]
        img_count = len(input_images)
        print(f"Processing {img_count} image{'s' if img_count > 1 else ''} with resolution {output_resolution}...")
    else:
        contents = args.prompt
        print(f"Generating image with resolution {output_resolution}...")

    try:
        # Build image config with optional aspect ratio.
        image_cfg_kwargs = {"image_size": output_resolution}
        if args.aspect_ratio:
            image_cfg_kwargs["aspect_ratio"] = args.aspect_ratio

        response = client.models.generate_content(
            model="gemini-3-pro-image-preview",
            contents=contents,
            config=types.GenerateContentConfig(
                response_modalities=["TEXT", "IMAGE"],
                image_config=types.ImageConfig(**image_cfg_kwargs)
            )
        )

        # Process response and convert to PNG.
        # `response.parts` may be None when the response has no content;
        # treat that as "no parts" so the
        # explicit "No image was generated" error below is reported instead
        # of an opaque "'NoneType' object is not iterable".
        image_saved = False
        for part in response.parts or []:
            if part.text is not None:
                print(f"Model response: {part.text}")
            elif part.inline_data is not None:
                # inline_data.data is normally raw bytes; decode defensively
                # if the SDK hands back a base64 string instead.
                image_data = part.inline_data.data
                if isinstance(image_data, str):
                    image_data = base64.b64decode(image_data)

                image = PILImage.open(BytesIO(image_data))

                # Always write an RGB PNG: flatten RGBA onto a white
                # background, pass RGB through, convert any other mode.
                if image.mode == 'RGBA':
                    rgb_image = PILImage.new('RGB', image.size, (255, 255, 255))
                    rgb_image.paste(image, mask=image.split()[3])
                    rgb_image.save(str(output_path), 'PNG')
                elif image.mode == 'RGB':
                    image.save(str(output_path), 'PNG')
                else:
                    image.convert('RGB').save(str(output_path), 'PNG')
                image_saved = True

        if image_saved:
            full_path = output_path.resolve()
            print(f"\nImage saved: {full_path}")
            # OpenClaw parses MEDIA: tokens and will attach the file on
            # supported chat providers. Emit the canonical MEDIA:<path> form.
            print(f"MEDIA:{full_path}")
        else:
            print("Error: No image was generated in the response.", file=sys.stderr)
            # SystemExit is not an Exception subclass, so the handler below
            # does not intercept this exit.
            sys.exit(1)

    except Exception as e:
        print(f"Error generating image: {e}", file=sys.stderr)
        sys.exit(1)
232
+
233
+
234
+ if __name__ == "__main__":
235
+ main()
@@ -0,0 +1,53 @@
1
+ ---
2
+ name: nano-pdf
3
+ description: Edit or create PDFs with natural-language instructions using the nano-pdf CLI. Use when asked to make a PDF, edit a PDF, add pages, change text in a PDF, or convert content to PDF format.
4
+ metadata:
5
+ {
6
+ "openclaw":
7
+ {
8
+ "emoji": "📄",
9
+ "requires": { "bins": ["nano-pdf"] },
10
+ "install":
11
+ [
12
+ {
13
+ "id": "uv",
14
+ "kind": "uv",
15
+ "package": "nano-pdf",
16
+ "bins": ["nano-pdf"],
17
+ "label": "Install nano-pdf (uv)",
18
+ },
19
+ ],
20
+ },
21
+ }
22
+ ---
23
+
24
+ # nano-pdf
25
+
26
+ Use `nano-pdf` to apply edits to a specific page in a PDF using a natural-language instruction.
27
+
28
+ ## Quick Start
29
+
30
+ ```bash
31
+ nano-pdf edit deck.pdf 1 "Change the title to 'Q3 Results' and fix the typo in the subtitle"
32
+ ```
33
+
34
+ ## Creating a New PDF
35
+
36
+ ```bash
37
+ nano-pdf create output.pdf "Create a one-page summary of quarterly results with a header, bullet points, and a footer"
38
+ ```
39
+
40
+ ## Usage in SwarmClaw
41
+
42
+ When a user asks to create or edit a PDF:
43
+
44
+ 1. Check if `nano-pdf` is installed: `which nano-pdf`
45
+ 2. If not installed, install via `uv tool install nano-pdf` or `pip install nano-pdf`
46
+ 3. Run the appropriate command
47
+ 4. Report the output file path to the user
48
+
49
+ ## Notes
50
+
51
+ - Page numbers are 0-based or 1-based depending on the tool's version; if the result looks off by one, retry with the other.
52
+ - Always sanity-check the output PDF before reporting success.
53
+ - For multi-page edits, run separate commands per page.