@swarmclawai/swarmclaw 1.2.1 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. package/README.md +16 -85
  2. package/bin/server-cmd.js +64 -1
  3. package/package.json +2 -2
  4. package/skills/coding-agent/SKILL.md +111 -0
  5. package/skills/github/SKILL.md +140 -0
  6. package/skills/nano-banana-pro/SKILL.md +62 -0
  7. package/skills/nano-banana-pro/scripts/generate_image.py +235 -0
  8. package/skills/nano-pdf/SKILL.md +53 -0
  9. package/skills/openai-image-gen/SKILL.md +78 -0
  10. package/skills/openai-image-gen/scripts/gen.py +328 -0
  11. package/skills/resourceful-problem-solving/SKILL.md +49 -0
  12. package/skills/skill-creator/SKILL.md +147 -0
  13. package/skills/skill-creator/scripts/init_skill.py +378 -0
  14. package/skills/skill-creator/scripts/quick_validate.py +159 -0
  15. package/skills/summarize/SKILL.md +77 -0
  16. package/src/app/api/auth/route.ts +20 -5
  17. package/src/app/api/chats/[id]/devserver/route.ts +13 -19
  18. package/src/app/api/chats/[id]/messages/route.ts +13 -15
  19. package/src/app/api/chats/[id]/route.ts +9 -10
  20. package/src/app/api/chats/[id]/stop/route.ts +5 -7
  21. package/src/app/api/chats/messages-route.test.ts +8 -6
  22. package/src/app/api/chats/route.ts +9 -10
  23. package/src/app/api/ip/route.ts +2 -2
  24. package/src/app/api/preview-server/route.ts +1 -1
  25. package/src/app/api/projects/[id]/route.ts +7 -46
  26. package/src/cli/server-cmd.test.js +74 -0
  27. package/src/components/chat/chat-area.tsx +45 -23
  28. package/src/components/chat/message-bubble.test.ts +35 -0
  29. package/src/components/chat/message-bubble.tsx +19 -9
  30. package/src/components/chat/message-list.tsx +37 -3
  31. package/src/components/input/chat-input.tsx +34 -14
  32. package/src/components/openclaw/openclaw-deploy-panel.tsx +4 -0
  33. package/src/instrumentation.ts +1 -1
  34. package/src/lib/chat/assistant-render-id.ts +3 -0
  35. package/src/lib/chat/chat-streaming-state.test.ts +42 -3
  36. package/src/lib/chat/chat-streaming-state.ts +20 -8
  37. package/src/lib/chat/queued-message-queue.test.ts +23 -1
  38. package/src/lib/chat/queued-message-queue.ts +11 -2
  39. package/src/lib/providers/cli-utils.test.ts +124 -0
  40. package/src/lib/server/activity/activity-log.ts +21 -0
  41. package/src/lib/server/agents/agent-availability.test.ts +10 -5
  42. package/src/lib/server/agents/agent-cascade.ts +79 -59
  43. package/src/lib/server/agents/agent-registry.ts +3 -1
  44. package/src/lib/server/agents/agent-repository.ts +90 -0
  45. package/src/lib/server/agents/delegation-job-repository.ts +53 -0
  46. package/src/lib/server/agents/delegation-jobs.ts +11 -4
  47. package/src/lib/server/agents/guardian-checkpoint-repository.ts +35 -0
  48. package/src/lib/server/agents/guardian.ts +2 -2
  49. package/src/lib/server/agents/main-agent-loop.ts +10 -3
  50. package/src/lib/server/agents/main-loop-state-repository.ts +38 -0
  51. package/src/lib/server/agents/subagent-runtime.ts +9 -6
  52. package/src/lib/server/agents/subagent-swarm.ts +3 -2
  53. package/src/lib/server/agents/task-session.ts +3 -4
  54. package/src/lib/server/approvals/approval-repository.ts +30 -0
  55. package/src/lib/server/autonomy/supervisor-incident-repository.ts +42 -0
  56. package/src/lib/server/chat-execution/chat-execution-types.ts +38 -0
  57. package/src/lib/server/chat-execution/chat-execution-utils.ts +1 -1
  58. package/src/lib/server/chat-execution/chat-execution.ts +84 -1926
  59. package/src/lib/server/chat-execution/chat-turn-finalization.ts +620 -0
  60. package/src/lib/server/chat-execution/chat-turn-partial-persistence.ts +221 -0
  61. package/src/lib/server/chat-execution/chat-turn-preflight.ts +133 -0
  62. package/src/lib/server/chat-execution/chat-turn-preparation.ts +817 -0
  63. package/src/lib/server/chat-execution/chat-turn-stream-execution.ts +296 -0
  64. package/src/lib/server/chat-execution/chat-turn-tool-routing.ts +5 -5
  65. package/src/lib/server/chat-execution/message-classifier.test.ts +329 -0
  66. package/src/lib/server/chat-execution/post-stream-finalization.ts +1 -1
  67. package/src/lib/server/chat-execution/prompt-builder.ts +11 -0
  68. package/src/lib/server/chat-execution/prompt-sections.ts +5 -6
  69. package/src/lib/server/chat-execution/situational-awareness.ts +12 -7
  70. package/src/lib/server/chat-execution/stream-agent-chat.ts +16 -13
  71. package/src/lib/server/chatrooms/chatroom-repository.ts +32 -0
  72. package/src/lib/server/connectors/connector-repository.ts +58 -0
  73. package/src/lib/server/connectors/runtime-state.test.ts +117 -0
  74. package/src/lib/server/credentials/credential-repository.ts +7 -0
  75. package/src/lib/server/gateways/gateway-profile-repository.ts +4 -0
  76. package/src/lib/server/memory/memory-abstract.test.ts +59 -0
  77. package/src/lib/server/missions/mission-repository.ts +74 -0
  78. package/src/lib/server/missions/mission-service/actions.ts +6 -0
  79. package/src/lib/server/missions/mission-service/bindings.ts +9 -0
  80. package/src/lib/server/missions/mission-service/context.ts +4 -0
  81. package/src/lib/server/missions/mission-service/core.ts +2269 -0
  82. package/src/lib/server/missions/mission-service/queries.ts +12 -0
  83. package/src/lib/server/missions/mission-service/recovery.ts +5 -0
  84. package/src/lib/server/missions/mission-service/ticks.ts +9 -0
  85. package/src/lib/server/missions/mission-service.test.ts +9 -2
  86. package/src/lib/server/missions/mission-service.ts +6 -2266
  87. package/src/lib/server/openclaw/deploy.test.ts +42 -3
  88. package/src/lib/server/openclaw/deploy.ts +26 -12
  89. package/src/lib/server/persistence/repository-utils.ts +154 -0
  90. package/src/lib/server/persistence/storage-context.ts +51 -0
  91. package/src/lib/server/persistence/transaction.ts +1 -0
  92. package/src/lib/server/projects/project-repository.ts +36 -0
  93. package/src/lib/server/projects/project-service.ts +79 -0
  94. package/src/lib/server/protocols/protocol-normalization.test.ts +6 -4
  95. package/src/lib/server/runtime/alert-dispatch.ts +1 -1
  96. package/src/lib/server/runtime/daemon-policy.ts +1 -1
  97. package/src/lib/server/runtime/daemon-state/core.ts +1570 -0
  98. package/src/lib/server/runtime/daemon-state/health.ts +6 -0
  99. package/src/lib/server/runtime/daemon-state/policy.ts +7 -0
  100. package/src/lib/server/runtime/daemon-state/supervisor.ts +6 -0
  101. package/src/lib/server/runtime/daemon-state.test.ts +48 -0
  102. package/src/lib/server/runtime/daemon-state.ts +3 -1470
  103. package/src/lib/server/runtime/estop-repository.ts +4 -0
  104. package/src/lib/server/runtime/estop.ts +3 -1
  105. package/src/lib/server/runtime/heartbeat-service.test.ts +2 -2
  106. package/src/lib/server/runtime/heartbeat-service.ts +55 -34
  107. package/src/lib/server/runtime/heartbeat-wake.ts +6 -4
  108. package/src/lib/server/runtime/idle-window.ts +2 -2
  109. package/src/lib/server/runtime/network.ts +11 -0
  110. package/src/lib/server/runtime/orchestrator-events.ts +2 -2
  111. package/src/lib/server/runtime/queue/claims.ts +4 -0
  112. package/src/lib/server/runtime/queue/core.ts +2079 -0
  113. package/src/lib/server/runtime/queue/execution.ts +7 -0
  114. package/src/lib/server/runtime/queue/followups.ts +4 -0
  115. package/src/lib/server/runtime/queue/queries.ts +12 -0
  116. package/src/lib/server/runtime/queue/recovery.ts +7 -0
  117. package/src/lib/server/runtime/queue-recovery.test.ts +48 -13
  118. package/src/lib/server/runtime/queue-repository.ts +17 -0
  119. package/src/lib/server/runtime/queue.ts +5 -2061
  120. package/src/lib/server/runtime/run-ledger.ts +6 -5
  121. package/src/lib/server/runtime/run-repository.ts +73 -0
  122. package/src/lib/server/runtime/runtime-lock-repository.ts +8 -0
  123. package/src/lib/server/runtime/runtime-settings.ts +1 -1
  124. package/src/lib/server/runtime/runtime-state.ts +99 -0
  125. package/src/lib/server/runtime/scheduler.ts +4 -2
  126. package/src/lib/server/runtime/session-run-manager/cancellation.ts +157 -0
  127. package/src/lib/server/runtime/session-run-manager/drain.ts +246 -0
  128. package/src/lib/server/runtime/session-run-manager/enqueue.ts +287 -0
  129. package/src/lib/server/runtime/session-run-manager/queries.ts +117 -0
  130. package/src/lib/server/runtime/session-run-manager/recovery.ts +238 -0
  131. package/src/lib/server/runtime/session-run-manager/state.ts +441 -0
  132. package/src/lib/server/runtime/session-run-manager/types.ts +74 -0
  133. package/src/lib/server/runtime/session-run-manager.ts +72 -1377
  134. package/src/lib/server/runtime/watch-job-repository.ts +35 -0
  135. package/src/lib/server/runtime/watch-jobs.ts +3 -1
  136. package/src/lib/server/schedules/schedule-repository.ts +42 -0
  137. package/src/lib/server/sessions/session-repository.ts +85 -0
  138. package/src/lib/server/settings/settings-repository.ts +25 -0
  139. package/src/lib/server/skills/skill-discovery.test.ts +2 -2
  140. package/src/lib/server/skills/skill-discovery.ts +2 -2
  141. package/src/lib/server/skills/skill-repository.ts +14 -0
  142. package/src/lib/server/storage.ts +13 -24
  143. package/src/lib/server/tasks/task-repository.ts +54 -0
  144. package/src/lib/server/usage/usage-repository.ts +30 -0
  145. package/src/lib/server/webhooks/webhook-repository.ts +10 -0
  146. package/src/lib/strip-internal-metadata.test.ts +42 -41
  147. package/src/stores/use-chat-store.test.ts +54 -0
  148. package/src/stores/use-chat-store.ts +21 -5
  149. /package/{bundled-skills → skills}/google-workspace/SKILL.md +0 -0
@@ -0,0 +1,235 @@
1
+ #!/usr/bin/env python3
2
+ # /// script
3
+ # requires-python = ">=3.10"
4
+ # dependencies = [
5
+ # "google-genai>=1.0.0",
6
+ # "pillow>=10.0.0",
7
+ # ]
8
+ # ///
9
+ """
10
+ Generate images using Google's Nano Banana Pro (Gemini 3 Pro Image) API.
11
+
12
+ Usage:
13
+ uv run generate_image.py --prompt "your image description" --filename "output.png" [--resolution 1K|2K|4K] [--aspect-ratio RATIO] [--api-key KEY]
14
+
15
+ Multi-image editing (up to 14 images):
16
+ uv run generate_image.py --prompt "combine these images" --filename "output.png" -i img1.png -i img2.png -i img3.png
17
+ """
18
+
19
+ import argparse
20
+ import os
21
+ import sys
22
+ from pathlib import Path
23
+
24
# Width:height ratios offered as the valid values of --aspect-ratio.
SUPPORTED_ASPECT_RATIOS = "1:1 2:3 3:2 3:4 4:3 4:5 5:4 9:16 16:9 21:9".split()
36
+
37
+
38
+ def get_api_key(provided_key: str | None) -> str | None:
39
+ """Get API key from argument first, then environment."""
40
+ if provided_key:
41
+ return provided_key
42
+ return os.environ.get("GEMINI_API_KEY")
43
+
44
+
45
def auto_detect_resolution(max_input_dim: int) -> str:
    """Pick an output resolution label from the largest input image dimension.

    Dimensions of at least 3000 map to "4K", at least 1500 to "2K", and
    anything smaller to "1K".
    """
    # Checked from the highest threshold down so the first match wins.
    for minimum, label in ((3000, "4K"), (1500, "2K")):
        if max_input_dim >= minimum:
            return label
    return "1K"
52
+
53
+
54
+ def choose_output_resolution(
55
+ requested_resolution: str | None,
56
+ max_input_dim: int,
57
+ has_input_images: bool,
58
+ ) -> tuple[str, bool]:
59
+ """Choose final resolution and whether it was auto-detected.
60
+
61
+ Auto-detection is only applied when the user did not pass --resolution.
62
+ """
63
+ if requested_resolution is not None:
64
+ return requested_resolution, False
65
+
66
+ if has_input_images and max_input_dim > 0:
67
+ return auto_detect_resolution(max_input_dim), True
68
+
69
+ return "1K", False
70
+
71
+
72
def main():
    """Command-line entry point: generate or edit an image and save it as PNG.

    Parses the CLI flags, resolves the API key (``--api-key`` first, then the
    ``GEMINI_API_KEY`` environment variable), optionally loads up to 14 input
    images, requests an image from the ``gemini-3-pro-image-preview`` model
    and writes it to ``--filename`` as an RGB PNG. On success the resolved
    output path is printed, followed by a ``MEDIA:<path>`` line.

    Failures detected here (missing key, too many or unreadable input images,
    API errors, a response without an image) are reported on stderr and end
    the process with exit status 1.
    """
    parser = argparse.ArgumentParser(
        description="Generate images using Nano Banana Pro (Gemini 3 Pro Image)"
    )
    parser.add_argument(
        "--prompt", "-p",
        required=True,
        help="Image description/prompt"
    )
    parser.add_argument(
        "--filename", "-f",
        required=True,
        help="Output filename (e.g., sunset-mountains.png)"
    )
    parser.add_argument(
        "--input-image", "-i",
        action="append",
        dest="input_images",
        metavar="IMAGE",
        help="Input image path(s) for editing/composition. Can be specified multiple times (up to 14 images)."
    )
    parser.add_argument(
        "--resolution", "-r",
        choices=["1K", "2K", "4K"],
        default=None,
        help="Output resolution: 1K, 2K, or 4K. If omitted with input images, auto-detect from largest image dimension."
    )
    parser.add_argument(
        "--aspect-ratio", "-a",
        choices=SUPPORTED_ASPECT_RATIOS,
        default=None,
        help=f"Output aspect ratio (default: model decides). Options: {', '.join(SUPPORTED_ASPECT_RATIOS)}"
    )
    parser.add_argument(
        "--api-key", "-k",
        help="Gemini API key (overrides GEMINI_API_KEY env var)"
    )

    args = parser.parse_args()

    # Get API key
    api_key = get_api_key(args.api_key)
    if not api_key:
        print("Error: No API key provided.", file=sys.stderr)
        print("Please either:", file=sys.stderr)
        print(" 1. Provide --api-key argument", file=sys.stderr)
        print(" 2. Set GEMINI_API_KEY environment variable", file=sys.stderr)
        sys.exit(1)

    # Import here after checking API key to avoid slow import on error
    import base64
    from io import BytesIO

    from google import genai
    from google.genai import types
    from PIL import Image as PILImage

    # Initialise client
    client = genai.Client(api_key=api_key)

    # Set up output path
    output_path = Path(args.filename)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Load input images if provided (up to 14 supported by Nano Banana Pro)
    input_images = []
    max_input_dim = 0
    if args.input_images:
        if len(args.input_images) > 14:
            print(f"Error: Too many input images ({len(args.input_images)}). Maximum is 14.", file=sys.stderr)
            sys.exit(1)

        for img_path in args.input_images:
            try:
                # Copy the image so it stays usable after the file handle closes.
                with PILImage.open(img_path) as img:
                    copied = img.copy()
                    width, height = copied.size
                input_images.append(copied)
                print(f"Loaded input image: {img_path}")

                # Track largest dimension for auto-resolution
                max_input_dim = max(max_input_dim, width, height)
            except Exception as e:
                print(f"Error loading input image '{img_path}': {e}", file=sys.stderr)
                sys.exit(1)

    output_resolution, auto_detected = choose_output_resolution(
        requested_resolution=args.resolution,
        max_input_dim=max_input_dim,
        has_input_images=bool(input_images),
    )
    if auto_detected:
        print(
            f"Auto-detected resolution: {output_resolution} "
            f"(from max input dimension {max_input_dim})"
        )

    # Build contents (images first if editing, prompt only if generating)
    if input_images:
        contents = [*input_images, args.prompt]
        img_count = len(input_images)
        print(f"Processing {img_count} image{'s' if img_count > 1 else ''} with resolution {output_resolution}...")
    else:
        contents = args.prompt
        print(f"Generating image with resolution {output_resolution}...")

    try:
        # Build image config with optional aspect ratio
        image_cfg_kwargs = {"image_size": output_resolution}
        if args.aspect_ratio:
            image_cfg_kwargs["aspect_ratio"] = args.aspect_ratio

        response = client.models.generate_content(
            model="gemini-3-pro-image-preview",
            contents=contents,
            config=types.GenerateContentConfig(
                response_modalities=["TEXT", "IMAGE"],
                image_config=types.ImageConfig(**image_cfg_kwargs)
            )
        )

        # Process response and convert to PNG.
        # `response.parts` may be None (not an empty list) when the response
        # carries no content; fall back to an empty list so the explicit
        # "no image" error below is reported instead of a TypeError.
        image_saved = False
        for part in response.parts or []:
            if part.text is not None:
                print(f"Model response: {part.text}")
            elif part.inline_data is not None:
                # inline_data.data is normally raw bytes; decode only if the
                # SDK handed back a (presumably base64) string instead.
                image_data = part.inline_data.data
                if isinstance(image_data, str):
                    image_data = base64.b64decode(image_data)

                image = PILImage.open(BytesIO(image_data))

                # Always write an RGB PNG: flatten RGBA onto a white
                # background, convert any other mode directly.
                if image.mode == 'RGBA':
                    rgb_image = PILImage.new('RGB', image.size, (255, 255, 255))
                    rgb_image.paste(image, mask=image.split()[3])
                    rgb_image.save(str(output_path), 'PNG')
                elif image.mode == 'RGB':
                    image.save(str(output_path), 'PNG')
                else:
                    image.convert('RGB').save(str(output_path), 'PNG')
                image_saved = True

        if image_saved:
            full_path = output_path.resolve()
            print(f"\nImage saved: {full_path}")
            # OpenClaw parses MEDIA: tokens and will attach the file on
            # supported chat providers. Emit the canonical MEDIA:<path> form.
            print(f"MEDIA:{full_path}")
        else:
            print("Error: No image was generated in the response.", file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        # sys.exit() above raises SystemExit, which is not an Exception
        # subclass, so it is not swallowed here.
        print(f"Error generating image: {e}", file=sys.stderr)
        sys.exit(1)
232
+
233
+
234
+ if __name__ == "__main__":
235
+ main()
@@ -0,0 +1,53 @@
1
+ ---
2
+ name: nano-pdf
3
+ description: Edit or create PDFs with natural-language instructions using the nano-pdf CLI. Use when asked to make a PDF, edit a PDF, add pages, change text in a PDF, or convert content to PDF format.
4
+ metadata:
5
+ {
6
+ "openclaw":
7
+ {
8
+ "emoji": "📄",
9
+ "requires": { "bins": ["nano-pdf"] },
10
+ "install":
11
+ [
12
+ {
13
+ "id": "uv",
14
+ "kind": "uv",
15
+ "package": "nano-pdf",
16
+ "bins": ["nano-pdf"],
17
+ "label": "Install nano-pdf (uv)",
18
+ },
19
+ ],
20
+ },
21
+ }
22
+ ---
23
+
24
+ # nano-pdf
25
+
26
+ Use `nano-pdf` to apply edits to a specific page in a PDF using a natural-language instruction.
27
+
28
+ ## Quick Start
29
+
30
+ ```bash
31
+ nano-pdf edit deck.pdf 1 "Change the title to 'Q3 Results' and fix the typo in the subtitle"
32
+ ```
33
+
34
+ ## Creating a New PDF
35
+
36
+ ```bash
37
+ nano-pdf create output.pdf "Create a one-page summary of quarterly results with a header, bullet points, and a footer"
38
+ ```
39
+
40
+ ## Usage in SwarmClaw
41
+
42
+ When a user asks to create or edit a PDF:
43
+
44
+ 1. Check if `nano-pdf` is installed: `which nano-pdf`
45
+ 2. If not installed, install via `uv tool install nano-pdf` or `pip install nano-pdf`
46
+ 3. Run the appropriate command
47
+ 4. Report the output file path to the user
48
+
49
+ ## Notes
50
+
51
+ - Page numbers are 0-based or 1-based depending on the tool's version; if the result looks off by one, retry with the other.
52
+ - Always sanity-check the output PDF before reporting success.
53
+ - For multi-page edits, run separate commands per page.
@@ -0,0 +1,78 @@
1
+ ---
2
+ name: openai-image-gen
3
+ description: Generate images via OpenAI Images API (GPT Image, DALL-E 3, DALL-E 2). Supports batch generation with random prompt sampler and HTML gallery output. Use when asked to generate images with OpenAI and an OPENAI_API_KEY is available.
4
+ metadata:
5
+ {
6
+ "openclaw":
7
+ {
8
+ "emoji": "🎨",
9
+ "requires": { "bins": ["python3"], "env": ["OPENAI_API_KEY"] },
10
+ "primaryEnv": "OPENAI_API_KEY",
11
+ "install":
12
+ [
13
+ {
14
+ "id": "python-brew",
15
+ "kind": "brew",
16
+ "formula": "python",
17
+ "bins": ["python3"],
18
+ "label": "Install Python (brew)",
19
+ },
20
+ ],
21
+ },
22
+ }
23
+ ---
24
+
25
+ # OpenAI Image Gen
26
+
27
+ Generate images via the OpenAI Images API with an HTML gallery viewer.
28
+
29
+ ## Run
30
+
31
+ Note: Image generation can take longer than typical timeouts. Set a higher timeout when running via shell (e.g., 300 seconds).
32
+
33
+ ```bash
34
+ python3 {baseDir}/scripts/gen.py
35
+ ```
36
+
37
+ ## Useful Flags
38
+
39
+ ```bash
40
+ # GPT image models with various options
41
+ python3 {baseDir}/scripts/gen.py --count 16 --model gpt-image-1
42
+ python3 {baseDir}/scripts/gen.py --prompt "ultra-detailed studio photo of a lobster astronaut" --count 4
43
+ python3 {baseDir}/scripts/gen.py --size 1536x1024 --quality high --out-dir ./out/images
44
+ python3 {baseDir}/scripts/gen.py --model gpt-image-1.5 --background transparent --output-format webp
45
+
46
+ # DALL-E 3 (note: count is automatically limited to 1)
47
+ python3 {baseDir}/scripts/gen.py --model dall-e-3 --quality hd --size 1792x1024 --style vivid
48
+ python3 {baseDir}/scripts/gen.py --model dall-e-3 --style natural --prompt "serene mountain landscape"
49
+
50
+ # DALL-E 2
51
+ python3 {baseDir}/scripts/gen.py --model dall-e-2 --size 512x512 --count 4
52
+ ```
53
+
54
+ ## Model-Specific Parameters
55
+
56
+ ### Size
57
+
58
+ - **GPT image models** (`gpt-image-1`, `gpt-image-1-mini`, `gpt-image-1.5`): `1024x1024`, `1536x1024` (landscape), `1024x1536` (portrait), or `auto`. Default: `1024x1024`
59
+ - **dall-e-3**: `1024x1024`, `1792x1024`, or `1024x1792`. Default: `1024x1024`
60
+ - **dall-e-2**: `256x256`, `512x512`, or `1024x1024`. Default: `1024x1024`
61
+
62
+ ### Quality
63
+
64
+ - **GPT image models**: `auto`, `high`, `medium`, or `low`. Default: `high`
65
+ - **dall-e-3**: `hd` or `standard`. Default: `standard`
66
+ - **dall-e-2**: `standard` only
67
+
68
+ ### Other Parameters
69
+
70
+ - **GPT image models** support `--background` (`transparent`, `opaque`, `auto`) and `--output-format` (`png`, `jpeg`, `webp`)
71
+ - **dall-e-3** supports `--style` (`vivid` for hyper-real, `natural` for more natural looking)
72
+ - **dall-e-3** only supports `n=1`; the script automatically limits count to 1
73
+
74
+ ## Output
75
+
76
+ - Image files (`*.png`, `*.jpeg`, or `*.webp` depending on model and format)
77
+ - `prompts.json` (prompt-to-file mapping)
78
+ - `index.html` (thumbnail gallery — open in browser to review)
@@ -0,0 +1,328 @@
1
+ #!/usr/bin/env python3
2
+ import argparse
3
+ import base64
4
+ import datetime as dt
5
+ import json
6
+ import os
7
+ import random
8
+ import re
9
+ import sys
10
+ import urllib.error
11
+ import urllib.request
12
+ from collections.abc import Callable
13
+ from html import escape as html_escape
14
+ from pathlib import Path
15
+
16
+
17
def slugify(text: str) -> str:
    """Turn arbitrary text into a lowercase, dash-separated ASCII slug.

    Every run of characters outside ``a-z0-9`` becomes a single dash and
    leading/trailing dashes are removed. Returns "image" when nothing usable
    remains.
    """
    lowered = text.lower().strip()
    dashed = re.sub(r"[^a-z0-9]+", "-", lowered)
    collapsed = re.sub(r"-{2,}", "-", dashed)
    return collapsed.strip("-") or "image"
22
+
23
+
24
def default_out_dir() -> Path:
    """Return a timestamped default output directory path.

    The parent is ``~/Projects/tmp`` when that directory already exists,
    otherwise ``./tmp``. The parent is created if needed; the returned
    ``openai-image-gen-<timestamp>`` directory itself is not created here.
    """
    stamp = dt.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    projects_tmp = Path.home() / "Projects" / "tmp"
    if projects_tmp.is_dir():
        root = projects_tmp
    else:
        root = Path("./tmp")
    root.mkdir(parents=True, exist_ok=True)
    return root / f"openai-image-gen-{stamp}"
30
+
31
+
32
def pick_prompts(count: int) -> list[str]:
    """Build ``count`` random prompts of the form "<style> of <subject>, <lighting>".

    Each component is drawn independently with ``random.choice`` from a
    fixed pool; a non-positive ``count`` yields an empty list.
    """
    subject_pool = [
        "a lobster astronaut",
        "a brutalist lighthouse",
        "a cozy reading nook",
        "a cyberpunk noodle shop",
        "a Vienna street at dusk",
        "a minimalist product photo",
        "a surreal underwater library",
    ]
    style_pool = [
        "ultra-detailed studio photo",
        "35mm film still",
        "isometric illustration",
        "editorial photography",
        "soft watercolor",
        "architectural render",
        "high-contrast monochrome",
    ]
    lighting_pool = [
        "golden hour",
        "overcast soft light",
        "neon lighting",
        "dramatic rim light",
        "candlelight",
        "foggy atmosphere",
    ]
    # The draw order (style, subject, lighting) is kept stable so a seeded
    # random generator produces the same sequence of prompts.
    return [
        f"{random.choice(style_pool)} of {random.choice(subject_pool)}, {random.choice(lighting_pool)}"
        for _ in range(count)
    ]
65
+
66
+
67
def get_model_defaults(model: str) -> tuple[str, str]:
    """Return ``(default_size, default_quality)`` for ``model``.

    Both DALL-E models default to "standard" quality (for dall-e-2 the
    quality value ends up being ignored); GPT image models and any other
    model id default to "high". The default size is always "1024x1024".
    """
    default_quality = "standard" if model in ("dall-e-2", "dall-e-3") else "high"
    return ("1024x1024", default_quality)
77
+
78
+
79
+ def normalize_optional_flag(
80
+ *,
81
+ model: str,
82
+ raw_value: str,
83
+ flag_name: str,
84
+ supported: Callable[[str], bool],
85
+ allowed: set[str],
86
+ allowed_text: str,
87
+ unsupported_message: str,
88
+ aliases: dict[str, str] | None = None,
89
+ ) -> str:
90
+ """Normalize a string flag, warn when unsupported, and reject invalid values."""
91
+ value = raw_value.strip().lower()
92
+ if not value:
93
+ return ""
94
+
95
+ if not supported(model):
96
+ print(unsupported_message.format(model=model), file=sys.stderr)
97
+ return ""
98
+
99
+ if aliases:
100
+ value = aliases.get(value, value)
101
+
102
+ if value not in allowed:
103
+ raise ValueError(
104
+ f"Invalid --{flag_name} '{raw_value}'. Allowed values: {allowed_text}."
105
+ )
106
+ return value
107
+
108
+
109
def normalize_background(model: str, background: str) -> str:
    """Validate and normalize ``--background``, which applies to gpt-image models only.

    Returns the normalized value, or "" when the flag is empty or the model
    does not support it (a warning is printed in the latter case). Raises
    ``ValueError`` for a value outside transparent/opaque/auto.
    """

    def is_gpt_image(candidate: str) -> bool:
        return candidate.startswith("gpt-image")

    warning = (
        "Warning: --background is only supported for gpt-image models; "
        "ignoring for '{model}'."
    )
    return normalize_optional_flag(
        model=model,
        raw_value=background,
        flag_name="background",
        supported=is_gpt_image,
        allowed={"transparent", "opaque", "auto"},
        allowed_text="transparent, opaque, auto",
        unsupported_message=warning,
    )
123
+
124
+
125
def normalize_style(model: str, style: str) -> str:
    """Validate and normalize ``--style``, which applies to dall-e-3 only.

    Returns the normalized value, or "" when the flag is empty or the model
    is not dall-e-3 (a warning is printed in the latter case). Raises
    ``ValueError`` for a value other than vivid/natural.
    """

    def is_dalle3(candidate: str) -> bool:
        return candidate == "dall-e-3"

    warning = "Warning: --style is only supported for dall-e-3; ignoring for '{model}'."
    return normalize_optional_flag(
        model=model,
        raw_value=style,
        flag_name="style",
        supported=is_dalle3,
        allowed={"vivid", "natural"},
        allowed_text="vivid, natural",
        unsupported_message=warning,
    )
138
+
139
+
140
def normalize_output_format(model: str, output_format: str) -> str:
    """Validate and normalize ``--output-format``, which applies to gpt-image models only.

    "jpg" is accepted as an alias for "jpeg". Returns the normalized value,
    or "" when the flag is empty or the model does not support it (a warning
    is printed in the latter case). Raises ``ValueError`` for a value outside
    png/jpeg/webp.
    """

    def is_gpt_image(candidate: str) -> bool:
        return candidate.startswith("gpt-image")

    warning = (
        "Warning: --output-format is only supported for gpt-image models; "
        "ignoring for '{model}'."
    )
    return normalize_optional_flag(
        model=model,
        raw_value=output_format,
        flag_name="output-format",
        supported=is_gpt_image,
        allowed={"png", "jpeg", "webp"},
        allowed_text="png, jpeg, webp",
        unsupported_message=warning,
        aliases={"jpg": "jpeg"},
    )
155
+
156
+
157
def request_images(
    api_key: str,
    prompt: str,
    model: str,
    size: str,
    quality: str,
    background: str = "",
    output_format: str = "",
    style: str = "",
) -> dict:
    """POST one image-generation request to the OpenAI Images API.

    Always asks for a single image (``n=1``). ``quality`` is omitted for
    dall-e-2; ``background`` and ``output_format`` are sent only for
    gpt-image models and only when non-empty; ``style`` is sent only for
    dall-e-3 when non-empty.

    Returns:
        The decoded JSON response body.

    Raises:
        RuntimeError: the API answered with an HTTP error status; the
            message carries the status code and response body.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "size": size,
        "n": 1,
    }

    # dall-e-2 doesn't accept a quality parameter.
    if model != "dall-e-2":
        payload["quality"] = quality

    # Note: response_format is intentionally not sent (no longer supported
    # by the OpenAI Images API); dall-e models return URLs by default.

    if model.startswith("gpt-image"):
        gpt_only = {"background": background, "output_format": output_format}
        payload.update({key: value for key, value in gpt_only.items() if value})

    if style and model == "dall-e-3":
        payload["style"] = style

    request = urllib.request.Request(
        "https://api.openai.com/v1/images/generations",
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        method="POST",
    )
    try:
        # Generous timeout: image generation can take a long time.
        with urllib.request.urlopen(request, timeout=300) as response:
            raw = response.read()
    except urllib.error.HTTPError as err:
        detail = err.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"OpenAI Images API failed ({err.code}): {detail}") from err
    return json.loads(raw.decode("utf-8"))
207
+
208
+
209
def write_gallery(out_dir: Path, items: list[dict]) -> None:
    """Write ``index.html`` in ``out_dir``: a thumbnail gallery of ``items``.

    Each item is a dict with a "file" key (image filename, used as link and
    image source) and a "prompt" key (used as caption). All three values are
    HTML-escaped before being embedded in the page.
    """

    def render_figure(entry: dict) -> str:
        src = html_escape(entry["file"], quote=True)
        caption = html_escape(entry["prompt"])
        return f"""
<figure>
<a href="{src}"><img src="{src}" loading="lazy" /></a>
<figcaption>{caption}</figcaption>
</figure>
""".strip()

    thumbs = "\n".join(render_figure(entry) for entry in items)
    shown_dir = html_escape(out_dir.as_posix())
    page = f"""<!doctype html>
<meta charset="utf-8" />
<title>openai-image-gen</title>
<style>
:root {{ color-scheme: dark; }}
body {{ margin: 24px; font: 14px/1.4 ui-sans-serif, system-ui; background: #0b0f14; color: #e8edf2; }}
h1 {{ font-size: 18px; margin: 0 0 16px; }}
.grid {{ display: grid; grid-template-columns: repeat(auto-fill, minmax(240px, 1fr)); gap: 16px; }}
figure {{ margin: 0; padding: 12px; border: 1px solid #1e2a36; border-radius: 14px; background: #0f1620; }}
img {{ width: 100%; height: auto; border-radius: 10px; display: block; }}
figcaption {{ margin-top: 10px; color: #b7c2cc; }}
code {{ color: #9cd1ff; }}
</style>
<h1>openai-image-gen</h1>
<p>Output: <code>{shown_dir}</code></p>
<div class="grid">
{thumbs}
</div>
"""
    (out_dir / "index.html").write_text(page, encoding="utf-8")
241
+
242
+
243
def main() -> int:
    """Command-line entry point: generate a batch of images and an HTML gallery.

    Reads ``OPENAI_API_KEY`` from the environment, applies model-specific
    defaults for size and quality, generates one image per prompt (the given
    ``--prompt`` repeated ``--count`` times, or random prompts), saves each
    image into the output directory and finally writes ``prompts.json`` and
    ``index.html`` there.

    Returns:
        0 on success; 2 when the API key is missing or a flag value is invalid.

    Raises:
        RuntimeError: an API call failed, a response contained no image, or
            an image URL could not be downloaded.
    """
    ap = argparse.ArgumentParser(description="Generate images via OpenAI Images API.")
    ap.add_argument("--prompt", help="Single prompt. If omitted, random prompts are generated.")
    ap.add_argument("--count", type=int, default=8, help="How many images to generate.")
    ap.add_argument("--model", default="gpt-image-1", help="Image model id.")
    ap.add_argument("--size", default="", help="Image size (e.g. 1024x1024, 1536x1024). Defaults based on model if not specified.")
    ap.add_argument("--quality", default="", help="Image quality (e.g. high, standard). Defaults based on model if not specified.")
    ap.add_argument("--background", default="", help="Background transparency (GPT models only): transparent, opaque, or auto.")
    ap.add_argument("--output-format", default="", help="Output format (GPT models only): png, jpeg, or webp.")
    ap.add_argument("--style", default="", help="Image style (dall-e-3 only): vivid or natural.")
    ap.add_argument("--out-dir", default="", help="Output directory (default: ./tmp/openai-image-gen-<ts>).")
    args = ap.parse_args()

    api_key = (os.environ.get("OPENAI_API_KEY") or "").strip()
    if not api_key:
        print("Missing OPENAI_API_KEY", file=sys.stderr)
        return 2

    # Apply model-specific defaults if not specified
    default_size, default_quality = get_model_defaults(args.model)
    size = args.size or default_size
    quality = args.quality or default_quality

    count = args.count
    if args.model == "dall-e-3" and count > 1:
        print(f"Warning: dall-e-3 only supports generating 1 image at a time. Reducing count from {count} to 1.", file=sys.stderr)
        count = 1

    out_dir = Path(args.out_dir).expanduser() if args.out_dir else default_out_dir()
    out_dir.mkdir(parents=True, exist_ok=True)

    prompts = [args.prompt] * count if args.prompt else pick_prompts(count)

    try:
        normalized_background = normalize_background(args.model, args.background)
        normalized_style = normalize_style(args.model, args.style)
        normalized_output_format = normalize_output_format(args.model, args.output_format)
    except ValueError as e:
        print(str(e), file=sys.stderr)
        return 2

    # Determine file extension based on output format
    if args.model.startswith("gpt-image") and normalized_output_format:
        file_ext = normalized_output_format
    else:
        file_ext = "png"

    items: list[dict] = []
    for idx, prompt in enumerate(prompts, start=1):
        print(f"[{idx}/{len(prompts)}] {prompt}")
        res = request_images(
            api_key,
            prompt,
            args.model,
            size,
            quality,
            normalized_background,
            normalized_output_format,
            normalized_style,
        )
        # "data" may be missing, null, or an empty list; treat all of these
        # as "no image" so the RuntimeError below is raised instead of an
        # IndexError/TypeError from indexing.
        data = (res.get("data") or [{}])[0] or {}
        image_b64 = data.get("b64_json")
        image_url = data.get("url")
        if not image_b64 and not image_url:
            raise RuntimeError(f"Unexpected response: {json.dumps(res)[:400]}")

        filename = f"{idx:03d}-{slugify(prompt)[:40]}.{file_ext}"
        filepath = out_dir / filename
        if image_b64:
            filepath.write_bytes(base64.b64decode(image_b64))
        else:
            try:
                urllib.request.urlretrieve(image_url, filepath)
            except urllib.error.URLError as e:
                raise RuntimeError(f"Failed to download image from {image_url}: {e}") from e

        items.append({"prompt": prompt, "file": filename})

    (out_dir / "prompts.json").write_text(json.dumps(items, indent=2), encoding="utf-8")
    write_gallery(out_dir, items)
    print(f"\nWrote: {(out_dir / 'index.html').as_posix()}")
    return 0
325
+
326
+
327
+ if __name__ == "__main__":
328
+ raise SystemExit(main())