openclacky 1.2.18 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +21 -0
- data/lib/clacky/agent/time_machine.rb +256 -74
- data/lib/clacky/agent/tool_executor.rb +12 -0
- data/lib/clacky/agent.rb +15 -20
- data/lib/clacky/agent_config.rb +18 -0
- data/lib/clacky/cli.rb +55 -3
- data/lib/clacky/default_skills/media-gen/SKILL.md +172 -5
- data/lib/clacky/media/base.rb +93 -0
- data/lib/clacky/media/gemini.rb +10 -0
- data/lib/clacky/media/generator.rb +57 -0
- data/lib/clacky/media/openai_compat.rb +160 -0
- data/lib/clacky/message_history.rb +12 -7
- data/lib/clacky/providers.rb +28 -0
- data/lib/clacky/rich_ui_controller.rb +3 -1
- data/lib/clacky/server/backup_manager.rb +200 -0
- data/lib/clacky/server/channel/adapters/feishu/adapter.rb +10 -2
- data/lib/clacky/server/channel/adapters/feishu/bot.rb +68 -15
- data/lib/clacky/server/channel/channel_manager.rb +65 -50
- data/lib/clacky/server/http_server.rb +345 -14
- data/lib/clacky/server/scheduler.rb +19 -0
- data/lib/clacky/server/session_registry.rb +8 -4
- data/lib/clacky/session_manager.rb +40 -2
- data/lib/clacky/tools/trash_manager.rb +14 -0
- data/lib/clacky/ui2/components/command_suggestions.rb +1 -0
- data/lib/clacky/ui2/components/modal_component.rb +34 -7
- data/lib/clacky/ui2/ui_controller.rb +150 -19
- data/lib/clacky/utils/file_processor.rb +75 -4
- data/lib/clacky/version.rb +1 -1
- data/lib/clacky/web/app.css +2038 -1147
- data/lib/clacky/web/app.js +22 -1
- data/lib/clacky/web/backup.js +119 -0
- data/lib/clacky/web/billing.js +94 -7
- data/lib/clacky/web/channels.js +81 -11
- data/lib/clacky/web/design-sample.css +247 -0
- data/lib/clacky/web/design-sample.html +127 -0
- data/lib/clacky/web/favicon.svg +16 -0
- data/lib/clacky/web/i18n.js +159 -31
- data/lib/clacky/web/index.html +175 -55
- data/lib/clacky/web/logo_nav_dark.png +0 -0
- data/lib/clacky/web/onboard.js +114 -28
- data/lib/clacky/web/sessions.js +436 -192
- data/lib/clacky/web/settings.js +21 -1
- data/lib/clacky/web/skills.js +1 -1
- data/lib/clacky/web/tasks.js +129 -61
- data/lib/clacky/web/utils.js +72 -0
- data/lib/clacky/web/ws-dispatcher.js +6 -0
- data/lib/clacky.rb +1 -0
- metadata +7 -3
- data/lib/clacky/server/channel/group_message_buffer.rb +0 -53
|
data/lib/clacky/default_skills/media-gen/SKILL.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: media-gen
|
|
3
|
-
description: 'Generate images
|
|
3
|
+
description: 'Generate images, videos, or audio (text-to-speech) in the current task. Use whenever the user asks to create/generate/produce a picture / image / illustration / cover / poster / icon / artwork, a video / clip / animation, or speech / voiceover / narration / TTS — e.g. 生成图片, 画一张, 做封面, 配图, generate image, make a picture, draw, design a cover, 生成视频, 做个视频, text-to-video, 朗读, 配音, 旁白, 文字转语音, generate speech, voiceover. Also use when a document (slides, poster, README hero) needs an inline image.'
|
|
4
4
|
disable-model-invocation: false
|
|
5
5
|
user-invocable: true
|
|
6
6
|
always-show: true
|
|
@@ -27,13 +27,29 @@ curl -s http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/types
|
|
|
27
27
|
|
|
28
28
|
If the response shows `image.configured = false`, stop and tell the user:
|
|
29
29
|
|
|
30
|
-
>
|
|
30
|
+
> 还没有配置生图模型。请打开设置页 → 添加模型 → 类型选 `image`(走 openclacky 官方网关时推荐 `or-gemini-3-pro-image` 或 `or-gpt-image-2`)。配好后再让我生图。
|
|
31
31
|
|
|
32
32
|
Do NOT try to fall back to `terminal` + a hand-written `curl https://api.openai.com/...` — that bypasses the user's configured backend and won't be billed correctly.
|
|
33
33
|
|
|
34
|
+
**You do NOT configure models — the user does, in the settings page.** Never
|
|
35
|
+
edit the user's `config.yml` to add or change a model, and never invent a model
|
|
36
|
+
name from memory (e.g. `or-gpt-5.4-image-2` does not exist). The real, current
|
|
37
|
+
model is whatever `/api/media/types` reports under `image.model`. If you think a
|
|
38
|
+
different model is needed, tell the user which one to set in the settings page —
|
|
39
|
+
don't touch the config file yourself.
|
|
40
|
+
|
|
34
41
|
## Step 2 — Generate the image
|
|
35
42
|
|
|
36
|
-
###
|
|
43
|
+
### The model does NOT honor exact pixel sizes
|
|
44
|
+
|
|
45
|
+
There is no `size` / `width` / `height` field — the only shape control is
|
|
46
|
+
`aspect_ratio` (`landscape` / `square` / `portrait`), and even that is just a
|
|
47
|
+
rough hint (ask for `576x96` and you may get `1408x768`). When the user needs an
|
|
48
|
+
**exact pixel size, a grid, an icon at NxN, or a spritesheet**, generate first at
|
|
49
|
+
whatever size the model gives, then resize / crop / tile to the exact pixels with
|
|
50
|
+
ImageMagick (`magick`). Verify with `magick identify` before reporting done.
|
|
51
|
+
|
|
52
|
+
### Important: generation speed & concurrency
|
|
37
53
|
|
|
38
54
|
- **Image generation can be slow — up to 2 minutes per image depending on the model.** Before calling the API, warn the user that it may take a minute or two. The curl request blocks until the image is ready; do NOT run it in the background.
|
|
39
55
|
- **One at a time only.** Never generate multiple images concurrently (e.g. by running several `curl` commands simultaneously or in a script loop). Each call consumes significant server-side resources, and parallel requests will almost certainly cause timeouts. If the user wants several images, generate them **sequentially**, one after another.
|
|
@@ -47,6 +63,10 @@ curl -s -X POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/ima
|
|
|
47
63
|
}'
|
|
48
64
|
```
|
|
49
65
|
|
|
66
|
+
- The terminal refuses to run multi-line commands — write the request into a `.sh` file and run that file; don't paste a multi-line `curl`.
|
|
67
|
+
- If a call fails with `400 / INVALID_ARGUMENT`, drop the `aspect_ratio` field and retry once before reporting the error.
|
|
68
|
+
- If a call fails with `unknown image model` (400), the configured model name isn't recognized by its backend — tell the user to fix the model name in the settings page; do NOT guess another name and retry.
|
|
69
|
+
|
|
50
70
|
### Request fields
|
|
51
71
|
|
|
52
72
|
| Field | Required | Values | Notes |
|
|
@@ -129,6 +149,153 @@ When the user gives a vague request like "给我配张图", ask one clarifying q
|
|
|
129
149
|
- The user wants a **diagram / chart** with specific data — use a charting library (matplotlib, mermaid, etc.) instead; image gen is for illustrations, not data viz
|
|
130
150
|
- The user asks for **screenshots** of real software — use the browser tool
|
|
131
151
|
|
|
132
|
-
##
|
|
152
|
+
## Generating video (Veo)
|
|
153
|
+
|
|
154
|
+
The same `/api/media/` namespace serves video generation. The user must
|
|
155
|
+
configure a `type=video` model in settings (recommended: `or-veo-3-1`).
|
|
156
|
+
|
|
157
|
+
### Endpoint
|
|
158
|
+
|
|
159
|
+
```
|
|
160
|
+
POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/video
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
Check `GET /api/media/types` first — if `video.configured = false`, tell the
|
|
164
|
+
user to add a `type=video` model in settings before generating.
|
|
165
|
+
|
|
166
|
+
### Video is slow and expensive
|
|
167
|
+
|
|
168
|
+
- **A single clip can take 1–3 minutes (sometimes longer).** Warn the user
|
|
169
|
+
before calling, and run the curl in the foreground — it blocks until the
|
|
170
|
+
MP4 is ready. Do NOT background it.
|
|
171
|
+
- **One at a time.** Never run multiple video generations concurrently.
|
|
172
|
+
- Each clip costs real money (billed per output-second). Confirm the prompt
|
|
173
|
+
with the user before generating.
|
|
174
|
+
|
|
175
|
+
### Request
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
curl -s -X POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/video \
|
|
179
|
+
-H "Content-Type: application/json" \
|
|
180
|
+
-d '{
|
|
181
|
+
"prompt": "A cinematic drone shot flying over a misty mountain range at sunrise, golden light, 4K.",
|
|
182
|
+
"aspect_ratio": "landscape",
|
|
183
|
+
"duration_seconds": 8
|
|
184
|
+
}'
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
| Field | Required | Values | Notes |
|
|
188
|
+
|--------------------|----------|---------------------------------|-------|
|
|
189
|
+
| `prompt` | yes | string | Same prompt-craft tips as images apply. |
|
|
190
|
+
| `aspect_ratio` | no | `landscape` / `portrait` | Defaults to `landscape` (16:9). |
|
|
191
|
+
| `duration_seconds` | no | 4–8 | Defaults to 8. |
|
|
192
|
+
| `image` | no | `{ "b64_json": "...", "mime_type": "image/png" }` | Optional first frame for image-to-video. |
|
|
193
|
+
| `output_dir` | no | absolute path | MP4 saved under `<output_dir>/assets/generated/`. |
|
|
194
|
+
|
|
195
|
+
### Response (success)
|
|
196
|
+
|
|
197
|
+
```json
|
|
198
|
+
{
|
|
199
|
+
"success": true,
|
|
200
|
+
"video": "/abs/path/to/working_dir/assets/generated/vid_20260615_011820_a1b2c3d4.mp4",
|
|
201
|
+
"model": "or-veo-3-1",
|
|
202
|
+
"provider": "openclacky",
|
|
203
|
+
"prompt": "A cinematic drone shot ...",
|
|
204
|
+
"aspect_ratio": "landscape",
|
|
205
|
+
"duration_seconds": 8,
|
|
206
|
+
"cost_usd": 2.688
|
|
207
|
+
}
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
The `video` field is an absolute path on disk. Show it to the user with a
|
|
211
|
+
markdown link or an HTML5 `<video>` tag pointing at the `file://` path; embed
|
|
212
|
+
it in documents with a relative path under `./assets/generated/`.
|
|
213
|
+
|
|
214
|
+
### Response (failure)
|
|
215
|
+
|
|
216
|
+
Same shape and `error_type` values as image generation, but with `"video": null`.
|
|
217
|
+
`not_configured` means no `type=video` model is set up.
|
|
218
|
+
|
|
219
|
+
## Generating speech (Gemini TTS)
|
|
220
|
+
|
|
221
|
+
The same `/api/media/` namespace serves text-to-speech. The user must
|
|
222
|
+
configure a `type=audio` model in settings (recommended:
|
|
223
|
+
`or-tts-gemini-2-5-flash`, the cheap+fast default).
|
|
224
|
+
|
|
225
|
+
### Endpoint
|
|
226
|
+
|
|
227
|
+
```
|
|
228
|
+
POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/audio/speech
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
Check `GET /api/media/types` first — if `audio.configured = false`, tell the
|
|
232
|
+
user to add a `type=audio` model in settings before generating.
|
|
233
|
+
|
|
234
|
+
### Request
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
curl -s -X POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/audio/speech \
|
|
238
|
+
-H "Content-Type: application/json" \
|
|
239
|
+
-d '{
|
|
240
|
+
"input": "Hello and welcome to openclacky. Today we will explore...",
|
|
241
|
+
"voice": "Kore"
|
|
242
|
+
}'
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
| Field | Required | Values | Notes |
|
|
246
|
+
|--------------|----------|---------------------------------|-------|
|
|
247
|
+
| `input` | yes | string | The text to speak. Plain prose works best; you can prefix with style cues like "Say cheerfully:" or "In a calm tone:". |
|
|
248
|
+
| `voice` | no | string voice name | Defaults to `Kore`. Common Gemini voices: `Kore`, `Puck`, `Charon`, `Fenrir`, `Aoede`. |
|
|
249
|
+
| `output_dir` | no | absolute path | WAV saved under `<output_dir>/assets/generated/`. |
|
|
250
|
+
|
|
251
|
+
Generation typically takes 2–10 seconds depending on length. The request
|
|
252
|
+
blocks until the WAV is ready.
|
|
253
|
+
|
|
254
|
+
### Response (success)
|
|
255
|
+
|
|
256
|
+
```json
|
|
257
|
+
{
|
|
258
|
+
"success": true,
|
|
259
|
+
"audio": "/abs/path/to/working_dir/assets/generated/tts_20260615_233522_4ff02705.wav",
|
|
260
|
+
"model": "or-tts-gemini-2-5-flash",
|
|
261
|
+
"provider": "openclacky",
|
|
262
|
+
"input": "Hello and welcome to openclacky...",
|
|
263
|
+
"voice": "Kore",
|
|
264
|
+
"mime_type": "audio/wav",
|
|
265
|
+
"usage": { "prompt_tokens": 13, "completion_tokens": 122, "total_tokens": 135 },
|
|
266
|
+
"cost_usd": 0.000259
|
|
267
|
+
}
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
The `audio` field is an absolute path on disk. Output is mono 16-bit PCM at
|
|
271
|
+
24 kHz wrapped in a standard WAV container — playable by any browser, OS
|
|
272
|
+
player, or `<audio>` tag without conversion.
|
|
273
|
+
|
|
274
|
+
To let the user hear it, write a markdown link in your reply:
|
|
275
|
+
|
|
276
|
+
```markdown
|
|
277
|
+
[🔊 听一下](file:///abs/path/from/response.wav)
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
For embedding in HTML documents, use:
|
|
281
|
+
|
|
282
|
+
```html
|
|
283
|
+
<audio controls src="./assets/generated/xxx.wav"></audio>
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
### Response (failure)
|
|
287
|
+
|
|
288
|
+
Same shape and `error_type` values as image generation, but with `"audio": null`.
|
|
289
|
+
`not_configured` means no `type=audio` model is set up.
|
|
290
|
+
|
|
291
|
+
### Cost & length tips
|
|
292
|
+
|
|
293
|
+
- Gemini TTS bills by tokens (input text + generated audio). A typical
|
|
294
|
+
one-paragraph narration costs well under $0.001.
|
|
295
|
+
- For long-form audio (>1 minute), split the script into paragraphs and
|
|
296
|
+
generate each separately, then concatenate locally — avoids upstream
|
|
297
|
+
truncation and gives you finer control over pacing.
|
|
298
|
+
- Voice consistency: Gemini TTS does not currently support voice cloning;
|
|
299
|
+
use the same `voice` name across calls in one project to keep the
|
|
300
|
+
narrator consistent.
|
|
133
301
|
|
|
134
|
-
The same `/api/media/` namespace will gain `video` and `audio` endpoints. The pattern is identical: the user configures `type=video` / `type=audio` models in settings, this skill (or its successor) calls the matching endpoint.
|
data/lib/clacky/media/base.rb
CHANGED
|
@@ -29,6 +29,28 @@ module Clacky
|
|
|
29
29
|
raise NotImplementedError, "#{self.class.name} must implement #generate_image"
|
|
30
30
|
end
|
|
31
31
|
|
|
32
|
+
# Fallback video entry point: generates nothing and reports a
# "not_implemented" error that names the current adapter class.
# +duration_seconds+, +output_dir+ and any extra keywords are accepted but
# unused here.
#
# @param prompt [String] echoed back in the error payload
# @param aspect_ratio [String] echoed back in the error payload
# @return [Hash] either video_success_response(...) or
#   video_error_response(...); this implementation always returns the latter
def generate_video(prompt:, aspect_ratio: "landscape", duration_seconds: nil, output_dir: nil, **_kwargs)
  adapter_name = self.class.name.split("::").last
  video_error_response(
    error: "Video generation is not supported by #{adapter_name}. Use the openclacky gateway with a video model such as or-veo-3-1.",
    error_type: "not_implemented",
    provider: "",
    prompt: prompt,
    aspect_ratio: aspect_ratio
  )
end
|
|
43
|
+
|
|
44
|
+
# Fallback speech entry point: synthesises nothing and reports a
# "not_implemented" error that names the current adapter class.
# +voice+, +output_dir+ and any extra keywords are accepted but unused here.
#
# @param input [String] echoed back in the error payload
# @return [Hash] either audio_success_response(...) or audio_error_response(...);
#   this implementation always returns the latter
def generate_speech(input:, voice: nil, output_dir: nil, **_kwargs)
  adapter_name = self.class.name.split("::").last
  audio_error_response(
    error: "Speech synthesis is not supported by #{adapter_name}. Use the openclacky gateway with a TTS model such as or-tts-gemini-2-5-flash.",
    error_type: "not_implemented",
    provider: "",
    input: input
  )
end
|
|
53
|
+
|
|
32
54
|
# Persist a base64-encoded image under <output_dir>/assets/generated/.
|
|
33
55
|
# Returns the absolute path on disk.
|
|
34
56
|
private def save_b64_image(b64_data, output_dir:, prefix: "img", extension: "png")
|
|
@@ -41,6 +63,29 @@ module Clacky
|
|
|
41
63
|
path
|
|
42
64
|
end
|
|
43
65
|
|
|
66
|
+
# Persist a base64-encoded video under <output_dir>/assets/generated/.
# Returns the absolute path on disk when output_dir is absolute. Mirrors
# #save_b64_image; only the defaults differ (prefix "vid", extension "mp4").
#
# The file name is "<prefix>_<YYYYMMDD_HHMMSS>_<8 hex chars>.<extension>";
# the random suffix keeps files written within the same second distinct.
#
# @param b64_data [String] base64 payload; decoded with Base64.decode64
# @param output_dir [String] directory that receives assets/generated/
# @return [String] path of the written file
private def save_b64_video(b64_data, output_dir:, prefix: "vid", extension: "mp4")
  target_dir = File.join(output_dir, "assets", "generated")
  FileUtils.mkdir_p(target_dir)

  stamp  = Time.now.strftime("%Y%m%d_%H%M%S")
  suffix = SecureRandom.hex(4)
  path   = File.join(target_dir, "#{prefix}_#{stamp}_#{suffix}.#{extension}")

  File.binwrite(path, Base64.decode64(b64_data))
  path
end
|
|
78
|
+
|
|
79
|
+
# Persist base64-encoded audio under <output_dir>/assets/generated/ and
# return the path of the written file.
#
# The file name is "<prefix>_<YYYYMMDD_HHMMSS>_<8 hex chars>.<extension>";
# the random suffix keeps files written within the same second distinct.
#
# @param b64_data [String] base64 payload; decoded with Base64.decode64
# @param output_dir [String] directory that receives assets/generated/
# @param prefix [String] file-name prefix (default "tts")
# @param extension [String] file extension without the dot (default "wav")
# @return [String] path of the written file
private def save_b64_audio(b64_data, output_dir:, prefix: "tts", extension: "wav")
  target_dir = File.join(output_dir, "assets", "generated")
  FileUtils.mkdir_p(target_dir)

  stamp  = Time.now.strftime("%Y%m%d_%H%M%S")
  suffix = SecureRandom.hex(4)
  path   = File.join(target_dir, "#{prefix}_#{stamp}_#{suffix}.#{extension}")

  File.binwrite(path, Base64.decode64(b64_data))
  path
end
|
|
88
|
+
|
|
44
89
|
# Download a remote image URL and persist it under
|
|
45
90
|
# <output_dir>/assets/generated/, mirroring save_b64_image so providers
|
|
46
91
|
# that return URLs (e.g. DashScope, whose links expire after 24h) land
|
|
@@ -95,6 +140,54 @@ module Clacky
|
|
|
95
140
|
"provider" => provider
|
|
96
141
|
}
|
|
97
142
|
end
|
|
143
|
+
|
|
144
|
+
# Build the success payload for a video generation call.
#
# @param video [String] value stored under "video"
# @param prompt [String] prompt echoed back to the caller
# @param aspect_ratio [String] aspect ratio echoed back to the caller
# @param provider [String] provider identifier
# @param extra [Hash] additional string-keyed entries merged on top of the
#   base payload (entries with the same key replace the base value)
# @return [Hash] string-keyed payload with "success" => true and
#   "model" => @model
private def video_success_response(video:, prompt:, aspect_ratio:, provider:, extra: {})
  base = {
    "success" => true,
    "video" => video,
    "model" => @model,
    "prompt" => prompt,
    "aspect_ratio" => aspect_ratio,
    "provider" => provider
  }
  base.merge(extra)
end
|
|
154
|
+
|
|
155
|
+
# Build the failure payload for a video generation call.
#
# @param error [String] human-readable error message
# @param error_type [String] machine-readable category (default "provider_error")
# @param provider [String] provider identifier (default "")
# @param prompt [String] prompt echoed back to the caller (default "")
# @param aspect_ratio [String] aspect ratio echoed back (default "landscape")
# @return [Hash] string-keyed payload with "success" => false,
#   "video" => nil and "model" => @model
private def video_error_response(error:, error_type: "provider_error", provider: "", prompt: "", aspect_ratio: "landscape")
  {
    "success" => false,
    "video" => nil,
    "error" => error,
    "error_type" => error_type,
    "model" => @model,
    "prompt" => prompt,
    "aspect_ratio" => aspect_ratio,
    "provider" => provider
  }
end
|
|
167
|
+
|
|
168
|
+
# Build the success payload for a speech synthesis call.
#
# @param audio [String] value stored under "audio"
# @param input [String] spoken text echoed back to the caller
# @param voice [String] voice name echoed back to the caller
# @param provider [String] provider identifier
# @param extra [Hash] additional string-keyed entries merged on top of the
#   base payload (entries with the same key replace the base value)
# @return [Hash] string-keyed payload with "success" => true and
#   "model" => @model
private def audio_success_response(audio:, input:, voice:, provider:, extra: {})
  base = {
    "success" => true,
    "audio" => audio,
    "model" => @model,
    "input" => input,
    "voice" => voice,
    "provider" => provider
  }
  base.merge(extra)
end
|
|
178
|
+
|
|
179
|
+
# Build the failure payload for a speech synthesis call.
#
# @param error [String] human-readable error message
# @param error_type [String] machine-readable category (default "provider_error")
# @param provider [String] provider identifier (default "")
# @param input [String] spoken text echoed back to the caller (default "")
# @param voice [String] voice name echoed back to the caller (default "")
# @return [Hash] string-keyed payload with "success" => false,
#   "audio" => nil and "model" => @model
private def audio_error_response(error:, error_type: "provider_error", provider: "", input: "", voice: "")
  {
    "success" => false,
    "audio" => nil,
    "error" => error,
    "error_type" => error_type,
    "model" => @model,
    "input" => input,
    "voice" => voice,
    "provider" => provider
  }
end
|
|
98
191
|
end
|
|
99
192
|
end
|
|
100
193
|
end
|
data/lib/clacky/media/gemini.rb
CHANGED
|
@@ -31,6 +31,16 @@ module Clacky
|
|
|
31
31
|
aspect_ratio: aspect_ratio
|
|
32
32
|
)
|
|
33
33
|
end
|
|
34
|
+
|
|
35
|
+
# Video generation is not available on this adapter: always returns a
# "not_implemented" error payload attributed to provider "gemini-direct".
# +duration_seconds+, +output_dir+ and any extra keywords are accepted but
# unused.
#
# @param prompt [String] echoed back in the error payload
# @param aspect_ratio [String] echoed back in the error payload
# @return [Hash] the hash built by video_error_response
def generate_video(prompt:, aspect_ratio: "landscape", duration_seconds: nil, output_dir: nil, **_kwargs)
  video_error_response(
    error: "Direct Google AI Studio video generation is not supported. Use the openclacky gateway (base_url https://api.openclacky.com) with a video model such as or-veo-3-1.",
    error_type: "not_implemented",
    provider: "gemini-direct",
    prompt: prompt,
    aspect_ratio: aspect_ratio
  )
end
|
|
34
44
|
end
|
|
35
45
|
end
|
|
36
46
|
end
|
|
data/lib/clacky/media/generator.rb
CHANGED
|
@@ -44,6 +44,16 @@ module Clacky
|
|
|
44
44
|
@agent_config.find_model_by_type("image")
|
|
45
45
|
end
|
|
46
46
|
|
|
47
|
+
# Look up the configured video model via the agent config.
#
# @return [Hash, nil] the type=video model entry, or nil if not configured
def video_model_entry
  @agent_config.find_model_by_type("video")
end
|
|
51
|
+
|
|
52
|
+
# Look up the configured audio (text-to-speech) model via the agent config.
#
# @return [Hash, nil] the type=audio model entry, or nil if not configured
def audio_model_entry
  @agent_config.find_model_by_type("audio")
end
|
|
56
|
+
|
|
47
57
|
def generate_image(prompt:, aspect_ratio: "landscape", output_dir: nil, **kwargs)
|
|
48
58
|
entry = image_model_entry
|
|
49
59
|
if entry.nil?
|
|
@@ -67,6 +77,53 @@ module Clacky
|
|
|
67
77
|
)
|
|
68
78
|
end
|
|
69
79
|
|
|
80
|
+
# Generate a video with the configured type=video model.
#
# When no video model is configured, returns a "not_configured" error hash
# without building a provider. Otherwise builds the provider for the model
# entry and forwards every argument, including extra keywords, to its
# #generate_video.
#
# @param prompt [String] text prompt
# @param aspect_ratio [String] forwarded to the provider (default "landscape")
# @param duration_seconds [Integer, nil] forwarded to the provider
# @param output_dir [String, nil] forwarded to the provider
# @return [Hash] the provider's response, or the not_configured error hash
def generate_video(prompt:, aspect_ratio: "landscape", duration_seconds: nil, output_dir: nil, **kwargs)
  entry = video_model_entry
  if entry.nil?
    # Carries "aspect_ratio" so this payload has the same keys as every
    # other video error response.
    return {
      "success" => false,
      "video" => nil,
      "error" => "No video model configured. Add a model with type=video in settings.",
      "error_type" => "not_configured",
      "provider" => "",
      "model" => "",
      "prompt" => prompt,
      "aspect_ratio" => aspect_ratio
    }
  end

  build_provider_for(entry).generate_video(
    prompt: prompt,
    aspect_ratio: aspect_ratio,
    duration_seconds: duration_seconds,
    output_dir: output_dir,
    **kwargs
  )
end
|
|
103
|
+
|
|
104
|
+
# Synthesize speech with the configured type=audio model.
#
# When no audio model is configured, returns a "not_configured" error hash
# without building a provider. Otherwise builds the provider for the model
# entry and forwards every argument, including extra keywords, to its
# #generate_speech.
#
# @param input [String] text to speak
# @param voice [String, nil] forwarded to the provider
# @param output_dir [String, nil] forwarded to the provider
# @return [Hash] the provider's response, or the not_configured error hash
def generate_speech(input:, voice: nil, output_dir: nil, **kwargs)
  entry = audio_model_entry
  if entry.nil?
    # Carries "voice" so this payload has the same keys as every other
    # audio error response.
    return {
      "success" => false,
      "audio" => nil,
      "error" => "No audio model configured. Add a model with type=audio in settings.",
      "error_type" => "not_configured",
      "provider" => "",
      "model" => "",
      "input" => input,
      "voice" => voice.to_s
    }
  end

  build_provider_for(entry).generate_speech(
    input: input,
    voice: voice,
    output_dir: output_dir,
    **kwargs
  )
end
|
|
126
|
+
|
|
70
127
|
# Pick the adapter class for a media model entry.
|
|
71
128
|
#
|
|
72
129
|
# Routing rules:
|
|
data/lib/clacky/media/openai_compat.rb
CHANGED
|
@@ -22,6 +22,12 @@ module Clacky
|
|
|
22
22
|
|
|
23
23
|
DEFAULT_ASPECT = "landscape"
|
|
24
24
|
|
|
25
|
+
# Video aspect ratios accepted by the gateway's /videos/generations
# endpoint. The human-friendly labels map straight through; the gateway
# normalises to Veo's "16:9" / "9:16" internally.
VIDEO_ASPECTS = %w[landscape portrait].freeze

# Clip length, in seconds, used when the caller supplies no positive
# duration.
DEFAULT_VIDEO_DURATION = 8
|
|
30
|
+
|
|
25
31
|
def generate_image(prompt:, aspect_ratio: DEFAULT_ASPECT, output_dir: nil, n: 1, **_kwargs)
|
|
26
32
|
provider_id = Clacky::Providers.find_by_base_url(@base_url) || "custom"
|
|
27
33
|
aspect = ASPECT_TO_SIZE.key?(aspect_ratio) ? aspect_ratio : DEFAULT_ASPECT
|
|
@@ -135,6 +141,143 @@ module Clacky
|
|
|
135
141
|
)
|
|
136
142
|
end
|
|
137
143
|
|
|
144
|
+
# Generate a video by POSTing to the "videos/generations" endpoint and
# saving the returned base64 payload with save_b64_video.
#
# Input normalisation:
# - an aspect_ratio outside VIDEO_ASPECTS falls back to DEFAULT_ASPECT;
# - duration_seconds is coerced with #to_i, and a non-positive result
#   falls back to DEFAULT_VIDEO_DURATION;
# - image is sent only when it is a Hash with a "b64_json" entry.
#
# Failures are reported as a video_error_response whose error_type is one
# of: "invalid_argument" (blank prompt), "auth_required" (no api key),
# "network_error" (Faraday::Error), "api_error" (non-success HTTP status),
# "invalid_response" (body is not a JSON object) or "empty_response"
# (no usable "b64_json" in the first element of "data").
#
# @param prompt [String] text prompt; must not be blank
# @param aspect_ratio [String] "landscape" or "portrait"
# @param duration_seconds [Integer, String, nil] requested clip length
# @param output_dir [String, nil] base directory; defaults to Dir.pwd
# @param image [Hash, nil] optional { "b64_json" => ... } first frame
# @return [Hash] video_success_response(...) or video_error_response(...)
def generate_video(prompt:, aspect_ratio: DEFAULT_ASPECT, duration_seconds: nil, output_dir: nil, image: nil, **_kwargs)
  provider_id = Clacky::Providers.find_by_base_url(@base_url) || "custom"
  aspect = VIDEO_ASPECTS.include?(aspect_ratio) ? aspect_ratio : DEFAULT_ASPECT
  duration = duration_seconds.to_i
  duration = DEFAULT_VIDEO_DURATION if duration <= 0

  if prompt.to_s.strip.empty?
    return video_error_response(
      error: "Prompt is required and must be a non-empty string",
      error_type: "invalid_argument", provider: provider_id, aspect_ratio: aspect
    )
  end
  if @api_key.to_s.empty?
    return video_error_response(
      error: "api_key not configured for video model '#{@model}'",
      error_type: "auth_required", provider: provider_id, prompt: prompt, aspect_ratio: aspect
    )
  end

  payload = { model: @model, prompt: prompt, aspect_ratio: aspect, duration_seconds: duration }
  payload[:image] = image if image.is_a?(Hash) && image["b64_json"]

  begin
    response = video_connection.post("videos/generations") do |req|
      req.headers["Content-Type"] = "application/json"
      req.headers["Authorization"] = "Bearer #{@api_key}"
      req.body = JSON.generate(payload)
    end
  rescue Faraday::Error => e
    return video_error_response(
      error: "HTTP request failed: #{e.message}",
      error_type: "network_error", provider: provider_id, prompt: prompt, aspect_ratio: aspect
    )
  end

  unless response.success?
    return video_error_response(
      error: "Upstream #{response.status}: #{truncate(response.body, 500)}",
      error_type: "api_error", provider: provider_id, prompt: prompt, aspect_ratio: aspect
    )
  end

  body = parse_json(response.body)
  unless body.is_a?(Hash)
    return video_error_response(
      error: "Invalid JSON response from upstream",
      error_type: "invalid_response", provider: provider_id, prompt: prompt, aspect_ratio: aspect
    )
  end

  # Guard against a malformed "data" field (e.g. an object instead of an
  # array): report it as an empty response rather than raising.
  data = body["data"]
  first = data.is_a?(Array) ? data.first : nil
  unless first.is_a?(Hash) && !first["b64_json"].to_s.empty?
    return video_error_response(
      error: "Upstream returned no video data",
      error_type: "empty_response", provider: provider_id, prompt: prompt, aspect_ratio: aspect
    )
  end

  path = save_b64_video(first["b64_json"], output_dir: output_dir || Dir.pwd, prefix: "vid")
  video_success_response(
    video: path, prompt: prompt, aspect_ratio: aspect, provider: provider_id,
    extra: {
      "duration_seconds" => duration,
      "usage" => body["usage"],
      "cost_usd" => body["cost_usd"]
    }.compact
  )
end
|
|
210
|
+
|
|
211
|
+
# Synthesize speech by POSTing to the "audio/speech" endpoint and saving
# the returned base64 payload with save_b64_audio.
#
# +voice+ is sent only when it is non-blank. The file extension follows the
# returned "mime_type": "audio/mpeg" / "audio/mp3" => mp3, "audio/ogg" =>
# ogg, anything else => wav. The reported voice is the upstream "voice"
# field when present, otherwise the requested voice.
#
# Failures are reported as an audio_error_response whose error_type is one
# of: "invalid_argument" (blank input), "auth_required" (no api key),
# "network_error" (Faraday::Error), "api_error" (non-success HTTP status),
# "invalid_response" (body is not a JSON object) or "empty_response"
# (no usable "b64_json" in the first element of "data").
#
# @param input [String] text to speak; must not be blank
# @param voice [String, nil] voice name
# @param output_dir [String, nil] base directory; defaults to Dir.pwd
# @return [Hash] audio_success_response(...) or audio_error_response(...)
def generate_speech(input:, voice: nil, output_dir: nil, **_kwargs)
  provider_id = Clacky::Providers.find_by_base_url(@base_url) || "custom"

  if input.to_s.strip.empty?
    return audio_error_response(
      error: "input is required and must be a non-empty string",
      error_type: "invalid_argument", provider: provider_id, voice: voice.to_s
    )
  end
  if @api_key.to_s.empty?
    return audio_error_response(
      error: "api_key not configured for audio model '#{@model}'",
      error_type: "auth_required", provider: provider_id, input: input, voice: voice.to_s
    )
  end

  payload = { model: @model, input: input }
  payload[:voice] = voice if voice && !voice.to_s.strip.empty?

  begin
    response = audio_connection.post("audio/speech") do |req|
      req.headers["Content-Type"] = "application/json"
      req.headers["Authorization"] = "Bearer #{@api_key}"
      req.body = JSON.generate(payload)
    end
  rescue Faraday::Error => e
    return audio_error_response(
      error: "HTTP request failed: #{e.message}",
      error_type: "network_error", provider: provider_id, input: input, voice: voice.to_s
    )
  end

  unless response.success?
    return audio_error_response(
      error: "Upstream #{response.status}: #{truncate(response.body, 500)}",
      error_type: "api_error", provider: provider_id, input: input, voice: voice.to_s
    )
  end

  body = parse_json(response.body)
  unless body.is_a?(Hash)
    return audio_error_response(
      error: "Invalid JSON response from upstream",
      error_type: "invalid_response", provider: provider_id, input: input, voice: voice.to_s
    )
  end

  # Guard against a malformed "data" field (e.g. an object instead of an
  # array): report it as an empty response rather than raising.
  data = body["data"]
  first = data.is_a?(Array) ? data.first : nil
  unless first.is_a?(Hash) && !first["b64_json"].to_s.empty?
    return audio_error_response(
      error: "Upstream returned no audio data",
      error_type: "empty_response", provider: provider_id, input: input, voice: voice.to_s
    )
  end

  ext = case first["mime_type"].to_s
        when "audio/mpeg", "audio/mp3" then "mp3"
        when "audio/ogg" then "ogg"
        else "wav"
        end

  path = save_b64_audio(first["b64_json"], output_dir: output_dir || Dir.pwd, prefix: "tts", extension: ext)
  audio_success_response(
    audio: path, input: input, voice: body["voice"] || voice.to_s, provider: provider_id,
    extra: {
      "mime_type" => first["mime_type"],
      "usage" => body["usage"],
      "cost_usd" => body["cost_usd"]
    }.compact
  )
end
|
|
280
|
+
|
|
138
281
|
private def connection
|
|
139
282
|
Faraday.new(url: normalized_base_url) do |f|
|
|
140
283
|
f.options.timeout = 240
|
|
@@ -142,6 +285,23 @@ module Clacky
|
|
|
142
285
|
end
|
|
143
286
|
end
|
|
144
287
|
|
|
288
|
+
# Video generation runs the gateway's submit+poll cycle inside one
# request, which can take several minutes; give it a much longer
# timeout (600 s) than the image path. Connection open timeout is 10 s.
#
# @return [Faraday::Connection] connection rooted at normalized_base_url
private def video_connection
  Faraday.new(url: normalized_base_url) do |f|
    f.options.timeout = 600
    f.options.open_timeout = 10
  end
end
|
|
297
|
+
|
|
298
|
+
# Connection used for speech synthesis requests: 120 s request timeout,
# 10 s connection open timeout.
#
# @return [Faraday::Connection] connection rooted at normalized_base_url
private def audio_connection
  Faraday.new(url: normalized_base_url) do |f|
    f.options.timeout = 120
    f.options.open_timeout = 10
  end
end
|
|
304
|
+
|
|
145
305
|
private def gemini_family?(model_name)
|
|
146
306
|
model_name.to_s.match?(/gemini|imagen/i)
|
|
147
307
|
end
|
|
data/lib/clacky/message_history.rb
CHANGED
|
@@ -150,11 +150,6 @@ module Clacky
|
|
|
150
150
|
@messages.find { |m| m[:subagent_instructions] }
|
|
151
151
|
end
|
|
152
152
|
|
|
153
|
-
# Return all messages where task_id <= given id (Time Machine support).
# Messages that carry no :task_id are always included.
#
# @param task_id [Integer] highest task id to include
# @return [Array<Hash>] matching messages, in their stored order
def for_task(task_id)
  @messages.select { |m| !m[:task_id] || m[:task_id] <= task_id }
end
|
|
157
|
-
|
|
158
153
|
# ─────────────────────────────────────────────
|
|
159
154
|
# Size helpers
|
|
160
155
|
# ─────────────────────────────────────────────
|
|
@@ -191,8 +186,18 @@ module Clacky
|
|
|
191
186
|
# can't fire when the previous turns came from a provider that keeps
|
|
192
187
|
# thinking inline (e.g. MiniMax: <think>...</think> in content), so
|
|
193
188
|
# this bypass lets us recover on the retry without a server restart.
|
|
194
|
-
|
|
195
|
-
|
|
189
|
+
# Convert to API-ready messages. When `task_chain` is given (a Set of
# task IDs forming the active task's ancestor chain), messages tagged with
# a task_id outside that chain are dropped first — this is the Time Machine
# path, ensuring undone/sibling-branch turns never reach the LLM. Messages
# without a task_id (system / injected context) are always kept.
#
# The surviving messages are passed through strip_for_api, then
# repair_tool_call_pairing, then ensure_reasoning_content_consistency.
#
# @param force_reasoning_content_pad [Boolean] forwarded as `force:` to
#   ensure_reasoning_content_consistency
# @param task_chain [#include?, nil] task ids to keep; nil keeps everything
# @return whatever ensure_reasoning_content_consistency returns
def to_api(force_reasoning_content_pad: false, task_chain: nil)
  source =
    if task_chain
      @messages.select { |m| !m[:task_id] || task_chain.include?(m[:task_id]) }
    else
      @messages
    end

  msgs = source.map { |m| strip_for_api(m) }
  msgs = repair_tool_call_pairing(msgs)
  ensure_reasoning_content_consistency(msgs, force: force_reasoning_content_pad)
end
|
data/lib/clacky/providers.rb
CHANGED
|
@@ -60,6 +60,34 @@ module Clacky
|
|
|
60
60
|
"or-gpt-image-2" => "GPT Image 2"
|
|
61
61
|
},
|
|
62
62
|
"default_image_model" => "or-gpt-image-2",
|
|
63
|
+
# Video generation models served by the openclacky gateway, which
|
|
64
|
+
# routes them to Vertex AI Veo (async predictLongRunning under the
|
|
65
|
+
# hood; the gateway hides the polling and returns the MP4 inline).
|
|
66
|
+
"video_models" => [
|
|
67
|
+
"or-veo-3-1",
|
|
68
|
+
"or-veo-3-1-fast",
|
|
69
|
+
"or-veo-3",
|
|
70
|
+
"or-veo-3-fast"
|
|
71
|
+
],
|
|
72
|
+
"video_model_aliases" => {
|
|
73
|
+
"or-veo-3-1" => "Veo 3.1",
|
|
74
|
+
"or-veo-3-1-fast" => "Veo 3.1 Fast",
|
|
75
|
+
"or-veo-3" => "Veo 3",
|
|
76
|
+
"or-veo-3-fast" => "Veo 3 Fast"
|
|
77
|
+
},
|
|
78
|
+
"default_video_model" => "or-veo-3-1",
|
|
79
|
+
# Text-to-speech models served by the openclacky gateway, which
|
|
80
|
+
# routes them to Vertex AI Gemini 2.5 (responseModalities=["AUDIO"]).
|
|
81
|
+
# The gateway returns WAV inline as base64.
|
|
82
|
+
"audio_models" => [
|
|
83
|
+
"or-tts-gemini-2-5-flash",
|
|
84
|
+
"or-tts-gemini-2-5-pro"
|
|
85
|
+
],
|
|
86
|
+
"audio_model_aliases" => {
|
|
87
|
+
"or-tts-gemini-2-5-flash" => "Gemini 2.5 Flash TTS",
|
|
88
|
+
"or-tts-gemini-2-5-pro" => "Gemini 2.5 Pro TTS"
|
|
89
|
+
},
|
|
90
|
+
"default_audio_model" => "or-tts-gemini-2-5-flash",
|
|
63
91
|
# Default OCR sidecar — used when the primary model is text-only.
|
|
64
92
|
# Candidates are derived from the provider's vision-capable models;
|
|
65
93
|
# this just picks the cheap+fast default to surface in "auto" mode.
|
|
data/lib/clacky/rich_ui_controller.rb
CHANGED
|
@@ -559,7 +559,9 @@ module Clacky
|
|
|
559
559
|
@running = false
|
|
560
560
|
end
|
|
561
561
|
|
|
562
|
-
|
|
562
|
+
# Clears the screen on exit by default — the Rich UI repaints fullscreen
|
|
563
|
+
# and leaves no useful scrollback to preserve.
|
|
564
|
+
def stop(clear_screen: true)
|
|
563
565
|
@running = false
|
|
564
566
|
@shell.stop
|
|
565
567
|
RubyRich::Terminal.clear if clear_screen
|