openclacky 1.2.18 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +21 -0
  3. data/lib/clacky/agent/time_machine.rb +256 -74
  4. data/lib/clacky/agent/tool_executor.rb +12 -0
  5. data/lib/clacky/agent.rb +15 -20
  6. data/lib/clacky/agent_config.rb +18 -0
  7. data/lib/clacky/cli.rb +55 -3
  8. data/lib/clacky/default_skills/media-gen/SKILL.md +172 -5
  9. data/lib/clacky/media/base.rb +93 -0
  10. data/lib/clacky/media/gemini.rb +10 -0
  11. data/lib/clacky/media/generator.rb +57 -0
  12. data/lib/clacky/media/openai_compat.rb +160 -0
  13. data/lib/clacky/message_history.rb +12 -7
  14. data/lib/clacky/providers.rb +28 -0
  15. data/lib/clacky/rich_ui_controller.rb +3 -1
  16. data/lib/clacky/server/backup_manager.rb +200 -0
  17. data/lib/clacky/server/channel/adapters/feishu/adapter.rb +10 -2
  18. data/lib/clacky/server/channel/adapters/feishu/bot.rb +68 -15
  19. data/lib/clacky/server/channel/channel_manager.rb +65 -50
  20. data/lib/clacky/server/http_server.rb +345 -14
  21. data/lib/clacky/server/scheduler.rb +19 -0
  22. data/lib/clacky/server/session_registry.rb +8 -4
  23. data/lib/clacky/session_manager.rb +40 -2
  24. data/lib/clacky/tools/trash_manager.rb +14 -0
  25. data/lib/clacky/ui2/components/command_suggestions.rb +1 -0
  26. data/lib/clacky/ui2/components/modal_component.rb +34 -7
  27. data/lib/clacky/ui2/ui_controller.rb +150 -19
  28. data/lib/clacky/utils/file_processor.rb +75 -4
  29. data/lib/clacky/version.rb +1 -1
  30. data/lib/clacky/web/app.css +2038 -1147
  31. data/lib/clacky/web/app.js +22 -1
  32. data/lib/clacky/web/backup.js +119 -0
  33. data/lib/clacky/web/billing.js +94 -7
  34. data/lib/clacky/web/channels.js +81 -11
  35. data/lib/clacky/web/design-sample.css +247 -0
  36. data/lib/clacky/web/design-sample.html +127 -0
  37. data/lib/clacky/web/favicon.svg +16 -0
  38. data/lib/clacky/web/i18n.js +159 -31
  39. data/lib/clacky/web/index.html +175 -55
  40. data/lib/clacky/web/logo_nav_dark.png +0 -0
  41. data/lib/clacky/web/onboard.js +114 -28
  42. data/lib/clacky/web/sessions.js +436 -192
  43. data/lib/clacky/web/settings.js +21 -1
  44. data/lib/clacky/web/skills.js +1 -1
  45. data/lib/clacky/web/tasks.js +129 -61
  46. data/lib/clacky/web/utils.js +72 -0
  47. data/lib/clacky/web/ws-dispatcher.js +6 -0
  48. data/lib/clacky.rb +1 -0
  49. metadata +7 -3
  50. data/lib/clacky/server/channel/group_message_buffer.rb +0 -53
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: media-gen
3
- description: 'Generate images (and later videos / audio) inside the current task. Use this skill whenever the user asks to create, generate, or produce a picture / image / illustration / cover / poster / icon / artwork including phrases like 生成图片, 画一张, 做封面, 来张配图, generate image, make a picture, draw, create artwork, design a cover. Also use when building documents (slides, PPT, posters, marketing pages, README hero shots) where an image is needed inline. Routes calls through the local Clacky HTTP server, which uses the user-configured `type=image` model — you do NOT need to know which provider; the server handles it.'
3
+ description: 'Generate images, videos, or audio (text-to-speech) in the current task. Use whenever the user asks to create/generate/produce a picture / image / illustration / cover / poster / icon / artwork, a video / clip / animation, or speech / voiceover / narration / TTS — e.g. 生成图片, 画一张, 做封面, 配图, generate image, make a picture, draw, design a cover, 生成视频, 做个视频, text-to-video, 朗读, 配音, 旁白, 文字转语音, generate speech, voiceover. Also use when a document (slides, poster, README hero) needs an inline image.'
4
4
  disable-model-invocation: false
5
5
  user-invocable: true
6
6
  always-show: true
@@ -27,13 +27,29 @@ curl -s http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/types
27
27
 
28
28
  If the response shows `image.configured = false`, stop and tell the user:
29
29
 
30
- > 还没有配置生图模型。请打开 Clacky 设置页 → 添加模型 → 类型选 `image`(推荐 `or-gemini-3-pro-image` 或 `or-gpt-image-1`)。配好后再让我生图。
30
+ > 还没有配置生图模型。请打开设置页 → 添加模型 → 类型选 `image`(走 openclacky 官方网关时推荐 `or-gemini-3-pro-image` 或 `or-gpt-image-2`)。配好后再让我生图。
31
31
 
32
32
  Do NOT try to fall back to `terminal` + a hand-written `curl https://api.openai.com/...` — that bypasses the user's configured backend and won't be billed correctly.
33
33
 
34
+ **You do NOT configure models — the user does, in the settings page.** Never
35
+ edit the user's `config.yml` to add or change a model, and never invent a model
36
+ name from memory (e.g. `or-gpt-5.4-image-2` does not exist). The real, current
37
+ model is whatever `/api/media/types` reports under `image.model`. If you think a
38
+ different model is needed, tell the user which one to set in the settings page —
39
+ don't touch the config file yourself.
40
+
34
41
  ## Step 2 — Generate the image
35
42
 
36
- ### ⚠️ Important: generation speed & concurrency
43
+ ### The model does NOT honor exact pixel sizes
44
+
45
+ There is no `size` / `width` / `height` field — the only shape control is
46
+ `aspect_ratio` (`landscape` / `square` / `portrait`), and even that is just a
47
+ rough hint (ask for `576x96` in the prompt and you may get `1408x768`). When the user needs an
48
+ **exact pixel size, a grid, an icon at NxN, or a spritesheet**, generate first at
49
+ whatever size the model gives, then resize / crop / tile to the exact pixels with
50
+ ImageMagick (`magick`). Verify with `magick identify` before reporting done.
51
+
52
+ ### Important: generation speed & concurrency
37
53
 
38
54
  - **Image generation can be slow — up to 2 minutes per image depending on the model.** Before calling the API, warn the user that it may take a minute or two. The curl request blocks until the image is ready; do NOT run it in the background.
39
55
  - **One at a time only.** Never generate multiple images concurrently (e.g. by running several `curl` commands simultaneously or in a script loop). Each call consumes significant server-side resources, and parallel requests will almost certainly cause timeouts. If the user wants several images, generate them **sequentially**, one after another.
@@ -47,6 +63,10 @@ curl -s -X POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/ima
47
63
  }'
48
64
  ```
49
65
 
66
+ - The terminal does not accept multi-line commands — write the request into a `.sh` file and run that file instead of pasting a multi-line `curl`.
67
+ - If a call fails with `400 / INVALID_ARGUMENT`, drop the `aspect_ratio` field and retry once before reporting the error.
68
+ - If a call fails with `unknown image model` (400), the configured model name isn't recognized by its backend — tell the user to fix the model name in the settings page; do NOT guess another name and retry.
69
+
50
70
  ### Request fields
51
71
 
52
72
  | Field | Required | Values | Notes |
@@ -129,6 +149,153 @@ When the user gives a vague request like "给我配张图", ask one clarifying q
129
149
  - The user wants a **diagram / chart** with specific data — use a charting library (matplotlib, mermaid, etc.) instead; image gen is for illustrations, not data viz
130
150
  - The user asks for **screenshots** of real software — use the browser tool
131
151
 
132
- ## Future modalities
152
+ ## Generating video (Veo)
153
+
154
+ The same `/api/media/` namespace serves video generation. The user must
155
+ configure a `type=video` model in settings (recommended: `or-veo-3-1`).
156
+
157
+ ### Endpoint
158
+
159
+ ```
160
+ POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/video
161
+ ```
162
+
163
+ Check `GET /api/media/types` first — if `video.configured = false`, tell the
164
+ user to add a `type=video` model in settings before generating.
165
+
166
+ ### Video is slow and expensive
167
+
168
+ - **A single clip can take 1–3 minutes (sometimes longer).** Warn the user
169
+ before calling, and run the curl in the foreground — it blocks until the
170
+ MP4 is ready. Do NOT background it.
171
+ - **One at a time.** Never run multiple video generations concurrently.
172
+ - Each clip costs real money (billed per output-second). Confirm the prompt
173
+ with the user before generating.
174
+
175
+ ### Request
176
+
177
+ ```bash
178
+ curl -s -X POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/video \
179
+ -H "Content-Type: application/json" \
180
+ -d '{
181
+ "prompt": "A cinematic drone shot flying over a misty mountain range at sunrise, golden light, 4K.",
182
+ "aspect_ratio": "landscape",
183
+ "duration_seconds": 8
184
+ }'
185
+ ```
186
+
187
+ | Field | Required | Values | Notes |
188
+ |--------------------|----------|---------------------------------|-------|
189
+ | `prompt` | yes | string | Same prompt-craft tips as images apply. |
190
+ | `aspect_ratio` | no | `landscape` / `portrait` | Defaults to `landscape` (16:9). |
191
+ | `duration_seconds` | no | 4–8 | Defaults to 8. |
192
+ | `image` | no | `{ "b64_json": "...", "mime_type": "image/png" }` | Optional first frame for image-to-video. |
193
+ | `output_dir` | no | absolute path | MP4 saved under `<output_dir>/assets/generated/`. |
194
+
195
+ ### Response (success)
196
+
197
+ ```json
198
+ {
199
+ "success": true,
200
+ "video": "/abs/path/to/working_dir/assets/generated/vid_20260615_011820_a1b2c3d4.mp4",
201
+ "model": "or-veo-3-1",
202
+ "provider": "openclacky",
203
+ "prompt": "A cinematic drone shot ...",
204
+ "aspect_ratio": "landscape",
205
+ "duration_seconds": 8,
206
+ "cost_usd": 2.688
207
+ }
208
+ ```
209
+
210
+ The `video` field is an absolute path on disk. Show it to the user with a
211
+ markdown link or an HTML5 `<video>` tag pointing at the `file://` path; embed
212
+ it in documents with a relative path under `./assets/generated/`.
213
+
214
+ ### Response (failure)
215
+
216
+ Same shape and `error_type` values as image generation, but with `"video": null`.
217
+ `not_configured` means no `type=video` model is set up.
218
+
219
+ ## Generating speech (Gemini TTS)
220
+
221
+ The same `/api/media/` namespace serves text-to-speech. The user must
222
+ configure a `type=audio` model in settings (recommended:
223
+ `or-tts-gemini-2-5-flash`, the cheap+fast default).
224
+
225
+ ### Endpoint
226
+
227
+ ```
228
+ POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/audio/speech
229
+ ```
230
+
231
+ Check `GET /api/media/types` first — if `audio.configured = false`, tell the
232
+ user to add a `type=audio` model in settings before generating.
233
+
234
+ ### Request
235
+
236
+ ```bash
237
+ curl -s -X POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/audio/speech \
238
+ -H "Content-Type: application/json" \
239
+ -d '{
240
+ "input": "Hello and welcome to openclacky. Today we will explore...",
241
+ "voice": "Kore"
242
+ }'
243
+ ```
244
+
245
+ | Field | Required | Values | Notes |
246
+ |--------------|----------|---------------------------------|-------|
247
+ | `input` | yes | string | The text to speak. Plain prose works best; you can prefix with style cues like "Say cheerfully:" or "In a calm tone:". |
248
+ | `voice` | no | string voice name | Defaults to `Kore`. Common Gemini voices: `Kore`, `Puck`, `Charon`, `Fenrir`, `Aoede`. |
249
+ | `output_dir` | no | absolute path | WAV saved under `<output_dir>/assets/generated/`. |
250
+
251
+ Generation typically takes 2–10 seconds depending on length. The request
252
+ blocks until the WAV is ready.
253
+
254
+ ### Response (success)
255
+
256
+ ```json
257
+ {
258
+ "success": true,
259
+ "audio": "/abs/path/to/working_dir/assets/generated/tts_20260615_233522_4ff02705.wav",
260
+ "model": "or-tts-gemini-2-5-flash",
261
+ "provider": "openclacky",
262
+ "input": "Hello and welcome to openclacky...",
263
+ "voice": "Kore",
264
+ "mime_type": "audio/wav",
265
+ "usage": { "prompt_tokens": 13, "completion_tokens": 122, "total_tokens": 135 },
266
+ "cost_usd": 0.000259
267
+ }
268
+ ```
269
+
270
+ The `audio` field is an absolute path on disk. Output is mono 16-bit PCM at
271
+ 24 kHz wrapped in a standard WAV container — playable by any browser, OS
272
+ player, or `<audio>` tag without conversion.
273
+
274
+ To let the user hear it, write a markdown link in your reply:
275
+
276
+ ```markdown
277
+ [🔊 听一下](file:///abs/path/from/response.wav)
278
+ ```
279
+
280
+ For embedding in HTML documents, use:
281
+
282
+ ```html
283
+ <audio controls src="./assets/generated/xxx.wav"></audio>
284
+ ```
285
+
286
+ ### Response (failure)
287
+
288
+ Same shape and `error_type` values as image generation, but with `"audio": null`.
289
+ `not_configured` means no `type=audio` model is set up.
290
+
291
+ ### Cost & length tips
292
+
293
+ - Gemini TTS bills by tokens (input text + generated audio). A typical
294
+ one-paragraph narration costs well under $0.001.
295
+ - For long-form audio (>1 minute), split the script into paragraphs and
296
+ generate each separately, then concatenate locally — avoids upstream
297
+ truncation and gives you finer control over pacing.
298
+ - Voice consistency: Gemini TTS does not currently support voice cloning;
299
+ use the same `voice` name across calls in one project to keep the
300
+ narrator consistent.
133
301
 
134
- The same `/api/media/` namespace will gain `video` and `audio` endpoints. The pattern is identical: the user configures `type=video` / `type=audio` models in settings, this skill (or its successor) calls the matching endpoint.
@@ -29,6 +29,28 @@ module Clacky
29
29
  raise NotImplementedError, "#{self.class.name} must implement #generate_image"
30
30
  end
31
31
 
32
+ # @return [Hash] either video_success_response(...) or
33
+ # video_error_response(...); this default always returns a "not_implemented" error.
34
+ def generate_video(prompt:, aspect_ratio: "landscape", duration_seconds: nil, output_dir: nil, **_kwargs)
35
+ video_error_response(
36
+ error: "Video generation is not supported by #{self.class.name.split("::").last}. Use the openclacky gateway with a video model such as or-veo-3-1.",
37
+ error_type: "not_implemented",
38
+ provider: "",
39
+ prompt: prompt,
40
+ aspect_ratio: aspect_ratio
41
+ )
42
+ end
43
+
44
+ # @return [Hash] either audio_success_response(...) or audio_error_response(...); this default always returns a "not_implemented" error.
45
+ def generate_speech(input:, voice: nil, output_dir: nil, **_kwargs)
46
+ audio_error_response(
47
+ error: "Speech synthesis is not supported by #{self.class.name.split("::").last}. Use the openclacky gateway with a TTS model such as or-tts-gemini-2-5-flash.",
48
+ error_type: "not_implemented",
49
+ provider: "",
50
+ input: input
51
+ )
52
+ end
53
+
32
54
  # Persist a base64-encoded image under <output_dir>/assets/generated/.
33
55
  # Returns the absolute path on disk.
34
56
  private def save_b64_image(b64_data, output_dir:, prefix: "img", extension: "png")
@@ -41,6 +63,29 @@ module Clacky
41
63
  path
42
64
  end
43
65
 
66
+ # Persist a base64-encoded video under <output_dir>/assets/generated/.
67
+ # Returns the absolute path on disk. Mirrors #save_b64_image; the only
68
+ # differences are the default prefix ("vid") and the default extension ("mp4").
69
+ private def save_b64_video(b64_data, output_dir:, prefix: "vid", extension: "mp4")
70
+ target_dir = File.join(output_dir, "assets", "generated")
71
+ FileUtils.mkdir_p(target_dir)
72
+ ts = Time.now.strftime("%Y%m%d_%H%M%S")
73
+ short = SecureRandom.hex(4)
74
+ path = File.join(target_dir, "#{prefix}_#{ts}_#{short}.#{extension}")
75
+ File.binwrite(path, Base64.decode64(b64_data))
76
+ path
77
+ end
78
+
79
+ private def save_b64_audio(b64_data, output_dir:, prefix: "tts", extension: "wav")
80
+ target_dir = File.join(output_dir, "assets", "generated")
81
+ FileUtils.mkdir_p(target_dir)
82
+ ts = Time.now.strftime("%Y%m%d_%H%M%S")
83
+ short = SecureRandom.hex(4)
84
+ path = File.join(target_dir, "#{prefix}_#{ts}_#{short}.#{extension}")
85
+ File.binwrite(path, Base64.decode64(b64_data))
86
+ path
87
+ end
88
+
44
89
  # Download a remote image URL and persist it under
45
90
  # <output_dir>/assets/generated/, mirroring save_b64_image so providers
46
91
  # that return URLs (e.g. DashScope, whose links expire after 24h) land
@@ -95,6 +140,54 @@ module Clacky
95
140
  "provider" => provider
96
141
  }
97
142
  end
143
+
144
+ private def video_success_response(video:, prompt:, aspect_ratio:, provider:, extra: {})
145
+ {
146
+ "success" => true,
147
+ "video" => video,
148
+ "model" => @model,
149
+ "prompt" => prompt,
150
+ "aspect_ratio" => aspect_ratio,
151
+ "provider" => provider
152
+ }.merge(extra)
153
+ end
154
+
155
+ private def video_error_response(error:, error_type: "provider_error", provider: "", prompt: "", aspect_ratio: "landscape")
156
+ {
157
+ "success" => false,
158
+ "video" => nil,
159
+ "error" => error,
160
+ "error_type" => error_type,
161
+ "model" => @model,
162
+ "prompt" => prompt,
163
+ "aspect_ratio" => aspect_ratio,
164
+ "provider" => provider
165
+ }
166
+ end
167
+
168
+ private def audio_success_response(audio:, input:, voice:, provider:, extra: {})
169
+ {
170
+ "success" => true,
171
+ "audio" => audio,
172
+ "model" => @model,
173
+ "input" => input,
174
+ "voice" => voice,
175
+ "provider" => provider
176
+ }.merge(extra)
177
+ end
178
+
179
+ private def audio_error_response(error:, error_type: "provider_error", provider: "", input: "", voice: "")
180
+ {
181
+ "success" => false,
182
+ "audio" => nil,
183
+ "error" => error,
184
+ "error_type" => error_type,
185
+ "model" => @model,
186
+ "input" => input,
187
+ "voice" => voice,
188
+ "provider" => provider
189
+ }
190
+ end
98
191
  end
99
192
  end
100
193
  end
@@ -31,6 +31,16 @@ module Clacky
31
31
  aspect_ratio: aspect_ratio
32
32
  )
33
33
  end
34
+
35
+ def generate_video(prompt:, aspect_ratio: "landscape", duration_seconds: nil, output_dir: nil, **_kwargs)
36
+ video_error_response(
37
+ error: "Direct Google AI Studio video generation is not supported. Use the openclacky gateway (base_url https://api.openclacky.com) with a video model such as or-veo-3-1.",
38
+ error_type: "not_implemented",
39
+ provider: "gemini-direct",
40
+ prompt: prompt,
41
+ aspect_ratio: aspect_ratio
42
+ )
43
+ end
34
44
  end
35
45
  end
36
46
  end
@@ -44,6 +44,16 @@ module Clacky
44
44
  @agent_config.find_model_by_type("image")
45
45
  end
46
46
 
47
+ # @return [Hash, nil] the type=video model entry, or nil if not configured
48
+ def video_model_entry
49
+ @agent_config.find_model_by_type("video")
50
+ end
51
+
52
+ # @return [Hash, nil] the type=audio model entry, or nil if not configured
53
+ def audio_model_entry
54
+ @agent_config.find_model_by_type("audio")
55
+ end
56
+
47
57
  def generate_image(prompt:, aspect_ratio: "landscape", output_dir: nil, **kwargs)
48
58
  entry = image_model_entry
49
59
  if entry.nil?
@@ -67,6 +77,53 @@ module Clacky
67
77
  )
68
78
  end
69
79
 
80
+ def generate_video(prompt:, aspect_ratio: "landscape", duration_seconds: nil, output_dir: nil, **kwargs)
81
+ entry = video_model_entry
82
+ if entry.nil?
83
+ return {
84
+ "success" => false,
85
+ "video" => nil,
86
+ "error" => "No video model configured. Add a model with type=video in settings.",
87
+ "error_type" => "not_configured",
88
+ "provider" => "",
89
+ "model" => "",
90
+ "prompt" => prompt
91
+ }
92
+ end
93
+
94
+ provider = build_provider_for(entry)
95
+ provider.generate_video(
96
+ prompt: prompt,
97
+ aspect_ratio: aspect_ratio,
98
+ duration_seconds: duration_seconds,
99
+ output_dir: output_dir,
100
+ **kwargs
101
+ )
102
+ end
103
+
104
+ def generate_speech(input:, voice: nil, output_dir: nil, **kwargs)
105
+ entry = audio_model_entry
106
+ if entry.nil?
107
+ return {
108
+ "success" => false,
109
+ "audio" => nil,
110
+ "error" => "No audio model configured. Add a model with type=audio in settings.",
111
+ "error_type" => "not_configured",
112
+ "provider" => "",
113
+ "model" => "",
114
+ "input" => input
115
+ }
116
+ end
117
+
118
+ provider = build_provider_for(entry)
119
+ provider.generate_speech(
120
+ input: input,
121
+ voice: voice,
122
+ output_dir: output_dir,
123
+ **kwargs
124
+ )
125
+ end
126
+
70
127
  # Pick the adapter class for a media model entry.
71
128
  #
72
129
  # Routing rules:
@@ -22,6 +22,12 @@ module Clacky
22
22
 
23
23
  DEFAULT_ASPECT = "landscape"
24
24
 
25
+ # Video aspect ratios accepted by the gateway's /videos/generations
26
+ # endpoint. The human-friendly labels map straight through; the gateway
27
+ # normalises to Veo's "16:9" / "9:16" internally.
28
+ VIDEO_ASPECTS = %w[landscape portrait].freeze
29
+ DEFAULT_VIDEO_DURATION = 8
30
+
25
31
  def generate_image(prompt:, aspect_ratio: DEFAULT_ASPECT, output_dir: nil, n: 1, **_kwargs)
26
32
  provider_id = Clacky::Providers.find_by_base_url(@base_url) || "custom"
27
33
  aspect = ASPECT_TO_SIZE.key?(aspect_ratio) ? aspect_ratio : DEFAULT_ASPECT
@@ -135,6 +141,143 @@ module Clacky
135
141
  )
136
142
  end
137
143
 
144
+ def generate_video(prompt:, aspect_ratio: DEFAULT_ASPECT, duration_seconds: nil, output_dir: nil, image: nil, **_kwargs)
145
+ provider_id = Clacky::Providers.find_by_base_url(@base_url) || "custom"
146
+ aspect = VIDEO_ASPECTS.include?(aspect_ratio) ? aspect_ratio : DEFAULT_ASPECT
147
+ duration = duration_seconds.to_i
148
+ duration = DEFAULT_VIDEO_DURATION if duration <= 0
149
+
150
+ if prompt.to_s.strip.empty?
151
+ return video_error_response(
152
+ error: "Prompt is required and must be a non-empty string",
153
+ error_type: "invalid_argument", provider: provider_id, aspect_ratio: aspect
154
+ )
155
+ end
156
+ if @api_key.to_s.empty?
157
+ return video_error_response(
158
+ error: "api_key not configured for video model '#{@model}'",
159
+ error_type: "auth_required", provider: provider_id, prompt: prompt, aspect_ratio: aspect
160
+ )
161
+ end
162
+
163
+ payload = { model: @model, prompt: prompt, aspect_ratio: aspect, duration_seconds: duration }
164
+ payload[:image] = image if image.is_a?(Hash) && image["b64_json"]
165
+
166
+ begin
167
+ response = video_connection.post("videos/generations") do |req|
168
+ req.headers["Content-Type"] = "application/json"
169
+ req.headers["Authorization"] = "Bearer #{@api_key}"
170
+ req.body = JSON.generate(payload)
171
+ end
172
+ rescue Faraday::Error => e
173
+ return video_error_response(
174
+ error: "HTTP request failed: #{e.message}",
175
+ error_type: "network_error", provider: provider_id, prompt: prompt, aspect_ratio: aspect
176
+ )
177
+ end
178
+
179
+ unless response.success?
180
+ return video_error_response(
181
+ error: "Upstream #{response.status}: #{truncate(response.body, 500)}",
182
+ error_type: "api_error", provider: provider_id, prompt: prompt, aspect_ratio: aspect
183
+ )
184
+ end
185
+
186
+ body = parse_json(response.body)
187
+ return video_error_response(
188
+ error: "Invalid JSON response from upstream",
189
+ error_type: "invalid_response", provider: provider_id, prompt: prompt, aspect_ratio: aspect
190
+ ) unless body.is_a?(Hash)
191
+
192
+ first = (body["data"] || []).first
193
+ if first.nil? || first["b64_json"].to_s.empty?
194
+ return video_error_response(
195
+ error: "Upstream returned no video data",
196
+ error_type: "empty_response", provider: provider_id, prompt: prompt, aspect_ratio: aspect
197
+ )
198
+ end
199
+
200
+ path = save_b64_video(first["b64_json"], output_dir: output_dir || Dir.pwd, prefix: "vid")
201
+ video_success_response(
202
+ video: path, prompt: prompt, aspect_ratio: aspect, provider: provider_id,
203
+ extra: {
204
+ "duration_seconds" => duration,
205
+ "usage" => body["usage"],
206
+ "cost_usd" => body["cost_usd"]
207
+ }.compact
208
+ )
209
+ end
210
+
211
+ def generate_speech(input:, voice: nil, output_dir: nil, **_kwargs)
212
+ provider_id = Clacky::Providers.find_by_base_url(@base_url) || "custom"
213
+
214
+ if input.to_s.strip.empty?
215
+ return audio_error_response(
216
+ error: "input is required and must be a non-empty string",
217
+ error_type: "invalid_argument", provider: provider_id, voice: voice.to_s
218
+ )
219
+ end
220
+ if @api_key.to_s.empty?
221
+ return audio_error_response(
222
+ error: "api_key not configured for audio model '#{@model}'",
223
+ error_type: "auth_required", provider: provider_id, input: input, voice: voice.to_s
224
+ )
225
+ end
226
+
227
+ payload = { model: @model, input: input }
228
+ payload[:voice] = voice if voice && !voice.to_s.strip.empty?
229
+
230
+ begin
231
+ response = audio_connection.post("audio/speech") do |req|
232
+ req.headers["Content-Type"] = "application/json"
233
+ req.headers["Authorization"] = "Bearer #{@api_key}"
234
+ req.body = JSON.generate(payload)
235
+ end
236
+ rescue Faraday::Error => e
237
+ return audio_error_response(
238
+ error: "HTTP request failed: #{e.message}",
239
+ error_type: "network_error", provider: provider_id, input: input, voice: voice.to_s
240
+ )
241
+ end
242
+
243
+ unless response.success?
244
+ return audio_error_response(
245
+ error: "Upstream #{response.status}: #{truncate(response.body, 500)}",
246
+ error_type: "api_error", provider: provider_id, input: input, voice: voice.to_s
247
+ )
248
+ end
249
+
250
+ body = parse_json(response.body)
251
+ return audio_error_response(
252
+ error: "Invalid JSON response from upstream",
253
+ error_type: "invalid_response", provider: provider_id, input: input, voice: voice.to_s
254
+ ) unless body.is_a?(Hash)
255
+
256
+ first = (body["data"] || []).first
257
+ if first.nil? || first["b64_json"].to_s.empty?
258
+ return audio_error_response(
259
+ error: "Upstream returned no audio data",
260
+ error_type: "empty_response", provider: provider_id, input: input, voice: voice.to_s
261
+ )
262
+ end
263
+
264
+ ext = case first["mime_type"].to_s
265
+ when "audio/mpeg", "audio/mp3" then "mp3"
266
+ when "audio/ogg" then "ogg"
267
+ else "wav"
268
+ end
269
+
270
+ path = save_b64_audio(first["b64_json"], output_dir: output_dir || Dir.pwd, prefix: "tts", extension: ext)
271
+ audio_success_response(
272
+ audio: path, input: input, voice: body["voice"] || voice.to_s, provider: provider_id,
273
+ extra: {
274
+ "mime_type" => first["mime_type"],
275
+ "usage" => body["usage"],
276
+ "cost_usd" => body["cost_usd"]
277
+ }.compact
278
+ )
279
+ end
280
+
138
281
  private def connection
139
282
  Faraday.new(url: normalized_base_url) do |f|
140
283
  f.options.timeout = 240
@@ -142,6 +285,23 @@ module Clacky
142
285
  end
143
286
  end
144
287
 
288
+ # Video generation runs the gateway's submit+poll cycle inside one
289
+ # request, which can take several minutes; give it a much longer read
290
+ # timeout than the image path.
291
+ private def video_connection
292
+ Faraday.new(url: normalized_base_url) do |f|
293
+ f.options.timeout = 600
294
+ f.options.open_timeout = 10
295
+ end
296
+ end
297
+
298
+ private def audio_connection
299
+ Faraday.new(url: normalized_base_url) do |f|
300
+ f.options.timeout = 120
301
+ f.options.open_timeout = 10
302
+ end
303
+ end
304
+
145
305
  private def gemini_family?(model_name)
146
306
  model_name.to_s.match?(/gemini|imagen/i)
147
307
  end
@@ -150,11 +150,6 @@ module Clacky
150
150
  @messages.find { |m| m[:subagent_instructions] }
151
151
  end
152
152
 
153
- # Return all messages where task_id <= given id (Time Machine support).
154
- def for_task(task_id)
155
- @messages.select { |m| !m[:task_id] || m[:task_id] <= task_id }
156
- end
157
-
158
153
  # ─────────────────────────────────────────────
159
154
  # Size helpers
160
155
  # ─────────────────────────────────────────────
@@ -191,8 +186,18 @@ module Clacky
191
186
  # can't fire when the previous turns came from a provider that keeps
192
187
  # thinking inline (e.g. MiniMax: <think>...</think> in content), so
193
188
  # this bypass lets us recover on the retry without a server restart.
194
- def to_api(force_reasoning_content_pad: false)
195
- msgs = @messages.map { |m| strip_for_api(m) }
189
+ # Convert to API-ready messages. When `task_chain` is given (a Set of
190
+ # task IDs forming the active task's ancestor chain), messages tagged with
191
+ # a task_id outside that chain are dropped first — this is the Time Machine
192
+ # path, ensuring undone/sibling-branch turns never reach the LLM. Messages
193
+ # without a task_id (system / injected context) are always kept.
194
+ def to_api(force_reasoning_content_pad: false, task_chain: nil)
195
+ source = if task_chain
196
+ @messages.select { |m| !m[:task_id] || task_chain.include?(m[:task_id]) }
197
+ else
198
+ @messages
199
+ end
200
+ msgs = source.map { |m| strip_for_api(m) }
196
201
  msgs = repair_tool_call_pairing(msgs)
197
202
  ensure_reasoning_content_consistency(msgs, force: force_reasoning_content_pad)
198
203
  end
@@ -60,6 +60,34 @@ module Clacky
60
60
  "or-gpt-image-2" => "GPT Image 2"
61
61
  },
62
62
  "default_image_model" => "or-gpt-image-2",
63
+ # Video generation models served by the openclacky gateway, which
64
+ # routes them to Vertex AI Veo (async predictLongRunning under the
65
+ # hood; the gateway hides the polling and returns the MP4 inline).
66
+ "video_models" => [
67
+ "or-veo-3-1",
68
+ "or-veo-3-1-fast",
69
+ "or-veo-3",
70
+ "or-veo-3-fast"
71
+ ],
72
+ "video_model_aliases" => {
73
+ "or-veo-3-1" => "Veo 3.1",
74
+ "or-veo-3-1-fast" => "Veo 3.1 Fast",
75
+ "or-veo-3" => "Veo 3",
76
+ "or-veo-3-fast" => "Veo 3 Fast"
77
+ },
78
+ "default_video_model" => "or-veo-3-1",
79
+ # Text-to-speech models served by the openclacky gateway, which
80
+ # routes them to Vertex AI Gemini 2.5 (responseModalities=["AUDIO"]).
81
+ # The gateway returns WAV inline as base64.
82
+ "audio_models" => [
83
+ "or-tts-gemini-2-5-flash",
84
+ "or-tts-gemini-2-5-pro"
85
+ ],
86
+ "audio_model_aliases" => {
87
+ "or-tts-gemini-2-5-flash" => "Gemini 2.5 Flash TTS",
88
+ "or-tts-gemini-2-5-pro" => "Gemini 2.5 Pro TTS"
89
+ },
90
+ "default_audio_model" => "or-tts-gemini-2-5-flash",
63
91
  # Default OCR sidecar — used when the primary model is text-only.
64
92
  # Candidates are derived from the provider's vision-capable models;
65
93
  # this just picks the cheap+fast default to surface in "auto" mode.
@@ -559,7 +559,9 @@ module Clacky
559
559
  @running = false
560
560
  end
561
561
 
562
- def stop(clear_screen: false)
562
+ # Clears the screen on exit by default — the Rich UI repaints fullscreen
563
+ # and leaves no useful scrollback to preserve.
564
+ def stop(clear_screen: true)
563
565
  @running = false
564
566
  @shell.stop
565
567
  RubyRich::Terminal.clear if clear_screen