openclacky 1.3.4 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +27 -0
  3. data/lib/clacky/agent/fake_tool_call_detector.rb +52 -0
  4. data/lib/clacky/agent/session_serializer.rb +3 -2
  5. data/lib/clacky/agent/tool_executor.rb +0 -12
  6. data/lib/clacky/agent.rb +74 -9
  7. data/lib/clacky/api_extension.rb +81 -0
  8. data/lib/clacky/api_extension_loader.rb +13 -1
  9. data/lib/clacky/client.rb +14 -17
  10. data/lib/clacky/default_agents/_panels/time_machine/panel.js +22 -0
  11. data/lib/clacky/default_agents/base_prompt.md +1 -0
  12. data/lib/clacky/default_extensions/meeting/handler.rb +331 -0
  13. data/lib/clacky/default_extensions/meeting/meeting.js +790 -0
  14. data/lib/clacky/default_extensions/meeting/meta.yml +3 -0
  15. data/lib/clacky/default_extensions/meeting/skills/meeting-summarizer/SKILL.md +44 -0
  16. data/lib/clacky/default_skills/media-gen/SKILL.md +63 -0
  17. data/lib/clacky/default_skills/media-gen/scripts/video_seq.sh +114 -0
  18. data/lib/clacky/json_ui_controller.rb +1 -1
  19. data/lib/clacky/media/base.rb +60 -0
  20. data/lib/clacky/media/dashscope.rb +385 -21
  21. data/lib/clacky/media/gemini.rb +9 -0
  22. data/lib/clacky/media/generator.rb +52 -0
  23. data/lib/clacky/media/openai_compat.rb +166 -0
  24. data/lib/clacky/null_ui_controller.rb +13 -0
  25. data/lib/clacky/plain_ui_controller.rb +1 -1
  26. data/lib/clacky/providers.rb +50 -2
  27. data/lib/clacky/rich_ui/rich_ui_controller.rb +1 -1
  28. data/lib/clacky/server/channel/channel_ui_controller.rb +1 -1
  29. data/lib/clacky/server/http_server.rb +144 -9
  30. data/lib/clacky/server/session_registry.rb +4 -2
  31. data/lib/clacky/server/web_ui_controller.rb +3 -2
  32. data/lib/clacky/skill_loader.rb +14 -2
  33. data/lib/clacky/tools/terminal/output_cleaner.rb +1 -3
  34. data/lib/clacky/tools/terminal.rb +0 -43
  35. data/lib/clacky/ui2/components/modal_component.rb +1 -1
  36. data/lib/clacky/ui2/ui_controller.rb +140 -31
  37. data/lib/clacky/ui_interface.rb +10 -1
  38. data/lib/clacky/utils/encoding.rb +25 -0
  39. data/lib/clacky/version.rb +1 -1
  40. data/lib/clacky/web/app.css +145 -22
  41. data/lib/clacky/web/components/onboard.js +1 -14
  42. data/lib/clacky/web/features/brand/view.js +8 -5
  43. data/lib/clacky/web/features/channels/store.js +1 -20
  44. data/lib/clacky/web/features/mcp/store.js +1 -20
  45. data/lib/clacky/web/features/profile/store.js +1 -13
  46. data/lib/clacky/web/features/profile/view.js +16 -4
  47. data/lib/clacky/web/features/skills/store.js +6 -21
  48. data/lib/clacky/web/features/version/store.js +2 -0
  49. data/lib/clacky/web/i18n.js +24 -1
  50. data/lib/clacky/web/index.html +15 -0
  51. data/lib/clacky/web/sessions.js +141 -51
  52. data/lib/clacky/web/settings.js +34 -2
  53. data/lib/clacky/web/ws-dispatcher.js +11 -3
  54. data/lib/clacky.rb +12 -5
  55. metadata +8 -1
@@ -7,33 +7,48 @@ require_relative "base"
7
7
 
8
8
  module Clacky
9
9
  module Media
10
- # Alibaba DashScope (Qwen-Image) image generation provider.
10
+ # Alibaba DashScope (Qwen-Image / CosyVoice / HappyHorse) media generation provider.
11
11
  #
12
- # DashScope is NOT an OpenAI-compatible image API. It has its own
13
- # endpoint, request envelope and response schema:
14
- #
15
- # POST <host>/api/v1/services/aigc/multimodal-generation/generation
16
- # Authorization: Bearer <key>
17
- # { "model": "qwen-image-2.0-pro",
18
- # "input": { "messages": [ { "role": "user",
19
- # "content": [ { "text": "<prompt>" } ] } ] },
20
- # "parameters": { "size": "2048*2048", "n": 1,
21
- # "prompt_extend": true, "watermark": false } }
22
- #
23
- # => { "output": { "choices": [ { "message": { "content": [
24
- # { "image": "https://...png?Expires=..." } ] } } ] },
25
- # "usage": { "width": 2048, "height": 2048, "image_count": 1 } }
26
- #
27
- # The image link expires after 24h, so we download and persist it under
28
- # <output_dir>/assets/generated/ (via Base#save_image_from_url), matching
29
- # the on-disk shape of the base64 providers.
12
+ # DashScope is NOT an OpenAI-compatible API. It has its own endpoint,
13
+ # request envelope and response schema for image, speech (TTS), and video generation.
30
14
  #
31
15
  # Routing: Generator sends any base_url under *.aliyuncs.com here. We
32
16
  # derive the real generation endpoint from the host so users can paste
33
17
  # the compatible-mode base_url (…/compatible-mode/v1) they already use
34
- # for Qwen text models and still get working image generation.
18
+ # for Qwen text models and still get working media generation.
19
+ #
20
+ # --- Endpoint migration TODO (2026-06) ---------------------------------
21
+ # Aliyun is gradually deprecating the shared `dashscope.aliyuncs.com`
22
+ # host in favor of the per-workspace MaaS domain
23
+ # `https://{WorkspaceId}.cn-beijing.maas.aliyuncs.com` (intl:
24
+ # `{WorkspaceId}.dashscope-intl.aliyuncs.com`). Docs have already moved
25
+ # to the new domain; the old host still works for most models but is
26
+ # expected to be sunset eventually.
27
+ #
28
+ # Current stance: keep accepting the old shared host as the default
29
+ # (zero-config for users + compatibility with third-party aggregators
30
+ # that don't use aliyuncs.com at all). The new MaaS domain already
31
+ # works today via endpoint_base derivation. Non-real-time TTS
32
+ # (qwen3-tts) does NOT work on the shared host and already emits a
33
+ # hint pointing users at the MaaS domain — see the "url error" branch
34
+ # in generate_speech.
35
+ #
36
+ # Action when Aliyun announces the sunset of compatible-mode:
37
+ # 1. Flip the default expectation to the WorkspaceId MaaS domain.
38
+ # 2. Add a setup flow / docs explaining how to find WorkspaceId.
39
+ # 3. Keep accepting aggregator base_urls unchanged.
40
+ # Do NOT pre-emptively migrate before an official sunset notice — it
41
+ # would break zero-config UX and aggregator users for no current gain.
35
42
  class DashScope < Base
36
- GENERATION_PATH = "/api/v1/services/aigc/multimodal-generation/generation"
43
+ GENERATION_PATH = "/api/v1/services/aigc/multimodal-generation/generation"
44
+ SPEECH_PATH_COSY = "/api/v1/services/audio/tts/SpeechSynthesizer"
45
+ VIDEO_PATH = "/api/v1/services/aigc/video-generation/video-synthesis"
46
+ TASK_PATH = "/api/v1/tasks/"
47
+
48
+ # Default voice per TTS model family. CosyVoice defaults to longanyang;
49
+ # Qwen3-TTS defaults to Cherry (most common Chinese female voice).
50
+ DEFAULT_SPEECH_VOICE_COSY = "longanyang"
51
+ DEFAULT_SPEECH_VOICE_QWEN = "Cherry"
37
52
 
38
53
  # aspect_ratio -> "<width>*<height>" (DashScope uses '*' not 'x').
39
54
  # qwen-image-2.0 / -plus / -max share these recommended resolutions;
@@ -178,6 +193,314 @@ module Clacky
178
193
  )
179
194
  end
180
195
 
196
+ # Synthesizes speech (TTS) using Alibaba CosyVoice or Qwen3-TTS models (e.g. cosyvoice-v3-flash, qwen3-tts-flash).
197
+ # This is a synchronous call.
198
+ #
199
+ # @param input [String] the text to synthesize
200
+ # @param voice [String, nil] the voice name; defaults to "longanyang" for CosyVoice or "Cherry" for Qwen3-TTS
201
+ # @param output_dir [String, nil] the directory to save the output audio
202
+ # @param language_type [String, nil] language hint for Qwen3-TTS (default "Chinese"); ignored by CosyVoice
203
+ # @return [Hash] audio_success_response or audio_error_response
204
def generate_speech(input:, voice: nil, output_dir: nil, language_type: nil, **_kwargs)
  # Synthesizes `input` through the TTS endpoint selected by speech_endpoint,
  # downloads the resulting audio URL and returns audio_success_response
  # (or audio_error_response on any failure).
  #
  # A nil *or blank* `voice` falls back to default_speech_voice, and every
  # error raised after the voice has been resolved reports the voice that
  # was actually sent upstream (matching the success response).
  if input.to_s.strip.empty?
    return audio_error_response(
      error: "Input text is required and must be a non-empty string",
      error_type: "invalid_argument",
      provider: PROVIDER_ID,
      voice: voice.to_s
    )
  end

  if @api_key.to_s.empty?
    return audio_error_response(
      error: "api_key not configured for audio model '#{@model}'",
      error_type: "auth_required",
      provider: PROVIDER_ID,
      input: input,
      voice: voice.to_s
    )
  end

  # Pick endpoint and payload shape based on model family. An empty or
  # whitespace-only voice is treated as "not given" so it is never
  # forwarded upstream as an invalid voice name.
  endpoint = speech_endpoint
  chosen_voice = voice.to_s.strip.empty? ? default_speech_voice : voice
  payload = speech_payload(input: input, voice: chosen_voice, language_type: language_type)

  # Shared shape of every error response from here on.
  failure = lambda do |message, type|
    audio_error_response(
      error: message,
      error_type: type,
      provider: PROVIDER_ID,
      input: input,
      voice: chosen_voice.to_s
    )
  end

  begin
    response = connection.post(endpoint) do |req|
      req.headers["Content-Type"] = "application/json"
      req.headers["Authorization"] = "Bearer #{@api_key}"
      req.body = JSON.generate(payload)
    end
  rescue Faraday::Error => e
    return failure.call("HTTP request failed: #{e.message}", "network_error")
  end

  body = parse_json(response.body)
  return failure.call("Invalid JSON response from upstream", "invalid_response") unless body.is_a?(Hash)

  # Business-level errors are checked before the HTTP status so the
  # upstream code/message pair is surfaced instead of a raw body dump.
  if body["code"] && !body["code"].to_s.empty?
    err_msg = body["message"].to_s
    if err_msg.include?("url error") && @base_url.to_s.include?("dashscope.aliyuncs.com")
      err_msg += " (Note: Alibaba Model Studio non-real-time TTS does not support the public shared endpoint. " \
                 "Set the model's Base URL to your dedicated MaaS domain, e.g. " \
                 "https://{WorkspaceId}.cn-beijing.maas.aliyuncs.com)"
    end
    return failure.call("Upstream error #{body["code"]}: #{err_msg}", "api_error")
  end

  unless response.success?
    return failure.call("Upstream #{response.status}: #{truncate(response.body, 500)}", "api_error")
  end

  audio_url = body.dig("output", "audio", "url")
  return failure.call("Upstream returned no audio data", "empty_response") if audio_url.nil? || audio_url.empty?

  # Download the audio file and save it locally in the target output directory.
  local_path = save_image_from_url(audio_url, output_dir: output_dir || Dir.pwd, prefix: "tts", extension: "wav")
  if local_path.nil?
    return failure.call("Failed to download generated audio from #{audio_url}", "download_failed")
  end

  audio_success_response(
    audio: local_path,
    input: input,
    voice: chosen_voice,
    provider: PROVIDER_ID,
    extra: {
      "request_id" => body["request_id"]
    }.compact
  )
end
319
+
320
+ # Generates a video using Alibaba HappyHorse or Wanx models.
321
+ # This is a mandatory asynchronous API. We submit the task, and poll
322
+ # the task status until it succeeds, fails, or times out.
323
+ #
324
+ # @param prompt [String] the video prompt
325
+ # @param aspect_ratio [String] "landscape", "portrait", or "square"
326
+ # @param duration_seconds [Integer, nil] duration in seconds
327
+ # @param output_dir [String, nil] the directory to save the output video
328
+ # @return [Hash] video_success_response or video_error_response
329
def generate_video(prompt:, aspect_ratio: "landscape", duration_seconds: nil, output_dir: nil, **_kwargs)
  # Submits an asynchronous video-synthesis task, waits for it to reach a
  # terminal state (at most ~300 seconds of wall-clock time), downloads the
  # resulting MP4 and returns video_success_response; any failure is
  # returned as video_error_response.
  if prompt.to_s.strip.empty?
    return video_error_response(
      error: "Prompt is required and must be a non-empty string",
      error_type: "invalid_argument",
      provider: PROVIDER_ID,
      aspect_ratio: aspect_ratio
    )
  end

  # Shared shape of every error response from here on.
  failure = lambda do |message, type|
    video_error_response(
      error: message,
      error_type: type,
      provider: PROVIDER_ID,
      prompt: prompt,
      aspect_ratio: aspect_ratio
    )
  end

  if @api_key.to_s.empty?
    return failure.call("api_key not configured for video model '#{@model}'", "auth_required")
  end

  # Map aspect ratio names to ratio values; anything unrecognized is landscape.
  ratio = case aspect_ratio
          when "portrait" then "9:16"
          when "square" then "1:1"
          else "16:9"
          end

  # Ratio, resolution and the optional duration live under "parameters".
  payload = {
    model: @model,
    input: {
      prompt: prompt
    },
    parameters: {
      resolution: "720P",
      ratio: ratio
    }
  }
  payload[:parameters][:duration] = duration_seconds if duration_seconds

  begin
    # The 'X-DashScope-Async: enable' header requests asynchronous task submission.
    response = connection.post(VIDEO_PATH) do |req|
      req.headers["Content-Type"] = "application/json"
      req.headers["Authorization"] = "Bearer #{@api_key}"
      req.headers["X-DashScope-Async"] = "enable"
      req.body = JSON.generate(payload)
    end
  rescue Faraday::Error => e
    return failure.call("HTTP request failed: #{e.message}", "network_error")
  end

  body = parse_json(response.body)
  return failure.call("Invalid JSON response from upstream", "invalid_response") unless body.is_a?(Hash)

  if body["code"] && !body["code"].to_s.empty?
    return failure.call("Upstream error #{body["code"]}: #{body["message"]}", "api_error")
  end

  unless response.success?
    return failure.call("Upstream #{response.status}: #{truncate(response.body, 500)}", "api_error")
  end

  task_id = body.dig("output", "task_id")
  return failure.call("Upstream did not return a task_id", "empty_response") if task_id.nil? || task_id.empty?

  max_duration = 300
  interval = 5
  video_url, polling_err = await_video_task(task_id, max_duration: max_duration, interval: interval)
  if video_url.nil?
    return failure.call(polling_err || "Polling timed out after #{max_duration} seconds", "polling_failed")
  end

  # Download the final MP4 video file and save it locally.
  local_path = save_image_from_url(video_url, output_dir: output_dir || Dir.pwd, prefix: "vid", extension: "mp4")
  if local_path.nil?
    return failure.call("Failed to download generated video from #{video_url}", "download_failed")
  end

  video_success_response(
    video: local_path,
    prompt: prompt,
    aspect_ratio: aspect_ratio,
    provider: PROVIDER_ID,
    extra: {
      "request_id" => body["request_id"]
    }.compact
  )
end

# Polls the task endpoint until the task reaches a terminal state or
# `max_duration` seconds of wall-clock time have passed.
#
# @param task_id [String] task identifier returned by the submit call
# @param max_duration [Numeric] overall time budget in seconds
# @param interval [Numeric] pause between two polls in seconds
# @return [Array(String, nil), Array(nil, String), Array(nil, nil)]
#   [video_url, nil] on success, [nil, message] on a terminal failure,
#   [nil, nil] when the time budget ran out.
private def await_video_task(task_id, max_duration:, interval:)
  # A monotonic deadline also accounts for time spent inside slow poll
  # requests, not only for the sleeps between them.
  deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + max_duration

  while Process.clock_gettime(Process::CLOCK_MONOTONIC) < deadline
    begin
      task_resp = connection.get("#{TASK_PATH}#{task_id}") do |req|
        req.headers["Authorization"] = "Bearer #{@api_key}"
      end
    rescue Faraday::Error => e
      return [nil, "Polling request failed: #{e.message}"]
    end

    task_body = parse_json(task_resp.body)
    return [nil, "Invalid polling response JSON"] unless task_body.is_a?(Hash)

    task_output = task_body["output"] || {}
    status = task_output["task_status"]

    case status
    when "SUCCEEDED"
      url = task_output["video_url"]
      # A finished task without a URL is an upstream fault, not a timeout.
      return [nil, "Task succeeded but the response contained no video_url"] if url.to_s.empty?

      return [url, nil]
    when "FAILED"
      return [nil, "Task failed: #{task_output["message"] || 'Unknown error'}"]
    when "CANCELED"
      return [nil, "Task was canceled"]
    when "UNKNOWN"
      # Reported when the task no longer exists; waiting longer cannot help.
      return [nil, "Task status is UNKNOWN (the task may have expired or does not exist)"]
    end

    # Error envelopes (e.g. bad key, unknown task) carry a top-level code
    # and no task status; stop instead of polling until the deadline.
    if status.nil? && !task_body["code"].to_s.empty?
      return [nil, "Polling error #{task_body["code"]}: #{task_body["message"]}"]
    end

    sleep interval
  end

  [nil, nil]
end
503
+
181
504
  # qwen-image-max / qwen-image-plus accept only the fixed resolution set;
182
505
  # everything else (qwen-image-2.0 family, plain qwen-image) uses the 2.0
183
506
  # recommended sizes.
@@ -189,6 +512,47 @@ module Clacky
189
512
  end
190
513
  end
191
514
 
515
+ # CosyVoice models (cosyvoice-*, cosyvoice-v3-flash, etc.) use the
516
+ # dedicated TTS endpoint; Qwen3-TTS models (qwen3-tts-flash,
517
+ # qwen3-tts-instruct-flash) are served via the multimodal-generation
518
+ # endpoint despite being TTS — see Aliyun docs:
519
+ # https://help.aliyun.com/zh/model-studio/qwen-tts-api
520
+ #
521
+ # Matching is POSITIVE (by model-name pattern) so third-party
522
+ # aggregators that keep the official model names keep working, and
523
+ # unknown TTS models are not silently misrouted. Anything not
524
+ # recognized as Qwen3-TTS falls back to the CosyVoice endpoint for
525
+ # backward compatibility — every TTS model clacky supported before
526
+ # qwen3-tts was a CosyVoice model.
527
private def speech_endpoint
  # Returns GENERATION_PATH for Qwen3-TTS models and SPEECH_PATH_COSY for
  # everything else.
  #
  # The name must contain the token "qwen3-tts" delimited on the left by
  # the start of the string or one of "-", "_", "/" (so vendor-prefixed
  # names such as "qwen/qwen3-tts-flash" are recognized) and on the right
  # by "-" or the end of the string. \A / \z anchor the whole string;
  # ^ / $ would also match at embedded line breaks.
  model_name = @model.to_s
  if model_name.match?(%r{(\A|[-_/])qwen3-tts(-|\z)}i)
    GENERATION_PATH
  else
    SPEECH_PATH_COSY
  end
end
535
+
536
private def default_speech_voice
  # Models routed to the multimodal-generation endpoint get the Qwen
  # default voice; every other model gets the CosyVoice default.
  return DEFAULT_SPEECH_VOICE_QWEN if speech_endpoint == GENERATION_PATH

  DEFAULT_SPEECH_VOICE_COSY
end
539
+
540
+ # Each model family has its own payload shape. We branch on endpoint
541
+ # because the endpoint identity uniquely identifies the family here.
542
private def speech_payload(input:, voice:, language_type: nil)
  # Builds the request payload: { model:, input: { text:, voice:, ... } }.
  # The extra input fields depend on which endpoint the model is routed to.
  family_fields =
    if speech_endpoint == GENERATION_PATH
      # Multimodal-generation route: language_type, "Chinese" when the
      # caller passed nil or an empty value.
      { language_type: (language_type.to_s.empty? ? "Chinese" : language_type) }
    else
      # Dedicated TTS route: fixed WAV output at 24 kHz.
      { format: "wav", sample_rate: 24000 }
    end

  { model: @model, input: { text: input, voice: voice }.merge(family_fields) }
end
555
+
192
556
  # output.choices[].message.content[].image -> first image URL
193
557
  private def extract_image_url(body)
194
558
  choices = body.dig("output", "choices")
@@ -41,6 +41,15 @@ module Clacky
41
41
  aspect_ratio: aspect_ratio
42
42
  )
43
43
  end
44
+
45
def understand_video(video_base64:, mime_type:, prompt: nil, **_kwargs)
  # This adapter does not implement video understanding: it always returns
  # a "not_implemented" error response and never touches the video data.
  message = "Direct Google AI Studio video understanding is not supported. Use the openclacky gateway (base_url https://api.openclacky.com) with a video understanding model such as or-gemini-3-5-flash."

  video_understanding_error_response(
    error: message,
    error_type: "not_implemented",
    provider: "gemini-direct",
    prompt: prompt || ""
  )
end
44
53
  end
45
54
  end
46
55
  end
@@ -124,6 +124,58 @@ module Clacky
124
124
  )
125
125
  end
126
126
 
127
def stt_model_entry
  # Asks the agent config for the model entry registered under type "stt".
  model_type = "stt"
  @agent_config.find_model_by_type(model_type)
end
130
+
131
def video_understanding_model_entry
  # Asks the agent config for the model entry registered under type
  # "video_understanding".
  model_type = "video_understanding"
  @agent_config.find_model_by_type(model_type)
end
134
+
135
def generate_transcription(audio_base64:, mime_type:, **kwargs)
  # Delegates transcription to the provider built for the configured STT
  # model entry; extra keyword arguments are forwarded untouched. Without
  # a configured entry a "not_configured" failure hash is returned.
  stt_entry = stt_model_entry

  unless stt_entry.nil?
    adapter = build_provider_for(stt_entry)
    return adapter.generate_transcription(
      audio_base64: audio_base64,
      mime_type: mime_type,
      **kwargs
    )
  end

  {
    "success" => false,
    "text" => nil,
    "error" => "No STT model configured. Add a model with type=stt in settings.",
    "error_type" => "not_configured",
    "provider" => "",
    "model" => ""
  }
end
155
+
156
def understand_video(video_base64:, mime_type:, prompt: nil, **kwargs)
  # Delegates video understanding to the provider built for the configured
  # video_understanding model entry; extra keyword arguments are forwarded
  # untouched. Without a configured entry a "not_configured" failure hash
  # (echoing the caller's prompt) is returned.
  vu_entry = video_understanding_model_entry

  unless vu_entry.nil?
    adapter = build_provider_for(vu_entry)
    return adapter.understand_video(
      video_base64: video_base64,
      mime_type: mime_type,
      prompt: prompt,
      **kwargs
    )
  end

  {
    "success" => false,
    "analysis" => nil,
    "error" => "No video understanding model configured. Add a model with type=video_understanding in settings.",
    "error_type" => "not_configured",
    "provider" => "",
    "model" => "",
    "prompt" => prompt
  }
end
178
+
127
179
  # Pick the adapter class for a media model entry.
128
180
  #
129
181
  # Routing rules:
@@ -3,6 +3,7 @@
3
3
  require "faraday"
4
4
  require "json"
5
5
  require "base64"
6
+ require "securerandom"
6
7
  require_relative "base"
7
8
 
8
9
  module Clacky
@@ -296,6 +297,157 @@ module Clacky
296
297
  )
297
298
  end
298
299
 
300
def generate_transcription(audio_base64:, mime_type:, prompt: nil, **_kwargs)
  # Uploads base64-encoded audio as multipart/form-data to
  # "audio/transcriptions" and returns transcription_success_response with
  # the stripped "text" field (or transcription_error_response on failure).
  #
  # @param audio_base64 [String] base64-encoded audio bytes
  # @param mime_type [String] media type of the audio; parameters after ";" are ignored
  # @param prompt [String, nil] optional text sent as the "prompt" form field
  provider_id = Clacky::Providers.find_by_base_url(@base_url) || "custom"

  failure = lambda do |message, type|
    transcription_error_response(error: message, error_type: type, provider: provider_id)
  end

  if @api_key.to_s.empty?
    return failure.call("api_key not configured for STT model '#{@model}'", "auth_required")
  end

  # Validate the media type before it is interpolated into multipart
  # headers: a nil/blank value would otherwise raise, and CR/LF or quotes
  # would corrupt the part headers.
  media_type = mime_type.to_s.split(";").first.to_s.strip.downcase
  unless media_type.match?(%r{\A[\w.+-]+/[\w.+-]+\z})
    return failure.call(
      "mime_type must be a media type such as 'audio/wav' (got #{mime_type.inspect})",
      "invalid_argument"
    )
  end

  audio_data = Base64.decode64(audio_base64.to_s)
  if audio_data.empty?
    return failure.call("audio_base64 is required and must decode to non-empty audio", "invalid_argument")
  end

  boundary = "----FormBoundary#{SecureRandom.hex(8)}"
  body = stt_multipart_body(
    boundary: boundary,
    filename: "chunk.#{stt_upload_extension(media_type.split("/").last)}",
    media_type: media_type,
    audio_data: audio_data,
    prompt: prompt
  )

  begin
    response = stt_connection.post("audio/transcriptions") do |req|
      req.headers["Content-Type"] = "multipart/form-data; boundary=#{boundary}"
      req.headers["Authorization"] = "Bearer #{@api_key}"
      req.body = body
    end
  rescue Faraday::Error => e
    return failure.call("HTTP request failed: #{e.message}", "network_error")
  end

  unless response.success?
    return failure.call("Upstream #{response.status}: #{truncate(response.body, 500)}", "api_error")
  end

  parsed = parse_json(response.body)
  return failure.call("Invalid JSON response from upstream", "invalid_response") unless parsed.is_a?(Hash)

  transcription_success_response(
    text: parsed["text"].to_s.strip,
    provider: provider_id,
    extra: {
      "usage" => parsed["usage"],
      "cost_usd" => parsed["cost_usd"]
    }.compact
  )
end

# Maps a lower-cased MIME subtype to the file extension used for the
# uploaded chunk, so the file name identifies the audio format
# ("mpeg" -> "mp3", "x-wav" -> "wav", "x-m4a" -> "m4a", ...).
private def stt_upload_extension(subtype)
  extension =
    case subtype
    when "mpeg" then "mp3"
    when "wave", "vnd.wave" then "wav"
    else subtype.delete_prefix("x-").gsub(/[^a-z0-9]/, "")
    end
  extension.empty? ? "bin" : extension
end

# Assembles the multipart/form-data body (file, model and optional prompt
# parts) as a binary string so UTF-8 text parts and raw audio bytes can be
# concatenated without encoding clashes.
private def stt_multipart_body(boundary:, filename:, media_type:, audio_data:, prompt:)
  body = "".b
  body << "--#{boundary}\r\n".b
  body << "Content-Disposition: form-data; name=\"file\"; filename=\"#{filename}\"\r\n".b
  body << "Content-Type: #{media_type}\r\n\r\n".b
  body << audio_data.b
  body << "\r\n--#{boundary}\r\n".b
  body << "Content-Disposition: form-data; name=\"model\"\r\n\r\n".b
  body << @model.to_s.b
  unless prompt.to_s.strip.empty?
    body << "\r\n--#{boundary}\r\n".b
    body << "Content-Disposition: form-data; name=\"prompt\"\r\n\r\n".b
    body << prompt.to_s.strip.b
  end
  body << "\r\n--#{boundary}--\r\n".b
  body
end
369
+
370
def understand_video(video_base64:, mime_type:, prompt: nil, **_kwargs)
  # Sends the base64 media as a data URL, together with a text prompt, to
  # "chat/completions" and returns the first choice's message content as
  # the analysis. A blank prompt is replaced by a generic description
  # request before anything else happens, so error responses echo it too.
  provider_id = Clacky::Providers.find_by_base_url(@base_url) || "custom"
  prompt = "Describe what you see in this frame." if prompt.to_s.strip.empty?

  reject = lambda do |message, kind|
    video_understanding_error_response(
      error: message, error_type: kind, provider: provider_id, prompt: prompt
    )
  end

  if @api_key.to_s.empty?
    return reject.call("api_key not configured for video understanding model '#{@model}'", "auth_required")
  end

  # NOTE(review): the media is attached as an "image_url" content part even
  # though the parameter is named video_base64 — confirm upstream accepts this.
  user_message = {
    role: "user",
    content: [
      { type: "text", text: prompt },
      { type: "image_url", image_url: { url: "data:#{mime_type};base64,#{video_base64}" } }
    ]
  }
  payload = { model: @model, messages: [user_message] }

  response =
    begin
      vu_connection.post("chat/completions") do |http|
        http.headers["Content-Type"] = "application/json"
        http.headers["Authorization"] = "Bearer #{@api_key}"
        http.body = JSON.generate(payload)
      end
    rescue Faraday::Error => e
      return reject.call("HTTP request failed: #{e.message}", "network_error")
    end

  unless response.success?
    return reject.call("Upstream #{response.status}: #{truncate(response.body, 500)}", "api_error")
  end

  parsed = parse_json(response.body)
  return reject.call("Invalid JSON response from upstream", "invalid_response") unless parsed.is_a?(Hash)

  choices = parsed["choices"]
  return reject.call("Upstream returned no content", "empty_response") if choices.nil? || choices.empty?

  analysis = choices.first.dig("message", "content").to_s.strip
  return reject.call("Upstream returned empty analysis", "empty_response") if analysis.empty?

  video_understanding_success_response(
    analysis: analysis,
    prompt: prompt,
    provider: provider_id,
    extra: {
      "usage" => parsed["usage"],
      "cost_usd" => parsed["cost_usd"]
    }.compact
  )
end
450
+
299
451
  private def connection
300
452
  Faraday.new(url: normalized_base_url) do |f|
301
453
  f.options.timeout = 240
@@ -320,6 +472,20 @@ module Clacky
320
472
  end
321
473
  end
322
474
 
475
# Faraday connection for speech-to-text requests: 10 s to open the
# connection, 30 s overall request timeout.
private def stt_connection
  Faraday.new(url: normalized_base_url) do |faraday|
    faraday.options.open_timeout = 10
    faraday.options.timeout = 30
  end
end
481
+
482
# Faraday connection for video-understanding requests: 10 s to open the
# connection, 60 s overall request timeout.
private def vu_connection
  Faraday.new(url: normalized_base_url) do |faraday|
    faraday.options.open_timeout = 10
    faraday.options.timeout = 60
  end
end
488
+
323
489
  private def gemini_family?(model_name)
324
490
  model_name.to_s.match?(/gemini|imagen/i)
325
491
  end