RubyGems - openclacky - Versions diffs - 1.3.4 → 1.3.5 - Mend

openclacky 1.3.4 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +27 -0
data/lib/clacky/agent/fake_tool_call_detector.rb +52 -0
data/lib/clacky/agent/session_serializer.rb +3 -2
data/lib/clacky/agent/tool_executor.rb +0 -12
data/lib/clacky/agent.rb +74 -9
data/lib/clacky/api_extension.rb +81 -0
data/lib/clacky/api_extension_loader.rb +13 -1
data/lib/clacky/client.rb +14 -17
data/lib/clacky/default_agents/_panels/time_machine/panel.js +22 -0
data/lib/clacky/default_agents/base_prompt.md +1 -0
data/lib/clacky/default_extensions/meeting/handler.rb +331 -0
data/lib/clacky/default_extensions/meeting/meeting.js +790 -0
data/lib/clacky/default_extensions/meeting/meta.yml +3 -0
data/lib/clacky/default_extensions/meeting/skills/meeting-summarizer/SKILL.md +44 -0
data/lib/clacky/default_skills/media-gen/SKILL.md +63 -0
data/lib/clacky/default_skills/media-gen/scripts/video_seq.sh +114 -0
data/lib/clacky/json_ui_controller.rb +1 -1
data/lib/clacky/media/base.rb +60 -0
data/lib/clacky/media/dashscope.rb +385 -21
data/lib/clacky/media/gemini.rb +9 -0
data/lib/clacky/media/generator.rb +52 -0
data/lib/clacky/media/openai_compat.rb +166 -0
data/lib/clacky/null_ui_controller.rb +13 -0
data/lib/clacky/plain_ui_controller.rb +1 -1
data/lib/clacky/providers.rb +50 -2
data/lib/clacky/rich_ui/rich_ui_controller.rb +1 -1
data/lib/clacky/server/channel/channel_ui_controller.rb +1 -1
data/lib/clacky/server/http_server.rb +144 -9
data/lib/clacky/server/session_registry.rb +4 -2
data/lib/clacky/server/web_ui_controller.rb +3 -2
data/lib/clacky/skill_loader.rb +14 -2
data/lib/clacky/tools/terminal/output_cleaner.rb +1 -3
data/lib/clacky/tools/terminal.rb +0 -43
data/lib/clacky/ui2/components/modal_component.rb +1 -1
data/lib/clacky/ui2/ui_controller.rb +140 -31
data/lib/clacky/ui_interface.rb +10 -1
data/lib/clacky/utils/encoding.rb +25 -0
data/lib/clacky/version.rb +1 -1
data/lib/clacky/web/app.css +145 -22
data/lib/clacky/web/components/onboard.js +1 -14
data/lib/clacky/web/features/brand/view.js +8 -5
data/lib/clacky/web/features/channels/store.js +1 -20
data/lib/clacky/web/features/mcp/store.js +1 -20
data/lib/clacky/web/features/profile/store.js +1 -13
data/lib/clacky/web/features/profile/view.js +16 -4
data/lib/clacky/web/features/skills/store.js +6 -21
data/lib/clacky/web/features/version/store.js +2 -0
data/lib/clacky/web/i18n.js +24 -1
data/lib/clacky/web/index.html +15 -0
data/lib/clacky/web/sessions.js +141 -51
data/lib/clacky/web/settings.js +34 -2
data/lib/clacky/web/ws-dispatcher.js +11 -3
data/lib/clacky.rb +12 -5
metadata +8 -1

data/lib/clacky/media/dashscope.rb CHANGED Viewed

@@ -7,33 +7,48 @@ require_relative "base"
 module Clacky
   module Media
-    # Alibaba DashScope (Qwen-Image) image generation provider.
+    # Alibaba DashScope (Qwen-Image / CosyVoice / HappyHorse) media generation provider.
     #
-    # DashScope is NOT an OpenAI-compatible image API. It has its own
-    # endpoint, request envelope and response schema:
-    #
-    #   POST <host>/api/v1/services/aigc/multimodal-generation/generation
-    #   Authorization: Bearer <key>
-    #   { "model": "qwen-image-2.0-pro",
-    #     "input":      { "messages": [ { "role": "user",
-    #                                     "content": [ { "text": "<prompt>" } ] } ] },
-    #     "parameters": { "size": "2048*2048", "n": 1,
-    #                     "prompt_extend": true, "watermark": false } }
-    #
-    #   => { "output": { "choices": [ { "message": { "content": [
-    #          { "image": "https://...png?Expires=..." } ] } } ] },
-    #        "usage": { "width": 2048, "height": 2048, "image_count": 1 } }
-    #
-    # The image link expires after 24h, so we download and persist it under
-    # <output_dir>/assets/generated/ (via Base#save_image_from_url), matching
-    # the on-disk shape of the base64 providers.
+    # DashScope is NOT an OpenAI-compatible API. It has its own endpoint,
+    # request envelope and response schema for image, speech (TTS), and video generation.
     #
     # Routing: Generator sends any base_url under *.aliyuncs.com here. We
     # derive the real generation endpoint from the host so users can paste
     # the compatible-mode base_url (…/compatible-mode/v1) they already use
-    # for Qwen text models and still get working image generation.
+    # for Qwen text models and still get working media generation.
+    #
+    # --- Endpoint migration TODO (2026-06) ---------------------------------
+    # Aliyun is gradually deprecating the shared `dashscope.aliyuncs.com`
+    # host in favor of the per-workspace MaaS domain
+    # `https://{WorkspaceId}.cn-beijing.maas.aliyuncs.com` (intl:
+    # `{WorkspaceId}.dashscope-intl.aliyuncs.com`). Docs have already moved
+    # to the new domain; the old host still works for most models but is
+    # expected to be sunset eventually.
+    #
+    # Current stance: keep accepting the old shared host as the default
+    # (zero-config for users + compatibility with third-party aggregators
+    # that don't use aliyuncs.com at all). The new MaaS domain already
+    # works today via endpoint_base derivation. Non-real-time TTS
+    # (qwen3-tts) does NOT work on the shared host and already emits a
+    # hint pointing users at the MaaS domain — see the "url error" branch
+    # in generate_speech.
+    #
+    # Action when Aliyun announces the sunset of compatible-mode:
+    #   1. Flip the default expectation to the WorkspaceId MaaS domain.
+    #   2. Add a setup flow / docs explaining how to find WorkspaceId.
+    #   3. Keep accepting aggregator base_urls unchanged.
+    # Do NOT pre-emptively migrate before an official sunset notice — it
+    # would break zero-config UX and aggregator users for no current gain.
     class DashScope < Base
-      GENERATION_PATH = "/api/v1/services/aigc/multimodal-generation/generation"
+      GENERATION_PATH   = "/api/v1/services/aigc/multimodal-generation/generation"
+      SPEECH_PATH_COSY  = "/api/v1/services/audio/tts/SpeechSynthesizer"
+      VIDEO_PATH        = "/api/v1/services/aigc/video-generation/video-synthesis"
+      TASK_PATH         = "/api/v1/tasks/"
+      # Default voice per TTS model family. CosyVoice defaults to longanyang;
+      # Qwen3-TTS defaults to Cherry (most common Chinese female voice).
+      DEFAULT_SPEECH_VOICE_COSY = "longanyang"
+      DEFAULT_SPEECH_VOICE_QWEN = "Cherry"
       # aspect_ratio -> "<width>*<height>" (DashScope uses '*' not 'x').
       # qwen-image-2.0 / -plus / -max share these recommended resolutions;
@@ -178,6 +193,314 @@ module Clacky
         )
       end
+      # Synthesizes speech (TTS) using Alibaba CosyVoice models (e.g. cosyvoice-v3-flash).
+      # This is a synchronous call.
+      #
+      # @param input [String] the text to synthesize
+      # @param voice [String, nil] the voice name; defaults to "longanyang" for CosyVoice or "Cherry" for Qwen3-TTS
+      # @param output_dir [String, nil] the directory to save the output audio
+      # @param language_type [String, nil] language hint for Qwen3-TTS (default "Chinese"); ignored by CosyVoice
+      # @return [Hash] audio_success_response or audio_error_response
+      def generate_speech(input:, voice: nil, output_dir: nil, language_type: nil, **_kwargs)
+        if input.to_s.strip.empty?
+          return audio_error_response(
+            error: "Input text is required and must be a non-empty string",
+            error_type: "invalid_argument",
+            provider: PROVIDER_ID,
+            voice: voice.to_s
+          )
+        end
+        if @api_key.to_s.empty?
+          return audio_error_response(
+            error: "api_key not configured for audio model '#{@model}'",
+            error_type: "auth_required",
+            provider: PROVIDER_ID,
+            input: input,
+            voice: voice.to_s
+          )
+        end
+        # Pick endpoint and payload shape based on model family. CosyVoice
+        # uses the dedicated TTS endpoint and accepts format/sample_rate;
+        # Qwen3-TTS is a multimodal-generation model and expects
+        # language_type instead.
+        endpoint     = speech_endpoint
+        chosen_voice = voice || default_speech_voice
+        payload      = speech_payload(input: input, voice: chosen_voice, language_type: language_type)
+        begin
+          response = connection.post(endpoint) do |req|
+            req.headers["Content-Type"]  = "application/json"
+            req.headers["Authorization"] = "Bearer #{@api_key}"
+            req.body = JSON.generate(payload)
+          end
+        rescue Faraday::Error => e
+          return audio_error_response(
+            error: "HTTP request failed: #{e.message}",
+            error_type: "network_error",
+            provider: PROVIDER_ID,
+            input: input,
+            voice: voice.to_s
+          )
+        end
+        body = parse_json(response.body)
+        unless body.is_a?(Hash)
+          return audio_error_response(
+            error: "Invalid JSON response from upstream",
+            error_type: "invalid_response",
+            provider: PROVIDER_ID,
+            input: input,
+            voice: voice.to_s
+          )
+        end
+        # Inspect any business level errors from DashScope
+        if body["code"] && !body["code"].to_s.empty?
+          err_msg = body["message"].to_s
+          if err_msg.include?("url error") && @base_url.to_s.include?("dashscope.aliyuncs.com")
+            err_msg += " (Note: Alibaba Model Studio non-real-time TTS does not support the public shared endpoint. " \
+                       "Set the model's Base URL to your dedicated MaaS domain, e.g. " \
+                       "https://{WorkspaceId}.cn-beijing.maas.aliyuncs.com)"
+          end
+          return audio_error_response(
+            error: "Upstream error #{body["code"]}: #{err_msg}",
+            error_type: "api_error",
+            provider: PROVIDER_ID,
+            input: input,
+            voice: voice.to_s
+          )
+        end
+        unless response.success?
+          return audio_error_response(
+            error: "Upstream #{response.status}: #{truncate(response.body, 500)}",
+            error_type: "api_error",
+            provider: PROVIDER_ID,
+            input: input,
+            voice: voice.to_s
+          )
+        end
+        audio_url = body.dig("output", "audio", "url")
+        if audio_url.nil? || audio_url.empty?
+          return audio_error_response(
+            error: "Upstream returned no audio data",
+            error_type: "empty_response",
+            provider: PROVIDER_ID,
+            input: input,
+            voice: voice.to_s
+          )
+        end
+        # Download the audio file from OSS and save it locally in the target output directory
+        local_path = save_image_from_url(audio_url, output_dir: output_dir || Dir.pwd, prefix: "tts", extension: "wav")
+        if local_path.nil?
+          return audio_error_response(
+            error: "Failed to download generated audio from #{audio_url}",
+            error_type: "download_failed",
+            provider: PROVIDER_ID,
+            input: input,
+            voice: voice.to_s
+          )
+        end
+        audio_success_response(
+          audio: local_path,
+          input: input,
+          voice: chosen_voice,
+          provider: PROVIDER_ID,
+          extra: {
+            "request_id" => body["request_id"]
+          }.compact
+        )
+      end
+      # Generates a video using Alibaba HappyHorse or Wanx models.
+      # This is a mandatory asynchronous API. We submit the task, and poll
+      # the task status until it succeeds, fails, or times out.
+      #
+      # @param prompt [String] the video prompt
+      # @param aspect_ratio [String] "landscape", "portrait", or "square"
+      # @param duration_seconds [Integer, nil] duration in seconds
+      # @param output_dir [String, nil] the directory to save the output video
+      # @return [Hash] video_success_response or video_error_response
+      def generate_video(prompt:, aspect_ratio: "landscape", duration_seconds: nil, output_dir: nil, **_kwargs)
+        if prompt.to_s.strip.empty?
+          return video_error_response(
+            error: "Prompt is required and must be a non-empty string",
+            error_type: "invalid_argument",
+            provider: PROVIDER_ID,
+            aspect_ratio: aspect_ratio
+          )
+        end
+        if @api_key.to_s.empty?
+          return video_error_response(
+            error: "api_key not configured for video model '#{@model}'",
+            error_type: "auth_required",
+            provider: PROVIDER_ID,
+            prompt: prompt,
+            aspect_ratio: aspect_ratio
+          )
+        end
+        # Map aspect ratio strings to Alibaba's ratio values (e.g. 16:9).
+        ratio = case aspect_ratio
+                when "portrait" then "9:16"
+                when "square"   then "1:1"
+                else "16:9"
+                end
+        # Construct payload. Ratio and resolution are placed under the "parameters" key.
+        payload = {
+          model: @model,
+          input: {
+            prompt: prompt
+          },
+          parameters: {
+            resolution: "720P",
+            ratio: ratio
+          }
+        }
+        payload[:parameters][:duration] = duration_seconds if duration_seconds
+        begin
+          # Submit the task. Alibaba requires 'X-DashScope-Async: enable' header for video synthesis.
+          response = connection.post(VIDEO_PATH) do |req|
+            req.headers["Content-Type"]      = "application/json"
+            req.headers["Authorization"]     = "Bearer #{@api_key}"
+            req.headers["X-DashScope-Async"] = "enable"
+            req.body = JSON.generate(payload)
+          end
+        rescue Faraday::Error => e
+          return video_error_response(
+            error: "HTTP request failed: #{e.message}",
+            error_type: "network_error",
+            provider: PROVIDER_ID,
+            prompt: prompt,
+            aspect_ratio: aspect_ratio
+          )
+        end
+        body = parse_json(response.body)
+        unless body.is_a?(Hash)
+          return video_error_response(
+            error: "Invalid JSON response from upstream",
+            error_type: "invalid_response",
+            provider: PROVIDER_ID,
+            prompt: prompt,
+            aspect_ratio: aspect_ratio
+          )
+        end
+        if body["code"] && !body["code"].to_s.empty?
+          return video_error_response(
+            error: "Upstream error #{body["code"]}: #{body["message"]}",
+            error_type: "api_error",
+            provider: PROVIDER_ID,
+            prompt: prompt,
+            aspect_ratio: aspect_ratio
+          )
+        end
+        unless response.success?
+          return video_error_response(
+            error: "Upstream #{response.status}: #{truncate(response.body, 500)}",
+            error_type: "api_error",
+            provider: PROVIDER_ID,
+            prompt: prompt,
+            aspect_ratio: aspect_ratio
+          )
+        end
+        task_id = body.dig("output", "task_id")
+        if task_id.nil? || task_id.empty?
+          return video_error_response(
+            error: "Upstream did not return a task_id",
+            error_type: "empty_response",
+            provider: PROVIDER_ID,
+            prompt: prompt,
+            aspect_ratio: aspect_ratio
+          )
+        end
+        # Poll the task status asynchronously. Alibaba limits video tasks, so we check
+        # status at interval blocks until completion or timeout.
+        max_duration = 300
+        interval     = 5
+        elapsed      = 0
+        video_url    = nil
+        polling_err  = nil
+        while elapsed < max_duration
+          begin
+            task_resp = connection.get("#{TASK_PATH}#{task_id}") do |req|
+              req.headers["Authorization"] = "Bearer #{@api_key}"
+            end
+          rescue Faraday::Error => e
+            polling_err = "Polling request failed: #{e.message}"
+            break
+          end
+          task_body = parse_json(task_resp.body)
+          unless task_body.is_a?(Hash)
+            polling_err = "Invalid polling response JSON"
+            break
+          end
+          task_output = task_body["output"] || {}
+          status = task_output["task_status"]
+          if status == "SUCCEEDED"
+            video_url = task_output["video_url"]
+            break
+          elsif status == "FAILED"
+            polling_err = "Task failed: #{task_output["message"] || 'Unknown error'}"
+            break
+          elsif status == "CANCELED"
+            polling_err = "Task was canceled"
+            break
+          end
+          sleep interval
+          elapsed += interval
+        end
+        if video_url.nil?
+          return video_error_response(
+            error: polling_err || "Polling timed out after #{max_duration} seconds",
+            error_type: "polling_failed",
+            provider: PROVIDER_ID,
+            prompt: prompt,
+            aspect_ratio: aspect_ratio
+          )
+        end
+        # Download the final MP4 video file and save it locally
+        local_path = save_image_from_url(video_url, output_dir: output_dir || Dir.pwd, prefix: "vid", extension: "mp4")
+        if local_path.nil?
+          return video_error_response(
+            error: "Failed to download generated video from #{video_url}",
+            error_type: "download_failed",
+            provider: PROVIDER_ID,
+            prompt: prompt,
+            aspect_ratio: aspect_ratio
+          )
+        end
+        video_success_response(
+          video: local_path,
+          prompt: prompt,
+          aspect_ratio: aspect_ratio,
+          provider: PROVIDER_ID,
+          extra: {
+            "request_id" => body["request_id"]
+          }.compact
+        )
+      end
       # qwen-image-max / qwen-image-plus accept only the fixed resolution set;
       # everything else (qwen-image-2.0 family, plain qwen-image) uses the 2.0
       # recommended sizes.
@@ -189,6 +512,47 @@ module Clacky
         end
       end
+      # CosyVoice models (cosyvoice-*, cosyvoice-v3-flash, etc.) use the
+      # dedicated TTS endpoint; Qwen3-TTS models (qwen3-tts-flash,
+      # qwen3-tts-instruct-flash) are served via the multimodal-generation
+      # endpoint despite being TTS — see Aliyun docs:
+      # https://help.aliyun.com/zh/model-studio/qwen-tts-api
+      #
+      # Matching is POSITIVE (by model-name pattern) so third-party
+      # aggregators that keep the official model names keep working, and
+      # unknown TTS models are not silently misrouted. Anything not
+      # recognized as Qwen3-TTS falls back to the CosyVoice endpoint for
+      # backward compatibility — every TTS model clacky supported before
+      # qwen3-tts was a CosyVoice model.
+      private def speech_endpoint
+        m = @model.to_s
+        if m.match?(/(^|[-_])qwen3-tts(-|$)/i)
+          GENERATION_PATH
+        else
+          SPEECH_PATH_COSY
+        end
+      end
+      private def default_speech_voice
+        speech_endpoint == GENERATION_PATH ? DEFAULT_SPEECH_VOICE_QWEN : DEFAULT_SPEECH_VOICE_COSY
+      end
+      # Each model family has its own payload shape. We branch on endpoint
+      # because the endpoint identity uniquely identifies the family here.
+      private def speech_payload(input:, voice:, language_type: nil)
+        input_body = { text: input, voice: voice }
+        if speech_endpoint == GENERATION_PATH
+          # Qwen3-TTS expects language_type; default to Chinese when caller
+          # doesn't specify, since most users run Chinese TTS.
+          input_body[:language_type] = (language_type.to_s.empty? ? "Chinese" : language_type)
+        else
+          # CosyVoice expects format + sample_rate.
+          input_body[:format]      = "wav"
+          input_body[:sample_rate] = 24000
+        end
+        { model: @model, input: input_body }
+      end
       # output.choices[].message.content[].image -> first image URL
       private def extract_image_url(body)
         choices = body.dig("output", "choices")

data/lib/clacky/media/gemini.rb CHANGED Viewed

@@ -41,6 +41,15 @@ module Clacky
           aspect_ratio: aspect_ratio
         )
       end
+      def understand_video(video_base64:, mime_type:, prompt: nil, **_kwargs) # Stub: always returns a "not_implemented" error; video_base64 and mime_type are never read.
+        video_understanding_error_response(
+          error: "Direct Google AI Studio video understanding is not supported. Use the openclacky gateway (base_url https://api.openclacky.com) with a video understanding model such as or-gemini-3-5-flash.",
+          error_type: "not_implemented",
+          provider: "gemini-direct",
+          prompt: prompt || "" # a nil prompt is reported as an empty string
+        )
+      end
     end
   end
 end

data/lib/clacky/media/generator.rb CHANGED Viewed

@@ -124,6 +124,58 @@ module Clacky
         )
       end
+      def stt_model_entry # Model entry that @agent_config reports for type "stt"; generate_transcription treats nil as "not configured".
+        @agent_config.find_model_by_type("stt")
+      end
+      def video_understanding_model_entry # Model entry that @agent_config reports for type "video_understanding"; understand_video treats nil as "not configured".
+        @agent_config.find_model_by_type("video_understanding")
+      end
+      def generate_transcription(audio_base64:, mime_type:, **kwargs)
+        entry = stt_model_entry
+        if entry.nil?
+          return {
+            "success"    => false,
+            "text"       => nil,
+            "error"      => "No STT model configured. Add a model with type=stt in settings.",
+            "error_type" => "not_configured",
+            "provider"   => "",
+            "model"      => ""
+          }
+        end
+        provider = build_provider_for(entry)
+        provider.generate_transcription(
+          audio_base64: audio_base64,
+          mime_type: mime_type,
+          **kwargs
+        )
+      end
+      def understand_video(video_base64:, mime_type:, prompt: nil, **kwargs)
+        entry = video_understanding_model_entry
+        if entry.nil?
+          return {
+            "success"    => false,
+            "analysis"   => nil,
+            "error"      => "No video understanding model configured. Add a model with type=video_understanding in settings.",
+            "error_type" => "not_configured",
+            "provider"   => "",
+            "model"      => "",
+            "prompt"     => prompt
+          }
+        end
+        provider = build_provider_for(entry)
+        provider.understand_video(
+          video_base64: video_base64,
+          mime_type: mime_type,
+          prompt: prompt,
+          **kwargs
+        )
+      end
       # Pick the adapter class for a media model entry.
       #
       # Routing rules:

data/lib/clacky/media/openai_compat.rb CHANGED Viewed

@@ -3,6 +3,7 @@
 require "faraday"
 require "json"
 require "base64"
+require "securerandom"
 require_relative "base"
 module Clacky
@@ -296,6 +297,157 @@ module Clacky
         )
       end
+      def generate_transcription(audio_base64:, mime_type:, prompt: nil, **_kwargs)
+        provider_id = Clacky::Providers.find_by_base_url(@base_url) || "custom"
+        if @api_key.to_s.empty?
+          return transcription_error_response(
+            error: "api_key not configured for STT model '#{@model}'",
+            error_type: "auth_required", provider: provider_id
+          )
+        end
+        ext = mime_type.split(";").first.split("/").last.then { |e| e == "mpeg" ? "mp3" : e }
+        filename = "chunk.#{ext}"
+        audio_data = Base64.decode64(audio_base64)
+        boundary = "----FormBoundary#{SecureRandom.hex(8)}"
+        # A multipart body is a byte stream: build it in binary so UTF-8 text
+        # parts (e.g. a non-ASCII vocabulary prompt) don't clash with the
+        # ASCII-8BIT audio bytes.
+        body = "".b
+        body << "--#{boundary}\r\n".b
+        body << "Content-Disposition: form-data; name=\"file\"; filename=\"#{filename}\"\r\n".b
+        body << "Content-Type: #{mime_type.split(';').first}\r\n\r\n".b
+        body << audio_data.b
+        body << "\r\n--#{boundary}\r\n".b
+        body << "Content-Disposition: form-data; name=\"model\"\r\n\r\n".b
+        body << @model.to_s.b
+        unless prompt.to_s.strip.empty?
+          body << "\r\n--#{boundary}\r\n".b
+          body << "Content-Disposition: form-data; name=\"prompt\"\r\n\r\n".b
+          body << prompt.to_s.strip.b
+        end
+        body << "\r\n--#{boundary}--\r\n".b
+        begin
+          response = stt_connection.post("audio/transcriptions") do |req|
+            req.headers["Content-Type"]  = "multipart/form-data; boundary=#{boundary}"
+            req.headers["Authorization"] = "Bearer #{@api_key}"
+            req.body = body
+          end
+        rescue Faraday::Error => e
+          return transcription_error_response(
+            error: "HTTP request failed: #{e.message}",
+            error_type: "network_error", provider: provider_id
+          )
+        end
+        unless response.success?
+          return transcription_error_response(
+            error: "Upstream #{response.status}: #{truncate(response.body, 500)}",
+            error_type: "api_error", provider: provider_id
+          )
+        end
+        parsed = parse_json(response.body)
+        unless parsed.is_a?(Hash)
+          return transcription_error_response(
+            error: "Invalid JSON response from upstream",
+            error_type: "invalid_response", provider: provider_id
+          )
+        end
+        transcription_success_response(
+          text: parsed["text"].to_s.strip,
+          provider: provider_id,
+          extra: {
+            "usage"    => parsed["usage"],
+            "cost_usd" => parsed["cost_usd"]
+          }.compact
+        )
+      end
+      # Asks an OpenAI-compatible chat model to analyze base64-encoded media.
+      # The media is sent as a data URL inside an "image_url" content part of
+      # a single user message posted to "chat/completions".
+      #
+      # @param video_base64 [String] base64-encoded media payload
+      # @param mime_type [String] media type placed in the data URL
+      # @param prompt [String, nil] instruction for the model; a blank prompt
+      #   is replaced by a generic "describe" prompt
+      # @return [Hash] video_understanding_success_response or video_understanding_error_response
+      def understand_video(video_base64:, mime_type:, prompt: nil, **_kwargs)
+        provider_id = Clacky::Providers.find_by_base_url(@base_url) || "custom"
+        prompt = "Describe what you see in this frame." if prompt.to_s.strip.empty?
+        if @api_key.to_s.empty?
+          return video_understanding_error_response(
+            error: "api_key not configured for video understanding model '#{@model}'",
+            error_type: "auth_required", provider: provider_id, prompt: prompt
+          )
+        end
+        data_url = "data:#{mime_type};base64,#{video_base64}"
+        payload = {
+          model: @model,
+          messages: [
+            {
+              role: "user",
+              content: [
+                { type: "text", text: prompt },
+                { type: "image_url", image_url: { url: data_url } }
+              ]
+            }
+          ]
+        }
+        begin
+          response = vu_connection.post("chat/completions") do |req|
+            req.headers["Content-Type"]  = "application/json"
+            req.headers["Authorization"] = "Bearer #{@api_key}"
+            req.body = JSON.generate(payload)
+          end
+        rescue Faraday::Error => e
+          return video_understanding_error_response(
+            error: "HTTP request failed: #{e.message}",
+            error_type: "network_error", provider: provider_id, prompt: prompt
+          )
+        end
+        unless response.success?
+          return video_understanding_error_response(
+            error: "Upstream #{response.status}: #{truncate(response.body, 500)}",
+            error_type: "api_error", provider: provider_id, prompt: prompt
+          )
+        end
+        parsed = parse_json(response.body)
+        unless parsed.is_a?(Hash)
+          return video_understanding_error_response(
+            error: "Invalid JSON response from upstream",
+            error_type: "invalid_response", provider: provider_id, prompt: prompt
+          )
+        end
+        # Shape-check each level so a malformed body (choices not an Array,
+        # first choice or message not a Hash) yields a structured error
+        # instead of raising NoMethodError/TypeError.
+        choices = parsed["choices"]
+        first_choice = choices.is_a?(Array) ? choices.first : nil
+        unless first_choice.is_a?(Hash)
+          return video_understanding_error_response(
+            error: "Upstream returned no content",
+            error_type: "empty_response", provider: provider_id, prompt: prompt
+          )
+        end
+        message = first_choice["message"]
+        content = message.is_a?(Hash) ? message["content"] : nil
+        # Defensive: if a gateway returns content as an array of typed parts,
+        # join their "text" fields rather than stringifying the Ruby array.
+        if content.is_a?(Array)
+          content = content.map { |part| part.is_a?(Hash) ? part["text"].to_s : part.to_s }.join
+        end
+        text = content.to_s.strip
+        if text.empty?
+          return video_understanding_error_response(
+            error: "Upstream returned empty analysis",
+            error_type: "empty_response", provider: provider_id, prompt: prompt
+          )
+        end
+        video_understanding_success_response(
+          analysis: text,
+          prompt: prompt,
+          provider: provider_id,
+          extra: {
+            "usage"    => parsed["usage"],
+            "cost_usd" => parsed["cost_usd"]
+          }.compact
+        )
+      end
       private def connection
         Faraday.new(url: normalized_base_url) do |f|
           f.options.timeout      = 240
@@ -320,6 +472,20 @@ module Clacky
         end
       end
+      private def stt_connection # Faraday connection used by generate_transcription; a new instance is built on every call.
+        Faraday.new(url: normalized_base_url) do |f|
+          f.options.timeout      = 30 # request timeout, in seconds
+          f.options.open_timeout = 10 # connection-open timeout, in seconds
+        end
+      end
+      private def vu_connection # Faraday connection used by understand_video; a new instance is built on every call.
+        Faraday.new(url: normalized_base_url) do |f|
+          f.options.timeout      = 60 # request timeout, in seconds
+          f.options.open_timeout = 10 # connection-open timeout, in seconds
+        end
+      end
       private def gemini_family?(model_name)
         model_name.to_s.match?(/gemini|imagen/i)
       end