openclacky 1.2.12 → 1.2.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/.clacky/skills/gem-release/SKILL.md +5 -1
  3. data/.clacky/skills/gem-release/scripts/release.sh +4 -1
  4. data/CHANGELOG.md +39 -0
  5. data/lib/clacky/agent/llm_caller.rb +40 -25
  6. data/lib/clacky/agent/memory_updater.rb +12 -0
  7. data/lib/clacky/agent/session_serializer.rb +1 -0
  8. data/lib/clacky/agent/skill_auto_creator.rb +7 -4
  9. data/lib/clacky/agent/skill_evolution.rb +23 -5
  10. data/lib/clacky/agent/skill_manager.rb +86 -1
  11. data/lib/clacky/agent/skill_reflector.rb +18 -23
  12. data/lib/clacky/agent.rb +132 -15
  13. data/lib/clacky/agent_config.rb +183 -22
  14. data/lib/clacky/cli.rb +55 -0
  15. data/lib/clacky/client.rb +11 -1
  16. data/lib/clacky/default_parsers/pdf_parser.rb +70 -86
  17. data/lib/clacky/default_parsers/pdf_parser_vlm.py +136 -0
  18. data/lib/clacky/default_skills/persist-memory/SKILL.md +4 -3
  19. data/lib/clacky/default_skills/search-skills/SKILL.md +61 -0
  20. data/lib/clacky/idle_compression_timer.rb +1 -1
  21. data/lib/clacky/message_format/open_ai.rb +7 -1
  22. data/lib/clacky/openai_stream_aggregator.rb +4 -1
  23. data/lib/clacky/providers.rb +77 -12
  24. data/lib/clacky/server/http_server.rb +296 -7
  25. data/lib/clacky/server/session_registry.rb +30 -8
  26. data/lib/clacky/server/web_ui_controller.rb +24 -1
  27. data/lib/clacky/session_manager.rb +120 -0
  28. data/lib/clacky/tools/web_search.rb +59 -8
  29. data/lib/clacky/ui2/layout_manager.rb +15 -5
  30. data/lib/clacky/ui2/progress_handle.rb +18 -8
  31. data/lib/clacky/ui2/ui_controller.rb +27 -0
  32. data/lib/clacky/ui_interface.rb +22 -0
  33. data/lib/clacky/utils/model_pricing.rb +96 -0
  34. data/lib/clacky/version.rb +1 -1
  35. data/lib/clacky/vision/resolver.rb +157 -0
  36. data/lib/clacky/web/app.css +209 -4
  37. data/lib/clacky/web/app.js +6 -5
  38. data/lib/clacky/web/i18n.js +22 -6
  39. data/lib/clacky/web/index.html +2 -1
  40. data/lib/clacky/web/sessions.js +408 -80
  41. data/lib/clacky/web/settings.js +241 -60
  42. data/lib/clacky/web/skills.js +5 -14
  43. data/lib/clacky/web/utils.js +57 -0
  44. data/lib/clacky/web/ws-dispatcher.js +136 -0
  45. data/lib/clacky.rb +1 -0
  46. metadata +6 -2
@@ -120,17 +120,67 @@ module Clacky
120
120
 
121
121
  # ── Bing ───────────────────────────────────────────────────────────────
122
122
 
123
+ BING_ENDPOINTS = [
124
+ ["cn.bing.com", "zh-CN,zh;q=0.9,en;q=0.8"],
125
+ ["www.bing.com", "en-US,en;q=0.9"]
126
+ ].freeze
127
+
128
+ # Race both Bing endpoints in parallel and return the first relevant result.
129
+ # cn.bing.com works best from mainland China; www.bing.com works best from
130
+ # overseas. Racing avoids guessing the network egress and recovers from
131
+ # one endpoint temporarily returning anti-scrape filler. If both return
132
+ # irrelevant garbage, fall back to whichever came back non-empty.
123
133
  private def search_bing(query, max_results)
124
- encoded_query = CGI.escape(query)
125
- # cn.bing.com redirects to www.bing.com for non-China IPs (e.g. GitHub CI);
126
- # follow_redirects ensures both environments work with the same code path.
127
- url = URI("https://cn.bing.com/search?q=#{encoded_query}&count=#{max_results}")
128
- response = http_get(url, accept_language: "zh-CN,zh;q=0.9,en;q=0.8", follow_redirects: 2)
134
+ queue = Queue.new
135
+ threads = BING_ENDPOINTS.map do |host, lang|
136
+ Thread.new do
137
+ results = bing_fetch(host, lang, query, max_results)
138
+ queue.push([host, results])
139
+ rescue StandardError
140
+ queue.push([host, []])
141
+ end
142
+ end
143
+
144
+ winner = nil
145
+ runner_up = nil
146
+ BING_ENDPOINTS.length.times do
147
+ _host, results = queue.pop
148
+ if bing_results_relevant?(results, query)
149
+ winner = results
150
+ break
151
+ elsif !results.empty? && runner_up.nil?
152
+ runner_up = results
153
+ end
154
+ end
155
+
156
+ threads.each(&:kill)
157
+ winner || runner_up || []
158
+ end
159
+
160
+ private def bing_fetch(host, lang, query, max_results)
161
+ url = URI("https://#{host}/search?q=#{CGI.escape(query)}&count=#{max_results}&form=QBLH")
162
+ response = http_get(url, accept_language: lang, follow_redirects: 2,
163
+ referer: "https://#{host}/")
129
164
  return [] unless response.is_a?(Net::HTTPSuccess)
130
165
 
131
166
  parse_bing_html(response.body, max_results)
132
167
  end
133
168
 
169
+ # A real Bing answer mentions at least one query token in the titles or
170
+ # snippets. The anti-scrape fallback returns top-domain filler (Yandex,
171
+ # Bunnings, WikiLeaks, …) that shares nothing with the query.
172
+ private def bing_results_relevant?(results, query)
173
+ return false if results.empty?
174
+
175
+ tokens = query.downcase.scan(/[\p{L}\p{N}]+/).reject { |t| t.length < 2 }
176
+ return true if tokens.empty?
177
+
178
+ results.any? do |r|
179
+ haystack = "#{r[:title]} #{r[:snippet]}".downcase
180
+ tokens.any? { |t| haystack.include?(t) }
181
+ end
182
+ end
183
+
134
184
  private def parse_bing_html(html, max_results)
135
185
  results = []
136
186
  html = Clacky::Utils::Encoding.to_utf8(html)
@@ -199,7 +249,7 @@ module Clacky
199
249
 
200
250
  # Shared browser-like GET request — no Accept-Encoding to avoid gzip/br
201
251
  # detection tricks used by Bing. Supports redirect following.
202
- private def http_get(url, accept_language: "en-US,en;q=0.9", follow_redirects: 0)
252
+ private def http_get(url, accept_language: "en-US,en;q=0.9", follow_redirects: 0, referer: nil)
203
253
  request = Net::HTTP::Get.new(url)
204
254
  request["User-Agent"] = USER_AGENTS.sample
205
255
  request["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
@@ -208,8 +258,9 @@ module Clacky
208
258
  # a JS-only skeleton (~39KB) instead of the real HTML results (~120KB)
209
259
  request["Sec-Fetch-Dest"] = "document"
210
260
  request["Sec-Fetch-Mode"] = "navigate"
211
- request["Sec-Fetch-Site"] = "none"
261
+ request["Sec-Fetch-Site"] = referer ? "same-origin" : "none"
212
262
  request["Upgrade-Insecure-Requests"] = "1"
263
+ request["Referer"] = referer if referer
213
264
 
214
265
  response = Net::HTTP.start(url.hostname, url.port,
215
266
  use_ssl: url.scheme == "https",
@@ -220,7 +271,7 @@ module Clacky
220
271
  if follow_redirects > 0 && response.is_a?(Net::HTTPRedirection)
221
272
  location = response["location"]
222
273
  redirect_url = location.start_with?("http") ? URI(location) : URI("#{url.scheme}://#{url.hostname}#{location}")
223
- return http_get(redirect_url, accept_language: accept_language, follow_redirects: follow_redirects - 1)
274
+ return http_get(redirect_url, accept_language: accept_language, follow_redirects: follow_redirects - 1, referer: referer)
224
275
  end
225
276
 
226
277
  response
@@ -119,18 +119,29 @@ module Clacky
119
119
 
120
120
  @render_mutex.synchronize do
121
121
  entry = @buffer.entry_by_id(id)
122
- # Skip if gone, fully committed, or only partially visible (its
123
- # prefix is already in terminal scrollback and cannot be edited).
124
- return if entry.nil? || entry.committed
125
- return if (entry.committed_line_offset || 0) > 0
122
+ if entry.nil?
123
+ Clacky::Logger.warn("[ph_debug] replace_entry_nil", id: id, content: content.to_s[0, 120])
124
+ return
125
+ end
126
+ if entry.committed
127
+ Clacky::Logger.warn("[ph_debug] replace_entry_committed", id: id, content: content.to_s[0, 120])
128
+ return
129
+ end
130
+ if (entry.committed_line_offset || 0) > 0
131
+ Clacky::Logger.warn("[ph_debug] replace_entry_partial", id: id, offset: entry.committed_line_offset, content: content.to_s[0, 120])
132
+ return
133
+ end
126
134
 
127
135
  old_lines = entry.lines.dup
128
136
  new_lines = wrap_content_to_lines(content)
129
137
  if old_lines == new_lines
138
+ Clacky::Logger.warn("[ph_debug] replace_entry_same", id: id)
130
139
  screen.flush
131
140
  return
132
141
  end
133
142
  @buffer.replace(id, new_lines)
143
+ is_tail = @buffer.live_entries.last&.id == id
144
+ Clacky::Logger.warn("[ph_debug] replace_entry_paint", id: id, is_tail: is_tail, old_n: old_lines.length, new_n: new_lines.length, content: content.to_s[0, 120])
134
145
 
135
146
  unless @fullscreen_mode
136
147
  # repaint_entry_in_place relies on the entry being the tail of
@@ -147,7 +158,6 @@ module Clacky
147
158
  # For non-tail replaces, fall back to a full rebuild of the
148
159
  # output area from the buffer. Slower, but correct regardless
149
160
  # of where the entry lives.
150
- is_tail = @buffer.live_entries.last&.id == id
151
161
  if is_tail
152
162
  repaint_entry_in_place(entry, old_lines, new_lines)
153
163
  else
@@ -127,6 +127,7 @@ module Clacky
127
127
  @start_time = nil
128
128
  @ticker = nil
129
129
  @state = :fresh # :fresh → :running → :closed
130
+ @unregistered = false
130
131
  @metadata = {}
131
132
  @last_chunk_at = nil
132
133
  @monitor = Monitor.new
@@ -172,29 +173,38 @@ module Clacky
172
173
  end
173
174
 
174
175
  # Stop the ticker, render one final frame, and unregister from the
175
- # owner. Idempotent — calling twice is a no-op.
176
+ # owner. Idempotent and crash-safe — if a previous finish was
177
+ # interrupted (e.g. Thread#raise(AgentInterrupted) hit between
178
+ # +stop_ticker+ and +unregister_progress+), a follow-up finish
179
+ # will still complete the unregister so the handle does not stay
180
+ # orphaned on the owner's progress stack.
176
181
  #
177
182
  # @param final_message [String, nil] Optional override for the last
178
183
  # frame. If nil, the handle composes "<message>… (<elapsed>s)".
179
184
  def finish(final_message: nil)
185
+ Clacky::Logger.warn("[ph_debug] finish_entry", oid: object_id, state: @state, unreg: @unregistered, msg: @message, eid: @entry_id)
180
186
  snapshot = @monitor.synchronize do
181
- return if @state != :running
182
- @state = :closed
183
- { message: final_message || @message, elapsed: elapsed_seconds }
187
+ return if @unregistered
188
+ first_close = @state == :running
189
+ @state = :closed if first_close
190
+ {
191
+ first_close: first_close,
192
+ message: final_message || @message,
193
+ elapsed: elapsed_seconds,
194
+ }
184
195
  end
185
196
 
186
197
  stop_ticker
187
- # Collapse fast-finishers to a removed entry so tools that complete
188
- # in under FAST_FINISH_THRESHOLD_SECONDS don't leave a permanent
189
- # "Executing foo… (0s)" line. The owner interprets final_frame: nil
190
- # as "remove the entry entirely".
191
198
  final_frame =
192
199
  if @quiet_on_fast_finish && snapshot[:elapsed] < FAST_FINISH_THRESHOLD_SECONDS
193
200
  nil
194
201
  else
195
202
  compose_final_frame(snapshot[:message], snapshot[:elapsed])
196
203
  end
204
+ Clacky::Logger.warn("[ph_debug] finish_unregister", oid: object_id, eid: @entry_id, first_close: snapshot[:first_close], final_frame: final_frame.to_s[0, 200])
197
205
  @owner.unregister_progress(self, final_frame: final_frame)
206
+ @monitor.synchronize { @unregistered = true }
207
+ Clacky::Logger.warn("[ph_debug] finish_done", oid: object_id)
198
208
  end
199
209
  alias_method :cancel, :finish
200
210
 
@@ -655,6 +655,7 @@ module Clacky
655
655
 
656
656
  # Called by ProgressHandle#finish.
657
657
  def unregister_progress(handle, final_frame:)
658
+ Clacky::Logger.warn("[ph_debug] unreg_entry", oid: handle.object_id, eid: handle.entry_id, top: @progress_stack.last == handle, stack_size: @progress_stack.size, ff: final_frame.to_s[0, 200])
658
659
  @progress_mutex.synchronize do
659
660
  # If this handle still holds its entry (it's currently top), we
660
661
  # render one last frame there and release the id. If it was
@@ -662,10 +663,14 @@ module Clacky
662
663
  # is already gone and the final_frame is simply dropped.
663
664
  if handle.entry_id
664
665
  if final_frame && !final_frame.to_s.strip.empty?
666
+ Clacky::Logger.warn("[ph_debug] unreg_update_entry", oid: handle.object_id, eid: handle.entry_id)
665
667
  update_entry(handle.entry_id, @renderer.render_progress(final_frame))
666
668
  else
669
+ Clacky::Logger.warn("[ph_debug] unreg_remove_entry", oid: handle.object_id, eid: handle.entry_id)
667
670
  remove_entry(handle.entry_id)
668
671
  end
672
+ else
673
+ Clacky::Logger.warn("[ph_debug] unreg_no_entry_id", oid: handle.object_id)
669
674
  end
670
675
 
671
676
  @progress_stack.delete(handle)
@@ -873,6 +878,28 @@ module Clacky
873
878
  append_output(output)
874
879
  end
875
880
 
881
+ def phase_start(kind:, label:)
882
+ phase_id = SecureRandom.uuid
883
+ @active_phases ||= {}
884
+ @active_phases[phase_id] = { kind: kind, label: label, started_at: Time.now }
885
+ Thread.current[:clacky_phase_id] = phase_id
886
+
887
+ banner = "──────── ▼ #{label} ────────"
888
+ append_output(@renderer.render_system_message(banner, prefix_newline: true))
889
+ phase_id
890
+ end
891
+
892
+ def phase_end(phase_id, summary: nil)
893
+ Thread.current[:clacky_phase_id] = nil
894
+ return unless @active_phases&.key?(phase_id)
895
+
896
+ info = @active_phases.delete(phase_id)
897
+ label = info[:label]
898
+ tail = summary && !summary.to_s.strip.empty? ? " — #{summary.to_s.strip}" : ""
899
+ banner = "──────── ▲ #{label} done#{tail} ────────"
900
+ append_output(@renderer.render_system_message(banner, prefix_newline: false))
901
+ end
902
+
876
903
  # Set workspace status to idle (called when agent stops working)
877
904
  def set_idle_status
878
905
  # Safety net: close any legacy progress slots that were opened via
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "securerandom"
4
+
3
5
  module Clacky
4
6
  # UIInterface defines the standard interface between Agent/CLI and UI implementations.
5
7
  # All UI controllers (UIController, JsonUIController) must implement these methods.
@@ -136,5 +138,25 @@ module Clacky
136
138
  # === Path redaction (for encrypted brand skill tmpdirs) ===
137
139
  # === Lifecycle ===
138
140
  def stop(clear_screen: false); end
141
+
142
+ # === Phase grouping (optional, web UI uses this to fold subagent runs) ===
143
+ # Begin a logical phase. Events emitted between phase_start and phase_end
144
+ # carry the phase_id so the UI can group them visually.
145
+ # Returns the phase_id (caller is responsible for passing it to phase_end).
146
+ def phase_start(kind:, label: nil)
147
+ SecureRandom.uuid
148
+ end
149
+
150
+ def phase_end(phase_id, summary: nil); end
151
+
152
+ # Run block within a phase. Always closes via ensure.
153
+ def with_phase(kind:, label: nil)
154
+ pid = phase_start(kind: kind, label: label)
155
+ begin
156
+ yield pid
157
+ ensure
158
+ phase_end(pid)
159
+ end
160
+ end
139
161
  end
140
162
  end
@@ -145,6 +145,47 @@ module Clacky
145
145
  }
146
146
  },
147
147
 
148
+ # Xiaomi MiMo — USD per 1M tokens, international (海外) list price.
149
+ # Source: https://platform.xiaomimimo.com/docs/zh-CN/price/pay-as-you-go
150
+ # Effective 2026-05-27 (V2.5 launch price cut). Cache write is "limited-
151
+ # time free" per Xiaomi's notice; per the project's "displayed ≤ actual"
152
+ # convention we bill writes at the input-miss rate so that when the
153
+ # promo ends users won't see a cost spike. Cache hits use the explicit
154
+ # cache-hit rate.
155
+ #
156
+ # As of 2026-06-01, mimo-v2-pro/omni are forwarded to the V2.5 series
157
+ # and billed at V2.5 rates; mimo-v2-pro mirrors mimo-v2.5-pro and
158
+ # mimo-v2-omni mirrors mimo-v2.5. Both will be retired 2026-06-30.
159
+ "mimo-v2.5-pro" => {
160
+ input: { default: 0.435, over_200k: 0.435 },
161
+ output: { default: 0.87, over_200k: 0.87 },
162
+ cache: { write: 0.435, read: 0.0036 }
163
+ },
164
+
165
+ "mimo-v2.5" => {
166
+ input: { default: 0.14, over_200k: 0.14 },
167
+ output: { default: 0.28, over_200k: 0.28 },
168
+ cache: { write: 0.14, read: 0.0028 }
169
+ },
170
+
171
+ "mimo-v2-pro" => {
172
+ input: { default: 0.435, over_200k: 0.435 },
173
+ output: { default: 0.87, over_200k: 0.87 },
174
+ cache: { write: 0.435, read: 0.0036 }
175
+ },
176
+
177
+ "mimo-v2-omni" => {
178
+ input: { default: 0.14, over_200k: 0.14 },
179
+ output: { default: 0.28, over_200k: 0.28 },
180
+ cache: { write: 0.14, read: 0.0028 }
181
+ },
182
+
183
+ "mimo-v2-flash" => {
184
+ input: { default: 0.10, over_200k: 0.10 },
185
+ output: { default: 0.30, over_200k: 0.30 },
186
+ cache: { write: 0.10, read: 0.01 }
187
+ },
188
+
148
189
  # Kimi K2.5 / K2.6 multimodal models
149
190
  # Source: https://platform.moonshot.cn (USD / 1M tokens)
150
191
  # Kimi billing model (same shape as DeepSeek):
@@ -181,6 +222,38 @@ module Clacky
181
222
  }
182
223
  },
183
224
 
225
+ # Google Gemini 3 series (via Vertex AI). Tiered at 200K input tokens
226
+ # for Pro; Flash has flat pricing.
227
+ "gemini-3.1-pro" => {
228
+ input: {
229
+ default: 2.00,
230
+ over_200k: 4.00
231
+ },
232
+ output: {
233
+ default: 12.00,
234
+ over_200k: 18.00
235
+ },
236
+ cache: {
237
+ write: 2.00,
238
+ read: 0.50
239
+ }
240
+ },
241
+
242
+ "gemini-3-flash" => {
243
+ input: {
244
+ default: 0.50,
245
+ over_200k: 0.50
246
+ },
247
+ output: {
248
+ default: 3.00,
249
+ over_200k: 3.00
250
+ },
251
+ cache: {
252
+ write: 0.50,
253
+ read: 0.05
254
+ }
255
+ },
256
+
184
257
  # OpenAI GPT-5.5 / GPT-5.4 — breakpoint at 272K input tokens
185
258
  # Source: https://openai.com/api/pricing/ (USD / 1M tokens)
186
259
  # Note: OpenAI's actual tiered-pricing threshold is 272K, not the
@@ -581,6 +654,22 @@ module Clacky
581
654
  # non-thinking / thinking modes respectively. Bill at flash rates.
582
655
  when /^deepseek-chat$/i, /^deepseek-reasoner$/i
583
656
  "deepseek-v4-flash"
657
+ # Xiaomi MiMo — strict anchored match per registered model id in
658
+ # providers.rb (currently mimo-v2.5-pro / mimo-v2-pro / mimo-v2-omni).
659
+ # mimo-v2.5 / mimo-v2-flash are also priced ahead of provider-side
660
+ # registration. Per Xiaomi's 2026-06 schedule, mimo-v2-pro/omni are
661
+ # transparently routed to V2.5 — keys are listed independently so
662
+ # both old and new ids resolve to the right rate.
663
+ when /^mimo-v2\.?5-pro$/i
664
+ "mimo-v2.5-pro"
665
+ when /^mimo-v2\.?5$/i
666
+ "mimo-v2.5"
667
+ when /^mimo-v2-pro$/i
668
+ "mimo-v2-pro"
669
+ when /^mimo-v2-omni$/i
670
+ "mimo-v2-omni"
671
+ when /^mimo-v2-flash$/i
672
+ "mimo-v2-flash"
584
673
  # Kimi K2.5 / K2.6 — strict match only. K2 text-only models
585
674
  # (kimi-k2-0905-preview, kimi-k2-thinking, etc.) are not yet
586
675
  # registered in providers.rb and will be added in a follow-up
@@ -636,6 +725,13 @@ module Clacky
636
725
  when /^qwen3-vl-plus$/i
637
726
  "qwen3-vl-plus"
638
727
 
728
+ # Google Gemini 3 series. Match the platform aliases (or-gemini-*)
729
+ # and the bare upstream ids returned by Vertex.
730
+ when /^or-gemini-3-1-pro$/i, /^gemini-3\.1-pro(-preview)?$/i
731
+ "gemini-3.1-pro"
732
+ when /^or-gemini-3-5-flash$/i, /^gemini-3\.5-flash$/i, /^gemini-3-flash(-preview)?$/i
733
+ "gemini-3-flash"
734
+
639
735
  # OpenAI GPT-5.x models — match various dashed/dotted/compact forms
640
736
  # (e.g. "gpt-5.5", "gpt-5-5", "gpt5.5", "gpt55")
641
737
  when /^gpt-?5\.?5$/i, /^gpt-?5[\.-]?5$/i
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Clacky
4
- VERSION = "1.2.12"
4
+ VERSION = "1.2.14"
5
5
  end
@@ -0,0 +1,157 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "digest"
4
+ require "base64"
5
+ require "fileutils"
6
+ require "json"
7
+ require_relative "../utils/file_processor"
8
+
9
+ module Clacky
10
+ module Vision
11
+ # OCR sidecar — turns image bytes into a text description by calling a
12
+ # vision-capable model. Used when the user's primary model is text-only
13
+ # (e.g. DeepSeek V4) so that uploaded images and tool screenshots still
14
+ # reach the conversation as useful context.
15
+ #
16
+ # Routes through Clacky::Client so we get the same OpenAI/Anthropic/
17
+ # Bedrock format negotiation, retry, and credit-error handling as the
18
+ # main agent path. Image content travels as a canonical `image_url`
19
+ # block (the unified internal shape understood by all three formats).
20
+ class Resolver
21
+ DEFAULT_PROMPT = <<~PROMPT.strip
22
+ Extract every legible text and describe the visual content of this image.
23
+ Output as Markdown. Preserve table layout where possible (use Markdown tables).
24
+ For UI screenshots, describe the layout, visible labels, and active state.
25
+ Be thorough but concise — the user cannot see the image and must rely on
26
+ your description.
27
+ PROMPT
28
+
29
+ MAX_TOKENS = 8192
30
+ CACHE_DIR = File.join(Dir.home, ".clacky", "ocr_cache")
31
+ CACHE_VERSION = 1
32
+
33
+ Result = Struct.new(:status, :text, :error, keyword_init: true) do
34
+ def ok?; status == :ok; end
35
+ def empty?; status == :empty; end
36
+ def call_failed?; status == :call_failed; end
37
+ def bad_image?; status == :bad_image; end
38
+ end
39
+
40
+ def initialize(model_entry)
41
+ @model_entry = model_entry
42
+ @model = model_entry["model"]
43
+ @base_url = model_entry["base_url"]
44
+ @api_key = model_entry["api_key"]
45
+ @anthropic = !!model_entry["anthropic_format"]
46
+ end
47
+
48
+ # @return [Result] one of:
49
+ # status=:ok + text — sidecar produced a description
50
+ # status=:empty — sidecar returned 200 but no usable text (e.g. token budget exhausted by reasoning)
51
+ # status=:call_failed + error — network/parse/auth error from the sidecar
52
+ # status=:bad_image — image bytes unreadable / empty
53
+ def describe(image, prompt: nil)
54
+ prompt = prompt.to_s.strip
55
+ prompt = DEFAULT_PROMPT if prompt.empty?
56
+
57
+ bytes, mime = read_image(image)
58
+ return Result.new(status: :bad_image) if bytes.nil? || bytes.empty?
59
+
60
+ cached = cache_get(bytes, prompt)
61
+ return Result.new(status: :ok, text: cached) if cached
62
+
63
+ text = call_vlm(bytes, mime, prompt)
64
+ return Result.new(status: :empty) if text.nil? || text.strip.empty?
65
+
66
+ cache_put(bytes, prompt, text)
67
+ Result.new(status: :ok, text: text)
68
+ rescue => e
69
+ Clacky::Logger.warn("[Vision::Resolver] failed: #{e.class}: #{e.message}") if defined?(Clacky::Logger)
70
+ Result.new(status: :call_failed, error: "#{e.class}: #{e.message}")
71
+ end
72
+
73
+ private def read_image(image)
74
+ if image[:bytes]
75
+ [image[:bytes], image[:mime_type] || "image/png"]
76
+ elsif image[:data_url] || image["data_url"]
77
+ url = image[:data_url] || image["data_url"]
78
+ m = url.match(/\Adata:([^;]+);base64,(.*)\z/m)
79
+ return [nil, nil] unless m
80
+ [Base64.decode64(m[2]), m[1]]
81
+ elsif image[:path] || image["path"]
82
+ path = image[:path] || image["path"]
83
+ return [nil, nil] unless File.exist?(path)
84
+ [File.binread(path), Utils::FileProcessor.detect_mime_type(path, nil) || "image/png"]
85
+ else
86
+ [nil, nil]
87
+ end
88
+ end
89
+
90
+ private def call_vlm(bytes, mime, prompt)
91
+ data_url = "data:#{mime};base64,#{Base64.strict_encode64(bytes)}"
92
+ message = {
93
+ role: "user",
94
+ content: [
95
+ { type: "text", text: prompt },
96
+ { type: "image_url", image_url: { url: data_url } }
97
+ ]
98
+ }
99
+
100
+ client = Clacky::Client.new(
101
+ @api_key,
102
+ base_url: @base_url,
103
+ model: @model,
104
+ anthropic_format: @anthropic
105
+ )
106
+ response = client.send_messages([message], model: @model, max_tokens: MAX_TOKENS)
107
+ extract_text(response)
108
+ end
109
+
110
+ # Client#send_messages returns the raw upstream string for OpenAI/Anthropic;
111
+ # for Bedrock it returns the parsed text content. Normalise to String.
112
+ private def extract_text(response)
113
+ case response
114
+ when String then response
115
+ when Hash then response[:content] || response["content"] || response.to_s
116
+ else response.to_s
117
+ end
118
+ end
119
+
120
+ # ── Cache ─────────────────────────────────────────────────────────────
121
+
122
+ private def cache_key(bytes, prompt)
123
+ sha = Digest::SHA256.hexdigest(bytes)
124
+ prompt_sha = Digest::SHA256.hexdigest(prompt)[0, 12]
125
+ "#{sha}_#{@model.gsub(/[^A-Za-z0-9_.-]/, '_')}_#{prompt_sha}"
126
+ end
127
+
128
+ private def cache_path(key)
129
+ File.join(CACHE_DIR, "#{key}.json")
130
+ end
131
+
132
+ private def cache_get(bytes, prompt)
133
+ path = cache_path(cache_key(bytes, prompt))
134
+ return nil unless File.exist?(path)
135
+ data = JSON.parse(File.read(path))
136
+ return nil unless data["v"] == CACHE_VERSION
137
+ data["text"]
138
+ rescue JSON::ParserError, Errno::ENOENT
139
+ nil
140
+ end
141
+
142
+ private def cache_put(bytes, prompt, text)
143
+ FileUtils.mkdir_p(CACHE_DIR)
144
+ path = cache_path(cache_key(bytes, prompt))
145
+ File.write(path, JSON.generate({
146
+ "v" => CACHE_VERSION,
147
+ "model" => @model,
148
+ "text" => text,
149
+ "ts" => Time.now.to_i
150
+ }))
151
+ rescue => _
152
+ # Cache is best-effort — never fail the request because we can't write.
153
+ nil
154
+ end
155
+ end
156
+ end
157
+ end