openclacky 1.2.13 → 1.2.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -127,6 +127,7 @@ module Clacky
127
127
  @start_time = nil
128
128
  @ticker = nil
129
129
  @state = :fresh # :fresh → :running → :closed
130
+ @unregistered = false
130
131
  @metadata = {}
131
132
  @last_chunk_at = nil
132
133
  @monitor = Monitor.new
@@ -172,34 +173,37 @@ module Clacky
172
173
  end
173
174
 
174
175
  # Stop the ticker, render one final frame, and unregister from the
175
- # owner. Idempotent calling twice is a no-op.
176
+ # owner. Idempotent and crash-safe if a previous finish was
177
+ # interrupted (e.g. Thread#raise(AgentInterrupted) hit between
178
+ # +stop_ticker+ and +unregister_progress+), a follow-up finish
179
+ # will still complete the unregister so the handle does not stay
180
+ # orphaned on the owner's progress stack.
176
181
  #
177
182
  # @param final_message [String, nil] Optional override for the last
178
183
  # frame. If nil, the handle composes "<message>… (<elapsed>s)".
179
184
  def finish(final_message: nil)
180
- Clacky::Logger.warn("[ph_debug] finish_entry", oid: object_id, state: @state, msg: @message, eid: @entry_id)
185
+ Clacky::Logger.warn("[ph_debug] finish_entry", oid: object_id, state: @state, unreg: @unregistered, msg: @message, eid: @entry_id)
181
186
  snapshot = @monitor.synchronize do
182
- if @state != :running
183
- Clacky::Logger.warn("[ph_debug] finish_noop_state", oid: object_id, state: @state)
184
- return
185
- end
186
- @state = :closed
187
- { message: final_message || @message, elapsed: elapsed_seconds }
187
+ return if @unregistered
188
+ first_close = @state == :running
189
+ @state = :closed if first_close
190
+ {
191
+ first_close: first_close,
192
+ message: final_message || @message,
193
+ elapsed: elapsed_seconds,
194
+ }
188
195
  end
189
196
 
190
197
  stop_ticker
191
- # Collapse fast-finishers to a removed entry so tools that complete
192
- # in under FAST_FINISH_THRESHOLD_SECONDS don't leave a permanent
193
- # "Executing foo… (0s)" line. The owner interprets final_frame: nil
194
- # as "remove the entry entirely".
195
198
  final_frame =
196
199
  if @quiet_on_fast_finish && snapshot[:elapsed] < FAST_FINISH_THRESHOLD_SECONDS
197
200
  nil
198
201
  else
199
202
  compose_final_frame(snapshot[:message], snapshot[:elapsed])
200
203
  end
201
- Clacky::Logger.warn("[ph_debug] finish_unregister", oid: object_id, eid: @entry_id, final_frame: final_frame.to_s[0, 200])
204
+ Clacky::Logger.warn("[ph_debug] finish_unregister", oid: object_id, eid: @entry_id, first_close: snapshot[:first_close], final_frame: final_frame.to_s[0, 200])
202
205
  @owner.unregister_progress(self, final_frame: final_frame)
206
+ @monitor.synchronize { @unregistered = true }
203
207
  Clacky::Logger.warn("[ph_debug] finish_done", oid: object_id)
204
208
  end
205
209
  alias_method :cancel, :finish
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Clacky
4
- VERSION = "1.2.13"
4
+ VERSION = "1.2.14"
5
5
  end
@@ -0,0 +1,157 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "digest"
4
+ require "base64"
5
+ require "fileutils"
6
+ require "json"
7
+ require_relative "../utils/file_processor"
8
+
9
module Clacky
  module Vision
    # OCR sidecar — turns image bytes into a text description by calling a
    # vision-capable model. Used when the user's primary model is text-only
    # (e.g. DeepSeek V4) so that uploaded images and tool screenshots still
    # reach the conversation as useful context.
    #
    # Routes through Clacky::Client so we get the same OpenAI/Anthropic/
    # Bedrock format negotiation, retry, and credit-error handling as the
    # main agent path. Image content travels as a canonical `image_url`
    # block (the unified internal shape understood by all three formats).
    #
    # Successful descriptions are cached on disk under CACHE_DIR, keyed by
    # image bytes, model name and prompt. The cache is best-effort in both
    # directions: a failed read is a miss, a failed write is ignored.
    class Resolver
      DEFAULT_PROMPT = <<~PROMPT.strip.freeze
        Extract every legible text and describe the visual content of this image.
        Output as Markdown. Preserve table layout where possible (use Markdown tables).
        For UI screenshots, describe the layout, visible labels, and active state.
        Be thorough but concise — the user cannot see the image and must rely on
        your description.
      PROMPT

      MAX_TOKENS = 8192
      CACHE_DIR = File.join(Dir.home, ".clacky", "ocr_cache").freeze
      CACHE_VERSION = 1

      # Outcome of a #describe call. +text+ is set only for :ok, +error+
      # only for :call_failed.
      Result = Struct.new(:status, :text, :error, keyword_init: true) do
        def ok?; status == :ok; end
        def empty?; status == :empty; end
        def call_failed?; status == :call_failed; end
        def bad_image?; status == :bad_image; end
      end

      # @param model_entry [Hash] string-keyed model config; reads "model",
      #   "base_url", "api_key" and "anthropic_format".
      def initialize(model_entry)
        @model_entry = model_entry
        @model = model_entry["model"]
        @base_url = model_entry["base_url"]
        @api_key = model_entry["api_key"]
        @anthropic = !!model_entry["anthropic_format"]
      end

      # Describe an image, consulting the on-disk cache first.
      #
      # @param image [Hash] one of +bytes+ (+ optional +mime_type+),
      #   +data_url+ or +path+; symbol or string keys are both accepted.
      # @param prompt [String, nil] instruction for the model; blank falls
      #   back to DEFAULT_PROMPT.
      # @return [Result] one of:
      #   status=:ok + text            — sidecar produced a description
      #   status=:empty                — sidecar returned 200 but no usable text (e.g. token budget exhausted by reasoning)
      #   status=:call_failed + error  — network/parse/auth error from the sidecar
      #   status=:bad_image            — image bytes unreadable / empty
      def describe(image, prompt: nil)
        prompt = prompt.to_s.strip
        prompt = DEFAULT_PROMPT if prompt.empty?

        bytes, mime = read_image(image)
        return Result.new(status: :bad_image) if bytes.nil? || bytes.empty?

        cached = cache_get(bytes, prompt)
        return Result.new(status: :ok, text: cached) if cached

        text = call_vlm(bytes, mime, prompt)
        return Result.new(status: :empty) if text.nil? || text.strip.empty?

        cache_put(bytes, prompt, text)
        Result.new(status: :ok, text: text)
      rescue => e
        Clacky::Logger.warn("[Vision::Resolver] failed: #{e.class}: #{e.message}") if defined?(Clacky::Logger)
        Result.new(status: :call_failed, error: "#{e.class}: #{e.message}")
      end

      # Look up +name+ under its symbol key first, then its string key, so
      # callers may hand us either flavour of hash.
      private def image_field(image, name)
        image[name] || image[name.to_s]
      end

      # Resolve the image argument into [bytes, mime_type].
      # Returns [nil, nil] when no usable source is present, the data URL
      # is malformed, or the path is not a readable regular file — the
      # caller maps that to :bad_image.
      private def read_image(image)
        bytes = image_field(image, :bytes)
        return [bytes, image_field(image, :mime_type) || "image/png"] if bytes

        if (url = image_field(image, :data_url))
          m = url.match(/\Adata:([^;]+);base64,(.*)\z/m)
          return [nil, nil] unless m

          [Base64.decode64(m[2]), m[1]]
        elsif (path = image_field(image, :path))
          # File.file? (not exist?) so a directory is rejected up front
          # instead of raising EISDIR from binread.
          return [nil, nil] unless File.file?(path)

          [File.binread(path), Utils::FileProcessor.detect_mime_type(path, nil) || "image/png"]
        else
          [nil, nil]
        end
      rescue SystemCallError
        # Permission denied, file vanished between check and read, etc.:
        # the bytes are unreadable, which is a bad image, not a failed call.
        [nil, nil]
      end

      # Send the image plus prompt to the configured model as a single
      # user message and return the response text.
      private def call_vlm(bytes, mime, prompt)
        data_url = "data:#{mime};base64,#{Base64.strict_encode64(bytes)}"
        message = {
          role: "user",
          content: [
            { type: "text", text: prompt },
            { type: "image_url", image_url: { url: data_url } }
          ]
        }

        client = Clacky::Client.new(
          @api_key,
          base_url: @base_url,
          model: @model,
          anthropic_format: @anthropic
        )
        response = client.send_messages([message], model: @model, max_tokens: MAX_TOKENS)
        extract_text(response)
      end

      # Client#send_messages returns the raw upstream string for OpenAI/Anthropic;
      # for Bedrock it returns the parsed text content. Normalise to String.
      private def extract_text(response)
        case response
        when String then response
        when Hash   then response[:content] || response["content"] || response.to_s
        else response.to_s
        end
      end

      # ── Cache ─────────────────────────────────────────────────────────────

      # Cache key: full SHA-256 of the bytes, the model name with unsafe
      # filename characters replaced, and a 12-char prompt digest.
      private def cache_key(bytes, prompt)
        sha = Digest::SHA256.hexdigest(bytes)
        prompt_sha = Digest::SHA256.hexdigest(prompt)[0, 12]
        # to_s so a missing model name cannot raise before the call is made.
        "#{sha}_#{@model.to_s.gsub(/[^A-Za-z0-9_.-]/, '_')}_#{prompt_sha}"
      end

      private def cache_path(key)
        File.join(CACHE_DIR, "#{key}.json")
      end

      # Return the cached description, or nil on any kind of miss.
      # A corrupt, unreadable, wrong-version or wrong-shape entry is a miss
      # rather than an error, so the caller falls through to the model.
      private def cache_get(bytes, prompt)
        path = cache_path(cache_key(bytes, prompt))
        return nil unless File.file?(path)

        data = JSON.parse(File.read(path))
        return nil unless data.is_a?(Hash) && data["v"] == CACHE_VERSION

        text = data["text"]
        text if text.is_a?(String) && !text.strip.empty?
      rescue StandardError
        nil
      end

      # Persist a description for later calls with the same key.
      private def cache_put(bytes, prompt, text)
        FileUtils.mkdir_p(CACHE_DIR)
        path = cache_path(cache_key(bytes, prompt))
        File.write(path, JSON.generate({
          "v"     => CACHE_VERSION,
          "model" => @model,
          "text"  => text,
          "ts"    => Time.now.to_i
        }))
      rescue StandardError
        # Cache is best-effort — never fail the request because we can't write.
        nil
      end
    end
  end
end
@@ -505,12 +505,13 @@ const I18n = (() => {
505
505
  "settings.models.badge.default": "Default",
506
506
  "settings.models.badge.lite": "Lite",
507
507
  "settings.media.title": "Media Generation",
508
- "settings.media.desc": "Optional. Image / video / audio generation models.",
508
+ "settings.media.desc": "Optional. Image / video / audio / OCR sidecar models.",
509
509
  "settings.media.loading": "Loading…",
510
510
  "settings.media.error": "Failed to load: {{msg}}",
511
511
  "settings.media.kind.image": "Image",
512
512
  "settings.media.kind.video": "Video",
513
513
  "settings.media.kind.audio": "Audio",
514
+ "settings.media.kind.ocr": "OCR",
514
515
  "settings.media.source.off": "Off",
515
516
  "settings.media.source.auto": "Auto",
516
517
  "settings.media.source.custom": "Custom",
@@ -1245,12 +1246,13 @@ const I18n = (() => {
1245
1246
  "settings.models.badge.default": "默认",
1246
1247
  "settings.models.badge.lite": "轻量",
1247
1248
  "settings.media.title": "媒体生成",
1248
- "settings.media.desc": "可选。图片 / 视频 / 音频 生成模型。",
1249
+ "settings.media.desc": "可选。图片 / 视频 / 音频 / 图片理解(OCR)副模型。",
1249
1250
  "settings.media.loading": "加载中…",
1250
1251
  "settings.media.error": "加载失败:{{msg}}",
1251
1252
  "settings.media.kind.image": "图片",
1252
1253
  "settings.media.kind.video": "视频",
1253
1254
  "settings.media.kind.audio": "音频",
1255
+ "settings.media.kind.ocr": "OCR",
1254
1256
  "settings.media.source.off": "关闭",
1255
1257
  "settings.media.source.auto": "自动",
1256
1258
  "settings.media.source.custom": "自定义",
@@ -1528,7 +1528,7 @@ const Settings = (() => {
1528
1528
  // The state object per kind:
1529
1529
  // { source, configured, model, base_url, api_key_masked, provider, available }
1530
1530
 
1531
- const MEDIA_KINDS = ["image", "video", "audio"];
1531
+ const MEDIA_KINDS = ["image", "video", "audio", "ocr"];
1532
1532
  let _mediaState = null;
1533
1533
  let _mediaDefaults = null;
1534
1534
  const _mediaCustomDraft = {};
@@ -1538,10 +1538,16 @@ const Settings = (() => {
1538
1538
  if (!container) return;
1539
1539
  container.innerHTML = `<div class="settings-loading">${I18n.t("settings.media.loading")}</div>`;
1540
1540
  try {
1541
- const res = await fetch("/api/config/media");
1542
- const data = await res.json();
1543
- _mediaState = data.media || {};
1544
- _mediaDefaults = data.default_provider || {};
1541
+ const [mediaRes, ocrRes] = await Promise.all([
1542
+ fetch("/api/config/media"),
1543
+ fetch("/api/config/ocr")
1544
+ ]);
1545
+ const mediaData = await mediaRes.json();
1546
+ const ocrData = await ocrRes.json();
1547
+ _mediaState = mediaData.media || {};
1548
+ _mediaDefaults = mediaData.default_provider || {};
1549
+ _mediaState["ocr"] = ocrData.ocr || { source: "off", available: [] };
1550
+ _mediaDefaults["ocr"] = ocrData.default_provider || { available: [] };
1545
1551
  _renderMediaRows();
1546
1552
  } catch (e) {
1547
1553
  container.innerHTML = `<div class="settings-error">${I18n.t("settings.media.error", { msg: e.message })}</div>`;
@@ -1557,6 +1563,14 @@ const Settings = (() => {
1557
1563
  });
1558
1564
  }
1559
1565
 
1566
+ function _refreshKindRows(_kind) {
1567
+ _renderMediaRows();
1568
+ }
1569
+
1570
+ async function _reloadKind(_kind) {
1571
+ await _loadMedia();
1572
+ }
1573
+
1560
1574
  function _renderMediaRow(kind) {
1561
1575
  const state = (_mediaState && _mediaState[kind]) || { source: "off", available: [] };
1562
1576
  const def = (_mediaDefaults && _mediaDefaults[kind]) || { available: [] };
@@ -1662,7 +1676,7 @@ const Settings = (() => {
1662
1676
  _setMediaResult(kind, "testing", I18n.t("settings.media.action.saving"));
1663
1677
  try {
1664
1678
  await _saveMediaConfig(kind, payload);
1665
- await _loadMedia();
1679
+ await _reloadKind(kind);
1666
1680
  } catch (e) {
1667
1681
  sel.disabled = false;
1668
1682
  _setMediaResult(kind, "fail", e.message);
@@ -1742,7 +1756,7 @@ const Settings = (() => {
1742
1756
  base_url: state.base_url || "",
1743
1757
  api_key: ""
1744
1758
  };
1745
- _renderMediaRows();
1759
+ _refreshKindRows(kind);
1746
1760
  });
1747
1761
 
1748
1762
  const testBtn = document.createElement("button");
@@ -1818,7 +1832,7 @@ const Settings = (() => {
1818
1832
  const fallback = (_mediaDefaults && _mediaDefaults[kind] && _mediaDefaults[kind].model) ? "auto" : "off";
1819
1833
  _mediaState[kind] = { ..._mediaState[kind], source: fallback };
1820
1834
  }
1821
- _renderMediaRows();
1835
+ _refreshKindRows(kind);
1822
1836
  });
1823
1837
 
1824
1838
  const saveBtn = document.createElement("button");
@@ -1838,7 +1852,7 @@ const Settings = (() => {
1838
1852
  api_key: d.api_key || ""
1839
1853
  });
1840
1854
  delete _mediaCustomDraft[kind];
1841
- await _loadMedia();
1855
+ await _reloadKind(kind);
1842
1856
  } catch (e) {
1843
1857
  saveBtn.disabled = false;
1844
1858
  cancelBtn.disabled = false;
@@ -1906,7 +1920,8 @@ const Settings = (() => {
1906
1920
  }
1907
1921
 
1908
1922
  async function _saveMediaConfig(kind, body) {
1909
- const res = await fetch(`/api/config/media/${kind}`, {
1923
+ const url = kind === "ocr" ? `/api/config/ocr` : `/api/config/media/${kind}`;
1924
+ const res = await fetch(url, {
1910
1925
  method: "PATCH",
1911
1926
  headers: { "Content-Type": "application/json" },
1912
1927
  body: JSON.stringify(body)
@@ -1920,10 +1935,14 @@ const Settings = (() => {
1920
1935
 
1921
1936
  async function _testMediaConfig(kind, { model, base_url, api_key }) {
1922
1937
  try {
1923
- const res = await fetch(`/api/config/media/test`, {
1938
+ const url = kind === "ocr" ? `/api/config/ocr/test` : `/api/config/media/test`;
1939
+ const payload = kind === "ocr"
1940
+ ? { model, base_url, api_key }
1941
+ : { kind, model, base_url, api_key };
1942
+ const res = await fetch(url, {
1924
1943
  method: "POST",
1925
1944
  headers: { "Content-Type": "application/json" },
1926
- body: JSON.stringify({ kind, model, base_url, api_key })
1945
+ body: JSON.stringify(payload)
1927
1946
  });
1928
1947
  const data = await res.json().catch(() => ({}));
1929
1948
  if (!res.ok) return { ok: false, message: data.error || `HTTP ${res.status}` };
data/lib/clacky.rb CHANGED
@@ -128,6 +128,7 @@ require_relative "clacky/mcp/skill_provider"
128
128
  require_relative "clacky/media/base"
129
129
  require_relative "clacky/media/openai_compat"
130
130
  require_relative "clacky/media/generator"
131
+ require_relative "clacky/vision/resolver"
131
132
  require_relative "clacky/telemetry"
132
133
  require_relative "clacky/agent"
133
134
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: openclacky
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.13
4
+ version: 1.2.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - windy
@@ -359,6 +359,7 @@ files:
359
359
  - lib/clacky/default_parsers/pdf_parser.rb
360
360
  - lib/clacky/default_parsers/pdf_parser_ocr.py
361
361
  - lib/clacky/default_parsers/pdf_parser_plumber.py
362
+ - lib/clacky/default_parsers/pdf_parser_vlm.py
362
363
  - lib/clacky/default_parsers/pptx_parser.rb
363
364
  - lib/clacky/default_parsers/wps_parser.rb
364
365
  - lib/clacky/default_parsers/xlsx_parser.rb
@@ -532,6 +533,7 @@ files:
532
533
  - lib/clacky/utils/trash_directory.rb
533
534
  - lib/clacky/utils/workspace_rules.rb
534
535
  - lib/clacky/version.rb
536
+ - lib/clacky/vision/resolver.rb
535
537
  - lib/clacky/web/app.css
536
538
  - lib/clacky/web/app.js
537
539
  - lib/clacky/web/apple-touch-icon-180.png