openclacky 1.2.13 → 1.2.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.clacky/skills/gem-release/SKILL.md +4 -0
- data/CHANGELOG.md +16 -0
- data/lib/clacky/agent/session_serializer.rb +1 -0
- data/lib/clacky/agent.rb +123 -14
- data/lib/clacky/agent_config.rb +125 -8
- data/lib/clacky/client.rb +11 -1
- data/lib/clacky/default_parsers/pdf_parser.rb +70 -86
- data/lib/clacky/default_parsers/pdf_parser_vlm.py +136 -0
- data/lib/clacky/providers.rb +37 -0
- data/lib/clacky/server/http_server.rb +179 -4
- data/lib/clacky/ui2/progress_handle.rb +17 -13
- data/lib/clacky/version.rb +1 -1
- data/lib/clacky/vision/resolver.rb +157 -0
- data/lib/clacky/web/i18n.js +4 -2
- data/lib/clacky/web/settings.js +31 -12
- data/lib/clacky.rb +1 -0
- metadata +3 -1
|
@@ -8,113 +8,97 @@
|
|
|
8
8
|
#
|
|
9
9
|
# Output:
|
|
10
10
|
# stdout — extracted text content (UTF-8)
|
|
11
|
-
# stderr — error messages
|
|
11
|
+
# stderr — error / progress messages
|
|
12
12
|
# exit 0 — success
|
|
13
|
-
# exit 1 — failure
|
|
13
|
+
# exit 1 — hard failure (file unreadable, pdftotext missing, etc.)
|
|
14
14
|
#
|
|
15
|
-
#
|
|
15
|
+
# Strategy
|
|
16
|
+
# --------
|
|
17
|
+
# PDF pages naturally fall into two kinds: pages with a real text layer,
|
|
18
|
+
# and scanned-image pages. The right tool is a per-page property, not a
|
|
19
|
+
# document-level one. So:
|
|
16
20
|
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
# (→ pdf_parser_ocr.py)
|
|
21
|
+
# 1. Run pdftotext once over the whole file (`-layout`), split by `\f`.
|
|
22
|
+
# 2. Pages with enough bytes → emit text directly.
|
|
23
|
+
# 3. Pages below threshold → list page numbers in a Notice section
|
|
24
|
+
# with a shell command template the agent can run on demand to
|
|
25
|
+
# render a specific page to PNG, then file_reader that PNG.
|
|
23
26
|
#
|
|
24
|
-
#
|
|
25
|
-
#
|
|
26
|
-
#
|
|
27
|
-
#
|
|
27
|
+
# The parser does NOT pre-render images. Most weak pages will never be
|
|
28
|
+
# read (the answer is often already in the text-layer pages). Rendering
|
|
29
|
+
# all of them up front is wasteful — 55 pages takes ~14s and most goes
|
|
30
|
+
# to waste. The agent decides when (and which page) to OCR based on the
|
|
31
|
+
# user's actual question.
|
|
28
32
|
#
|
|
29
|
-
# VERSION:
|
|
33
|
+
# VERSION: 6
|
|
30
34
|
|
|
31
35
|
require "open3"
|
|
32
36
|
|
|
33
|
-
|
|
34
|
-
# miss and the next fallback is tried.
|
|
35
|
-
MIN_CONTENT_BYTES = 20
|
|
37
|
+
MIN_PAGE_BYTES = 20
|
|
36
38
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
SCRIPT_DIR = File.dirname(File.expand_path(__FILE__))
|
|
41
|
-
|
|
42
|
-
def try_pdftotext(path)
|
|
43
|
-
stdout, _stderr, status = Open3.capture3("pdftotext", "-layout", "-enc", "UTF-8", path, "-")
|
|
44
|
-
return nil unless status.success?
|
|
45
|
-
text = stdout.strip
|
|
46
|
-
return nil if text.bytesize < MIN_CONTENT_BYTES
|
|
47
|
-
text
|
|
48
|
-
rescue Errno::ENOENT
|
|
49
|
-
nil # pdftotext not installed
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
def try_pdfplumber(path)
|
|
53
|
-
script = File.join(SCRIPT_DIR, "pdf_parser_plumber.py")
|
|
54
|
-
return nil unless File.exist?(script)
|
|
55
|
-
|
|
56
|
-
stdout, _stderr, status = Open3.capture3("python3", script, path)
|
|
57
|
-
return nil unless status.success?
|
|
58
|
-
text = stdout.strip
|
|
59
|
-
return nil if text.bytesize < MIN_CONTENT_BYTES
|
|
60
|
-
text
|
|
61
|
-
rescue Errno::ENOENT
|
|
62
|
-
nil # python3 not available
|
|
39
|
+
# Print a fatal error message on stderr and stop the script with status 1.
#
# @param msg [String] text to report before exiting
def die(msg)
  Kernel.warn(msg)
  Kernel.exit(1)
end
|
|
64
43
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
# macOS: brew install tesseract tesseract-lang poppler
|
|
70
|
-
# pip3 install pytesseract pdf2image
|
|
71
|
-
# Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils
|
|
72
|
-
# pip3 install pytesseract pdf2image
|
|
73
|
-
def try_ocr(path)
|
|
74
|
-
# Quick capability check — avoid spawning python if tesseract is missing.
|
|
75
|
-
_stdout, _stderr, status = Open3.capture3("tesseract", "--version")
|
|
76
|
-
return nil unless status.success?
|
|
77
|
-
|
|
78
|
-
script = File.join(SCRIPT_DIR, "pdf_parser_ocr.py")
|
|
79
|
-
return nil unless File.exist?(script)
|
|
80
|
-
|
|
81
|
-
stdout, stderr, status = Open3.capture3("python3", script, path)
|
|
44
|
+
# Run pdftotext once over the whole document and split its output into pages.
#
# pdftotext separates pages with a form feed; a trailing blank chunk after the
# last form feed is dropped and every remaining page is stripped.
#
# @param path [String] path of the PDF to read
# @return [Array<String>, nil] one stripped string per page, or nil when
#   pdftotext exits unsuccessfully or is not installed (a message is written
#   to stderr in both cases)
def pdftotext_pages(path)
  command = ["pdftotext", "-layout", "-enc", "UTF-8", path, "-"]
  out, err, result = Open3.capture3(*command)

  if result.success?
    chunks = out.split("\f", -1)
    trailing = chunks.last
    chunks.pop if trailing && trailing.strip.empty?
    chunks.map { |chunk| chunk.strip }
  else
    warn "pdftotext failed: #{err.strip}"
    nil
  end
rescue Errno::ENOENT
  warn "pdftotext not found. Install poppler (`brew install poppler` / `apt install poppler-utils`)."
  nil
end
|
|
92
59
|
|
|
93
|
-
|
|
60
|
+
# Script entry point: print the text layer of every page and, when some pages
# have none, append a Notice telling the caller how to render those pages to
# PNG on demand.
#
# Exits 0 after writing the result to stdout; exits 1 (via `die`) when the
# argument is missing, the file does not exist, or pdftotext yields nothing.
#
# @param argv [Array<String>] command-line arguments; argv[0] is the PDF path
def main(argv)
  # Only the Notice needs Shellwords, so it is loaded here rather than at the
  # top of the script.
  require "shellwords"

  die "Usage: pdf_parser.rb <file_path>" if argv.empty?
  path = argv[0]
  die "File not found: #{path}" unless File.file?(path)

  pages = pdftotext_pages(path)
  die "Could not extract text from PDF." if pages.nil?

  # Pages whose extracted text is shorter than MIN_PAGE_BYTES are treated as
  # having no usable text layer and are collected for the Notice.
  weak = []
  body_chunks = pages.each_with_index.map do |text, idx|
    n = idx + 1
    if text.bytesize >= MIN_PAGE_BYTES
      "--- Page #{n} ---\n\n#{text}"
    else
      weak << n
      "--- Page #{n} ---\n\n[no extractable text layer]"
    end
  end

  output = body_chunks.join("\n\n")

  if weak.any?
    # The path is pasted into a shell command the reader is expected to run,
    # so it must be shell-escaped. String#inspect only produces Ruby quoting,
    # which leaves `$`, backticks and backslashes live inside double quotes.
    shell_path = Shellwords.escape(File.expand_path(path))
    notice = +"\n\n--- Notice ---\n\n"
    notice << "#{weak.size} of #{pages.size} pages have no extractable text layer "
    notice << "(likely scanned images).\n"
    notice << "Pages without text: #{weak.join(', ')}\n\n"
    notice << "To OCR a specific page, render it to PNG via shell, then "
    notice << "file_reader the PNG (it will be transcribed via the "
    notice << "vision/OCR pipeline):\n\n"
    notice << "  pdftoppm -r 150 -f <N> -l <N> -png -singlefile "
    notice << "#{shell_path} /tmp/clacky-pdf-page-<N>\n"
    notice << "  # produces /tmp/clacky-pdf-page-<N>.png\n\n"
    notice << "Only render pages you actually need. If the user's question "
    notice << "is already answered by the extracted text above, skip OCR.\n"
    output << notice
  end

  $stdout.write(output)
  $stdout.write("\n") unless output.end_with?("\n")
  exit 0
end
|
|
103
|
+
|
|
104
|
+
main(ARGV) if __FILE__ == $PROGRAM_NAME
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Clacky PDF Parser — VLM (Vision Language Model) extractor
|
|
5
|
+
|
|
6
|
+
Renders each PDF page to PNG via pdftoppm (poppler), then asks the
|
|
7
|
+
configured OCR sidecar (e.g. gemini-3-5-flash, gpt-4o-mini) to transcribe
|
|
8
|
+
each page through the local Clacky server's internal OCR endpoint.
|
|
9
|
+
|
|
10
|
+
Why through HTTP and not direct API call?
|
|
11
|
+
The OCR sidecar config (model, base_url, api_key) lives in the agent's
|
|
12
|
+
~/.clacky/config.yml. We don't re-implement that lookup here — instead
|
|
13
|
+
the local Clacky server exposes /api/internal/ocr-image which already
|
|
14
|
+
has the agent_config in scope. This parser stays a thin client.
|
|
15
|
+
|
|
16
|
+
Usage:
|
|
17
|
+
python3 pdf_parser_vlm.py <file_path>
|
|
18
|
+
|
|
19
|
+
Stdout: extracted text (UTF-8), pages separated by `\\n\\n--- Page N ---\\n\\n`
|
|
20
|
+
Stderr: progress + error messages
|
|
21
|
+
Exit: 0 on success, 1 on failure (server unavailable, no sidecar, etc.)
|
|
22
|
+
|
|
23
|
+
Environment:
|
|
24
|
+
CLACKY_SERVER_HOST default 127.0.0.1
|
|
25
|
+
CLACKY_SERVER_PORT default 7070
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
import json
|
|
29
|
+
import os
|
|
30
|
+
import subprocess
|
|
31
|
+
import sys
|
|
32
|
+
import tempfile
|
|
33
|
+
import urllib.error
|
|
34
|
+
import urllib.request
|
|
35
|
+
|
|
36
|
+
PAGE_SEPARATOR = "\n\n--- Page {n} ---\n\n"
|
|
37
|
+
RENDER_DPI = 150
|
|
38
|
+
REQUEST_TIMEOUT = 120 # seconds; VLMs can be slow
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def server_url():
    """Return the URL of the local Clacky server's internal OCR endpoint.

    Host and port are read from CLACKY_SERVER_HOST / CLACKY_SERVER_PORT and
    fall back to 127.0.0.1 and 7070 when unset.
    """
    env = os.environ
    return "http://{}:{}/api/internal/ocr-image".format(
        env.get("CLACKY_SERVER_HOST", "127.0.0.1"),
        env.get("CLACKY_SERVER_PORT", "7070"),
    )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def render_pages(pdf_path, out_dir):
    """Render every page of a PDF to PNG files inside out_dir via pdftoppm.

    Returns the rendered image paths ordered by page number, or an empty list
    (after writing a diagnostic to stderr) when pdftoppm cannot be started or
    exits with a non-zero status.
    """
    prefix = os.path.join(out_dir, "page")
    cmd = ["pdftoppm", "-r", str(RENDER_DPI), "-png", pdf_path, prefix]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as e:
        # A missing or non-executable pdftoppm raises here. Report it and
        # return [] so the caller prints its install hint instead of the
        # script dying with a traceback.
        sys.stderr.write(f"pdftoppm could not be started: {e}\n")
        return []
    if proc.returncode != 0:
        sys.stderr.write(f"pdftoppm failed: {proc.stderr.strip()}\n")
        return []

    def page_number(name):
        # "page-<digits>.png" -> int; anything unexpected sorts first.
        digits = name[len("page-"):-len(".png")]
        return int(digits) if digits.isdigit() else 0

    names = [
        f for f in os.listdir(out_dir)
        if f.startswith("page-") and f.endswith(".png")
    ]
    # Order numerically so page 10 never lands before page 2, whether or not
    # the page suffix is zero-padded.
    names.sort(key=lambda name: (page_number(name), name))
    return [os.path.join(out_dir, name) for name in names]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def transcribe_page(image_path, page_num):
    """POST one rendered page image to the OCR endpoint and return its text.

    The image and a transcription prompt are sent as multipart/form-data to
    server_url(). Returns the "text" field of the JSON reply on success, or
    None (after writing a diagnostic to stderr) when the request fails, the
    server answers with an HTTP error, or the reply is not ok.
    """
    with open(image_path, "rb") as f:
        body = f.read()

    boundary = "----clacky-vlm-boundary"
    prompt = (
        f"This is page {page_num} of a scanned PDF. Extract every legible text "
        "verbatim, preserving reading order. Render tables as Markdown tables. "
        "Skip decorative elements. Output plain Markdown only — no commentary."
    ).encode()
    payload = b"".join([
        f"--{boundary}\r\n".encode(),
        b'Content-Disposition: form-data; name="image"; filename="page.png"\r\n'
        b"Content-Type: image/png\r\n\r\n",
        body,
        f"\r\n--{boundary}\r\n".encode(),
        b'Content-Disposition: form-data; name="prompt"\r\n\r\n',
        prompt,
        f"\r\n--{boundary}--\r\n".encode(),
    ])

    req = urllib.request.Request(
        server_url(),
        data=payload,
        headers={"Content-Type": f"multipart/form-data; boundary={boundary}"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        # HTTPError is a URLError subclass, so it has to be caught first. The
        # server WAS reached and answered with an error status; surface the
        # JSON "message" it sent rather than calling it unreachable.
        try:
            answer = json.loads(e.read().decode("utf-8"))
            detail = answer.get("message") if isinstance(answer, dict) else None
        except Exception:
            detail = None
        sys.stderr.write(f"page {page_num}: HTTP {e.code}: {detail or e.reason}\n")
        return None
    except urllib.error.URLError as e:
        sys.stderr.write(f"page {page_num}: server unreachable ({e})\n")
        return None
    except Exception as e:
        sys.stderr.write(f"page {page_num}: {e}\n")
        return None

    # Guard against a valid-JSON reply that is not an object.
    if not isinstance(data, dict):
        sys.stderr.write(f"page {page_num}: unexpected response\n")
        return None
    if not data.get("ok"):
        sys.stderr.write(f"page {page_num}: {data.get('message', 'unknown error')}\n")
        return None
    return data.get("text", "")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def main():
    """Transcribe every page of the PDF named on the command line.

    Renders the pages into a temporary directory, transcribes them one by one,
    and writes the page-separated text to stdout. Exits with status 1 on a
    usage error, a missing file, a render failure, or the first page that
    cannot be transcribed.
    """
    args = sys.argv[1:]
    if len(args) != 1:
        sys.stderr.write("Usage: pdf_parser_vlm.py <file_path>\n")
        sys.exit(1)

    (path,) = args
    if not os.path.exists(path):
        sys.stderr.write(f"File not found: {path}\n")
        sys.exit(1)

    with tempfile.TemporaryDirectory(prefix="clacky_vlm_") as tmp:
        pages = render_pages(path, tmp)
        if not pages:
            sys.stderr.write("Failed to render PDF pages (is poppler installed?)\n")
            sys.exit(1)

        sys.stderr.write(f"VLM OCR: {len(pages)} page(s) to transcribe...\n")
        pieces = []
        for number, image in enumerate(pages, start=1):
            text = transcribe_page(image, number)
            if text is None:
                # Server unreachable / no sidecar: stop so the caller can fall back.
                sys.exit(1)
            pieces.append(PAGE_SEPARATOR.format(n=number) + text)

        sys.stdout.write("".join(pieces).strip())
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == "__main__":
|
|
136
|
+
main()
|
data/lib/clacky/providers.rb
CHANGED
|
@@ -59,6 +59,10 @@ module Clacky
|
|
|
59
59
|
"or-gpt-image-2" => "GPT Image 2"
|
|
60
60
|
},
|
|
61
61
|
"default_image_model" => "or-gpt-image-2",
|
|
62
|
+
# Default OCR sidecar — used when the primary model is text-only.
|
|
63
|
+
# Candidates are derived from the provider's vision-capable models;
|
|
64
|
+
# this just picks the cheap+fast default to surface in "auto" mode.
|
|
65
|
+
"default_ocr_model" => "or-gemini-3-5-flash",
|
|
62
66
|
# Provider-level default: the Claude family served here is vision-capable.
|
|
63
67
|
"capabilities" => { "vision" => true }.freeze,
|
|
64
68
|
# Model-level overrides: DeepSeek models routed through this provider
|
|
@@ -145,6 +149,7 @@ module Clacky
|
|
|
145
149
|
# until we ship a dedicated client-side adapter for that protocol.
|
|
146
150
|
"image_models" => [],
|
|
147
151
|
"default_image_model" => nil,
|
|
152
|
+
"default_ocr_model" => "google/gemini-2.5-flash",
|
|
148
153
|
"website_url" => "https://openrouter.ai/keys"
|
|
149
154
|
}.freeze,
|
|
150
155
|
|
|
@@ -192,6 +197,7 @@ module Clacky
|
|
|
192
197
|
"model_capabilities" => {
|
|
193
198
|
"MiniMax-M3" => { "vision" => true }.freeze
|
|
194
199
|
}.freeze,
|
|
200
|
+
"default_ocr_model" => "MiniMax-M3",
|
|
195
201
|
"website_url" => "https://www.minimaxi.com/user-center/basic-information/interface-key"
|
|
196
202
|
}.freeze,
|
|
197
203
|
|
|
@@ -218,6 +224,7 @@ module Clacky
|
|
|
218
224
|
].freeze,
|
|
219
225
|
# k2.5 / k2.6 are multimodal; legacy k2 text-only models need model_capabilities override if added.
|
|
220
226
|
"capabilities" => { "vision" => true }.freeze,
|
|
227
|
+
"default_ocr_model" => "kimi-k2.5",
|
|
221
228
|
"website_url" => "https://platform.moonshot.cn/console/api-keys"
|
|
222
229
|
}.freeze,
|
|
223
230
|
|
|
@@ -265,6 +272,7 @@ module Clacky
|
|
|
265
272
|
"api" => "anthropic-messages",
|
|
266
273
|
"default_model" => "claude-sonnet-4-6",
|
|
267
274
|
"models" => ["claude-opus-4-8", "claude-opus-4-7", "claude-opus-4-6", "claude-sonnet-4-6", "claude-haiku-4-5"],
|
|
275
|
+
"default_ocr_model" => "claude-haiku-4-5",
|
|
268
276
|
"website_url" => "https://console.anthropic.com/settings/keys"
|
|
269
277
|
}.freeze,
|
|
270
278
|
|
|
@@ -279,6 +287,7 @@ module Clacky
|
|
|
279
287
|
"model_capabilities" => {
|
|
280
288
|
"mimo-v2-omni" => { "vision" => true }.freeze
|
|
281
289
|
}.freeze,
|
|
290
|
+
"default_ocr_model" => "mimo-v2-omni",
|
|
282
291
|
"website_url" => "https://platform.xiaomimimo.com/"
|
|
283
292
|
}.freeze,
|
|
284
293
|
|
|
@@ -308,6 +317,7 @@ module Clacky
|
|
|
308
317
|
"model_capabilities" => {
|
|
309
318
|
"glm-5v-turbo" => { "vision" => true }.freeze
|
|
310
319
|
}.freeze,
|
|
320
|
+
"default_ocr_model" => "glm-5v-turbo",
|
|
311
321
|
"website_url" => "https://open.bigmodel.cn/usercenter/apikeys"
|
|
312
322
|
}.freeze,
|
|
313
323
|
|
|
@@ -338,6 +348,7 @@ module Clacky
|
|
|
338
348
|
"gpt-image-2"
|
|
339
349
|
],
|
|
340
350
|
"default_image_model" => "gpt-image-2",
|
|
351
|
+
"default_ocr_model" => "gpt-5.4-mini",
|
|
341
352
|
"website_url" => "https://platform.openai.com/api-keys"
|
|
342
353
|
}.freeze,
|
|
343
354
|
|
|
@@ -363,6 +374,7 @@ module Clacky
|
|
|
363
374
|
"model_capabilities" => {
|
|
364
375
|
"qwen3.7-max" => { "vision" => false }.freeze
|
|
365
376
|
}.freeze,
|
|
377
|
+
"default_ocr_model" => "qwen3.6-flash",
|
|
366
378
|
"lite_models" => {
|
|
367
379
|
"qwen3.7-max" => "qwen3.6-flash",
|
|
368
380
|
"qwen3.6-plus" => "qwen3.6-flash",
|
|
@@ -529,6 +541,31 @@ module Clacky
|
|
|
529
541
|
preset&.dig("audio_models") || []
|
|
530
542
|
end
|
|
531
543
|
|
|
544
|
+
# OCR sidecar candidates: every chat model under this provider that's
|
|
545
|
+
# vision-capable. Derived from `vision` capability so we don't have
|
|
546
|
+
# to maintain a parallel list — a model that can see is by definition
|
|
547
|
+
# a candidate for "describe an image as text". Image-generation models
|
|
548
|
+
# are excluded (they take prompts and return pixels, not the other way).
|
|
549
|
+
# @param provider_id [String]
|
|
550
|
+
# @return [Array<String>]
|
|
551
|
+
def ocr_models(provider_id)
  preset = PRESETS[provider_id]
  return [] unless preset

  # Start from the provider's model list and keep only the entries that pass
  # the :vision capability check.
  candidates = preset["models"] || []
  candidates.select do |model_name|
    supports?(provider_id, :vision, model_name: model_name)
  end
end
|
|
556
|
+
|
|
557
|
+
# Default OCR sidecar model for a provider. Falls back to the first
|
|
558
|
+
# vision-capable model if the preset doesn't pin an explicit default.
|
|
559
|
+
# @param provider_id [String]
|
|
560
|
+
# @return [String, nil] nil when the provider has zero vision-capable models
|
|
561
|
+
def default_ocr_model(provider_id)
  preset = PRESETS[provider_id]
  return nil unless preset

  candidates = ocr_models(provider_id)
  pinned = preset["default_ocr_model"]

  # The pinned default is used only when it is itself one of the OCR
  # candidates; otherwise fall back to the first candidate (nil when none).
  if pinned && candidates.include?(pinned)
    pinned
  else
    candidates.first
  end
end
|
|
568
|
+
|
|
532
569
|
# Unified entry for media model lookup by kind.
|
|
533
570
|
# @param provider_id [String]
|
|
534
571
|
# @param kind [String] one of "image" / "video" / "audio"
|
|
@@ -40,15 +40,13 @@ module Clacky
|
|
|
40
40
|
url = f[:data_url] || f["data_url"]
|
|
41
41
|
name = f[:name] || f["name"]
|
|
42
42
|
path = f[:path] || f["path"]
|
|
43
|
+
type = f[:type] || f["type"] || ""
|
|
43
44
|
|
|
44
45
|
if url
|
|
45
46
|
url
|
|
46
|
-
elsif path && File.exist?(path.to_s)
|
|
47
|
-
# Reconstruct data_url from the tmp file (still present on disk)
|
|
47
|
+
elsif type.to_s == "image" && path && File.exist?(path.to_s)
|
|
48
48
|
Utils::FileProcessor.image_path_to_data_url(path) rescue "expired:#{name}"
|
|
49
49
|
elsif name
|
|
50
|
-
# File badge for non-image disk files, or image whose tmp file is gone
|
|
51
|
-
type = f[:type] || f["type"] || ""
|
|
52
50
|
type.to_s == "image" ? "expired:#{name}" : "pdf:#{name}"
|
|
53
51
|
end
|
|
54
52
|
end
|
|
@@ -440,6 +438,10 @@ module Clacky
|
|
|
440
438
|
when ["POST", "/api/config/test"] then api_test_config(req, res)
|
|
441
439
|
when ["POST", "/api/config/media/test"] then api_test_media_config(req, res)
|
|
442
440
|
when ["GET", "/api/config/media"] then api_get_media_config(res)
|
|
441
|
+
when ["GET", "/api/config/ocr"] then api_get_ocr_config(res)
|
|
442
|
+
when ["PATCH", "/api/config/ocr"] then api_update_ocr_config(req, res)
|
|
443
|
+
when ["POST", "/api/config/ocr/test"] then api_test_ocr_config(req, res)
|
|
444
|
+
when ["POST", "/api/internal/ocr-image"] then api_internal_ocr_image(req, res)
|
|
443
445
|
when ["GET", "/api/providers"] then api_list_providers(res)
|
|
444
446
|
when ["GET", "/api/onboard/status"] then api_onboard_status(res)
|
|
445
447
|
when ["GET", "/api/browser/status"] then api_browser_status(res)
|
|
@@ -1103,6 +1105,179 @@ module Clacky
|
|
|
1103
1105
|
json_response(res, 422, { error: e.message })
|
|
1104
1106
|
end
|
|
1105
1107
|
|
|
1108
|
+
# GET /api/config/ocr
|
|
1109
|
+
# Returns the OCR sidecar state for the Settings UI. Mirrors media_state
|
|
1110
|
+
# in shape so the UI can render OCR with the same row component.
|
|
1111
|
+
def api_get_ocr_config(res)
  state     = @agent_config.ocr_state
  ocr_entry = @agent_config.find_model_by_type("ocr")

  ocr = {
    source:          state["source"],
    model:           state["model"],
    base_url:        state["base_url"],
    # Only a masked form of the stored key is returned.
    api_key_masked:  ocr_entry ? mask_api_key(ocr_entry["api_key"]) : nil,
    provider:        state["provider"],
    available:       state["available"],
    stale:           state["stale"] || false,
    requested_model: state["requested_model"],
    configured:      state["configured"],
    primary:         state["primary"] || false
  }

  # Preview of what "auto" would pick: resolve the provider behind the current
  # default model, then ask Providers for its OCR default and candidates.
  default_entry = @agent_config.find_model_by_type("default")
  provider_id = default_entry && Clacky::Providers.resolve_provider(
    base_url: default_entry["base_url"],
    api_key:  default_entry["api_key"]
  )

  preview =
    if provider_id
      {
        provider:  provider_id,
        model:     Clacky::Providers.default_ocr_model(provider_id),
        available: Clacky::Providers.ocr_models(provider_id)
      }
    else
      { provider: provider_id, model: nil, available: [] }
    end

  json_response(res, 200, { ocr: ocr, default_provider: preview })
end
|
|
1144
|
+
|
|
1145
|
+
# PATCH /api/config/ocr
|
|
1146
|
+
# Body: { source: "off"|"auto"|"custom", model?, base_url?, api_key?,
|
|
1147
|
+
# anthropic_format? }
|
|
1148
|
+
# Mirrors api_update_media_config but for the single "ocr" type.
|
|
1149
|
+
def api_update_ocr_config(req, res)
  body   = parse_json_body(req) || {}
  source = body["source"].to_s
  unless %w[off auto custom].include?(source)
    return json_response(res, 422, { error: "invalid source" })
  end

  # Capture the stored key-bearing OCR entry BEFORE anything is removed. A
  # masked api_key ("…****…") from the UI means "keep the stored key", which
  # can only be honoured while the old entry is still in the list.
  previous = @agent_config.models.find { |m| m["type"] == "ocr" && m["api_key"] }

  # Build (and validate) the replacement entry first, so a rejected request
  # leaves the current configuration untouched.
  entry =
    case source
    when "off"
      { "id" => SecureRandom.uuid, "type" => "ocr", "disabled" => true }
    when "auto"
      # Auto mode stores an entry only when the caller names a model override.
      override = body["model"].to_s.strip
      override.empty? ? nil : { "id" => SecureRandom.uuid, "type" => "ocr", "model" => override }
    when "custom"
      model    = body["model"].to_s.strip
      base_url = body["base_url"].to_s.strip
      api_key  = body["api_key"].to_s
      if api_key.include?("****")
        api_key = previous ? previous["api_key"].to_s : ""
      end
      if model.empty? || base_url.empty? || api_key.empty?
        return json_response(res, 422, { error: "model, base_url, api_key are required" })
      end

      {
        "id"               => SecureRandom.uuid,
        "model"            => model,
        "base_url"         => base_url,
        "api_key"          => api_key,
        "anthropic_format" => body["anthropic_format"] || false,
        "type"             => "ocr"
      }
    end

  # Swap the old OCR entries for the new one only after validation passed.
  @agent_config.models.reject! { |m| m["type"] == "ocr" }
  @agent_config.models << entry if entry

  @agent_config.save
  json_response(res, 200, { ok: true, state: @agent_config.ocr_state })
rescue => e
  json_response(res, 422, { error: e.message })
end
|
|
1201
|
+
|
|
1202
|
+
# POST /api/config/ocr/test
|
|
1203
|
+
# Reuses the media preflight (GET /models) — same connectivity check.
|
|
1204
|
+
def api_test_ocr_config(req, res)
  params  = parse_json_body(req) || {}
  api_key = params["api_key"].to_s

  # An empty or masked key means "test with the key already stored".
  if api_key.empty? || api_key.include?("****")
    stored  = @agent_config.find_model_by_type("ocr") || {}
    api_key = stored["api_key"].to_s
  end

  model    = params["model"].to_s.strip
  base_url = params["base_url"].to_s.strip

  if [model, base_url, api_key].any?(&:empty?)
    return json_response(res, 200, { ok: false, message: "model, base_url, api_key are required" })
  end

  outcome = preflight_media_endpoint(base_url: base_url, api_key: api_key, model: model)
  json_response(res, 200, outcome)
rescue => e
  # Failures are reported in the payload with HTTP 200, like the other outcomes.
  json_response(res, 200, { ok: false, message: e.message })
end
|
|
1224
|
+
|
|
1225
|
+
# POST /api/internal/ocr-image
|
|
1226
|
+
# Internal endpoint used by parser scripts (e.g. pdf_parser_vlm.py) to
|
|
1227
|
+
# transcribe a single image via the configured OCR sidecar. Localhost-
|
|
1228
|
+
# only by virtue of the standard auth path: when the server binds to
|
|
1229
|
+
# 127.0.0.1 (@localhost_only), check_access_key returns true without
|
|
1230
|
+
# requiring a token, so parsers running on the same host can call this
|
|
1231
|
+
# endpoint with no extra wiring.
|
|
1232
|
+
#
|
|
1233
|
+
# Request: multipart/form-data with field "image" (binary), optional "prompt"
|
|
1234
|
+
# OR JSON body { "data_url": "data:image/png;base64,...", "prompt": "..." }
|
|
1235
|
+
# Response: { ok: true, text: "..." } or { ok: false, message: "..." }
|
|
1236
|
+
def api_internal_ocr_image(req, res)
  sidecar = @agent_config.find_model_by_type("ocr")
  return json_response(res, 503, { ok: false, message: "OCR sidecar not configured" }) unless sidecar

  prompt   = nil
  data_url = nil
  bytes    = nil
  mime     = "image/png"

  if req.content_type.to_s.start_with?("multipart/form-data")
    # Multipart upload: the image arrives as the "image" form field.
    form   = req.query
    upload = form["image"]
    if upload
      bytes    = upload.respond_to?(:read) ? upload.read : upload.to_s
      declared = upload.respond_to?(:[]) ? upload["content-type"].to_s : nil
      # Fall back to PNG when the part carries no content type of its own.
      mime = declared.nil? || declared.empty? ? "image/png" : declared
    end
    prompt = form["prompt"].to_s if form["prompt"]
  else
    # Anything else is treated as a JSON body carrying a data URL.
    json     = parse_json_body(req) || {}
    data_url = json["data_url"].to_s
    prompt   = json["prompt"].to_s if json["prompt"]
  end

  image =
    if bytes && !bytes.empty?
      { bytes: bytes, mime_type: mime }
    elsif data_url && !data_url.empty?
      { data_url: data_url }
    else
      return json_response(res, 400, { ok: false, message: "image or data_url required" })
    end

  text     = Clacky::Vision::Resolver.new(sidecar).describe(image, prompt: prompt)
  has_text = text && !text.strip.empty?
  reply    = has_text ? { ok: true, text: text } : { ok: false, message: "OCR returned empty result" }
  json_response(res, 200, reply)
rescue => e
  json_response(res, 500, { ok: false, message: e.message })
end
|
|
1280
|
+
|
|
1106
1281
|
# POST /api/onboard/complete
|
|
1107
1282
|
# Called after key setup is done (soul_setup is optional/skipped).
|
|
1108
1283
|
# Creates the default session if none exists yet, returns it.
|