openclacky 1.2.13 → 1.2.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,113 +8,97 @@
8
8
  #
9
9
  # Output:
10
10
  # stdout — extracted text content (UTF-8)
11
- # stderr — error messages
11
+ # stderr — error / progress messages
12
12
  # exit 0 — success
13
- # exit 1 — failure
13
+ # exit 1 — hard failure (file unreadable, pdftotext missing, etc.)
14
14
  #
15
- # This file lives in ~/.clacky/parsers/ and can be modified by the LLM.
15
+ # Strategy
16
+ # --------
17
+ # PDF pages naturally fall into two kinds: pages with a real text layer,
18
+ # and scanned-image pages. The right tool is a per-page property, not a
19
+ # document-level one. So:
16
20
  #
17
- # Extraction pipeline (first successful step wins):
18
- # 1. pdftotext (poppler) — fastest, text-based PDFs
19
- # 2. pdfplumber (Python) — handles more layouts
20
- # (→ pdf_parser_plumber.py)
21
- # 3. OCR (tesseract) — scanned / image-only PDFs
22
- # (→ pdf_parser_ocr.py)
21
+ # 1. Run pdftotext once over the whole file (`-layout`), split by `\f`.
22
+ # 2. Pages with enough bytes → emit text directly.
23
+ # 3. Pages below the threshold → list their page numbers in a Notice section
24
+ # with a shell command template the agent can run on demand to
25
+ # render a specific page to PNG, then file_reader that PNG.
23
26
  #
24
- # Each extractor is a plain, self-contained function. Python-backed steps
25
- # shell out to a sibling .py script so the LLM can edit them directly
26
- # (with proper syntax highlighting, linters, and per-file run/debug)
27
- # instead of wrestling with embedded heredocs.
27
+ # The parser does NOT pre-render images. Most weak pages will never be
28
+ # read (the answer is often already in the text-layer pages). Rendering
29
+ # all of them up front is wasteful — 55 pages takes ~14s and most goes
30
+ # to waste. The agent decides when (and which page) to OCR based on the
31
+ # user's actual question.
28
32
  #
29
- # VERSION: 3
33
+ # VERSION: 6
30
34
 
31
35
  require "open3"
32
36
 
33
- # Minimum useful output (in bytes). Below this, a step is considered a
34
- # miss and the next fallback is tried.
35
- MIN_CONTENT_BYTES = 20
37
+ MIN_PAGE_BYTES = 20
36
38
 
37
- # Script directory — resolve sibling .py helpers relative to this file
38
- # so it works both from the gem's default_parsers/ dir and from the
39
- # copied-to-user ~/.clacky/parsers/ dir.
40
- SCRIPT_DIR = File.dirname(File.expand_path(__FILE__))
41
-
42
- def try_pdftotext(path)
43
- stdout, _stderr, status = Open3.capture3("pdftotext", "-layout", "-enc", "UTF-8", path, "-")
44
- return nil unless status.success?
45
- text = stdout.strip
46
- return nil if text.bytesize < MIN_CONTENT_BYTES
47
- text
48
- rescue Errno::ENOENT
49
- nil # pdftotext not installed
50
- end
51
-
52
- def try_pdfplumber(path)
53
- script = File.join(SCRIPT_DIR, "pdf_parser_plumber.py")
54
- return nil unless File.exist?(script)
55
-
56
- stdout, _stderr, status = Open3.capture3("python3", script, path)
57
- return nil unless status.success?
58
- text = stdout.strip
59
- return nil if text.bytesize < MIN_CONTENT_BYTES
60
- text
61
- rescue Errno::ENOENT
62
- nil # python3 not available
39
+ def die(msg)
40
+ warn msg
41
+ exit 1
63
42
  end
64
43
 
65
- # OCR fallback for scanned/image-only PDFs.
66
- # See pdf_parser_ocr.py for the actual extraction logic.
67
- #
68
- # Installation hints (also printed on final failure):
69
- # macOS: brew install tesseract tesseract-lang poppler
70
- # pip3 install pytesseract pdf2image
71
- # Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils
72
- # pip3 install pytesseract pdf2image
73
- def try_ocr(path)
74
- # Quick capability check — avoid spawning python if tesseract is missing.
75
- _stdout, _stderr, status = Open3.capture3("tesseract", "--version")
76
- return nil unless status.success?
77
-
78
- script = File.join(SCRIPT_DIR, "pdf_parser_ocr.py")
79
- return nil unless File.exist?(script)
80
-
81
- stdout, stderr, status = Open3.capture3("python3", script, path)
44
+ def pdftotext_pages(path)
45
+ stdout, stderr, status = Open3.capture3(
46
+ "pdftotext", "-layout", "-enc", "UTF-8", path, "-"
47
+ )
82
48
  unless status.success?
83
- warn stderr.strip unless stderr.strip.empty?
49
+ warn "pdftotext failed: #{stderr.strip}"
84
50
  return nil
85
51
  end
86
- text = stdout.strip
87
- return nil if text.bytesize < MIN_CONTENT_BYTES
88
- text
52
+ pages = stdout.split("\f", -1)
53
+ pages.pop if pages.last && pages.last.strip.empty?
54
+ pages.map(&:strip)
89
55
  rescue Errno::ENOENT
90
- nil # tesseract or python3 not available
56
+ warn "pdftotext not found. Install poppler (`brew install poppler` / `apt install poppler-utils`)."
57
+ nil
91
58
  end
92
59
 
93
- # --- main ---
60
+ def main(argv)
61
+ die "Usage: pdf_parser.rb <file_path>" if argv.empty?
62
+ path = argv[0]
63
+ die "File not found: #{path}" unless File.file?(path)
94
64
 
95
- path = ARGV[0]
65
+ pages = pdftotext_pages(path)
66
+ die "Could not extract text from PDF." if pages.nil?
96
67
 
97
- if path.nil? || path.empty?
98
- warn "Usage: ruby pdf_parser.rb <file_path>"
99
- exit 1
100
- end
68
+ weak = []
69
+ body_chunks = []
70
+ pages.each_with_index do |text, idx|
71
+ n = idx + 1
72
+ if text.bytesize >= MIN_PAGE_BYTES
73
+ body_chunks << "--- Page #{n} ---\n\n#{text}"
74
+ else
75
+ body_chunks << "--- Page #{n} ---\n\n[no extractable text layer]"
76
+ weak << n
77
+ end
78
+ end
101
79
 
102
- unless File.exist?(path)
103
- warn "File not found: #{path}"
104
- exit 1
105
- end
80
+ output = body_chunks.join("\n\n")
106
81
 
107
- # Try each extractor in order; first non-nil result wins.
108
- text = try_pdftotext(path) || try_pdfplumber(path) || try_ocr(path)
82
+ if weak.any?
83
+ abs_path = File.expand_path(path)
84
+ notice = +"\n\n--- Notice ---\n\n"
85
+ notice << "#{weak.size} of #{pages.size} pages have no extractable text layer "
86
+ notice << "(likely scanned images).\n"
87
+ notice << "Pages without text: #{weak.join(', ')}\n\n"
88
+ notice << "To OCR a specific page, render it to PNG via shell, then "
89
+ notice << "file_reader the PNG (it will be transcribed via the "
90
+ notice << "vision/OCR pipeline):\n\n"
91
+ notice << " pdftoppm -r 150 -f <N> -l <N> -png -singlefile "
92
+ notice << "#{abs_path.inspect} /tmp/clacky-pdf-page-<N>\n"
93
+ notice << " # produces /tmp/clacky-pdf-page-<N>.png\n\n"
94
+ notice << "Only render pages you actually need. If the user's question "
95
+ notice << "is already answered by the extracted text above, skip OCR.\n"
96
+ output << notice
97
+ end
109
98
 
110
- if text
111
- print text
99
+ $stdout.write(output)
100
+ $stdout.write("\n") unless output.end_with?("\n")
112
101
  exit 0
113
- else
114
- warn "Could not extract text from PDF."
115
- warn "For text-based PDFs, install poppler: brew install poppler (macOS) / apt install poppler-utils (Linux)"
116
- warn "For scanned PDFs (OCR):"
117
- warn " macOS: brew install tesseract tesseract-lang poppler && pip3 install pytesseract pdf2image"
118
- warn " Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils && pip3 install pytesseract pdf2image"
119
- exit 1
120
102
  end
103
+
104
+ main(ARGV) if __FILE__ == $PROGRAM_NAME
@@ -0,0 +1,136 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ Clacky PDF Parser — VLM (Vision Language Model) extractor
5
+
6
+ Renders each PDF page to PNG via pdftoppm (poppler), then asks the
7
+ configured OCR sidecar (e.g. gemini-3-5-flash, gpt-4o-mini) to transcribe
8
+ each page through the local Clacky server's internal OCR endpoint.
9
+
10
+ Why through HTTP and not direct API call?
11
+ The OCR sidecar config (model, base_url, api_key) lives in the agent's
12
+ ~/.clacky/config.yml. We don't re-implement that lookup here — instead
13
+ the local Clacky server exposes /api/internal/ocr-image which already
14
+ has the agent_config in scope. This parser stays a thin client.
15
+
16
+ Usage:
17
+ python3 pdf_parser_vlm.py <file_path>
18
+
19
+ Stdout: extracted text (UTF-8), pages separated by `\\n\\n--- Page N ---\\n\\n`
20
+ Stderr: progress + error messages
21
+ Exit: 0 on success, 1 on failure (server unavailable, no sidecar, etc.)
22
+
23
+ Environment:
24
+ CLACKY_SERVER_HOST default 127.0.0.1
25
+ CLACKY_SERVER_PORT default 7070
26
+ """
27
+
28
+ import json
29
+ import os
30
+ import subprocess
31
+ import sys
32
+ import tempfile
33
+ import urllib.error
34
+ import urllib.request
35
+
36
+ PAGE_SEPARATOR = "\n\n--- Page {n} ---\n\n"
37
+ RENDER_DPI = 150
38
+ REQUEST_TIMEOUT = 120 # seconds; VLMs can be slow
39
+
40
+
41
def server_url():
    """Return the URL of the local Clacky server's internal OCR endpoint.

    The host is taken from CLACKY_SERVER_HOST (default ``127.0.0.1``) and the
    port from CLACKY_SERVER_PORT (default ``7070``).
    """
    lookup = os.environ.get
    address = lookup("CLACKY_SERVER_HOST", "127.0.0.1")
    tcp_port = lookup("CLACKY_SERVER_PORT", "7070")
    return f"http://{address}:{tcp_port}/api/internal/ocr-image"
45
+
46
+
47
def render_pages(pdf_path, out_dir):
    """Render every page of ``pdf_path`` to a PNG inside ``out_dir``.

    Runs ``pdftoppm -r RENDER_DPI -png`` with the output prefix
    ``<out_dir>/page`` and collects the files it produced.

    Returns:
        Sorted list of paths to the ``page-*.png`` files found in ``out_dir``,
        or ``[]`` when pdftoppm cannot be started or exits non-zero (the
        reason is written to stderr in both cases).
    """
    prefix = os.path.join(out_dir, "page")
    cmd = ["pdftoppm", "-r", str(RENDER_DPI), "-png", pdf_path, prefix]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as e:
        # pdftoppm missing or not executable: report and return the same
        # "nothing rendered" result as a failed run, so the caller prints its
        # poppler hint instead of dying with a traceback.
        sys.stderr.write(f"pdftoppm could not be run: {e}\n")
        return []
    if proc.returncode != 0:
        sys.stderr.write(f"pdftoppm failed: {proc.stderr.strip()}\n")
        return []
    pages = sorted(
        os.path.join(out_dir, f) for f in os.listdir(out_dir)
        if f.startswith("page-") and f.endswith(".png")
    )
    return pages
59
+
60
+
61
def transcribe_page(image_path, page_num):
    """Send one rendered page image to the OCR endpoint and return its text.

    The image is posted as ``multipart/form-data`` (fields ``image`` and
    ``prompt``) to the URL returned by ``server_url()``.

    Args:
        image_path: path of the PNG file to upload.
        page_num: page number, used in the prompt and in error messages.

    Returns:
        The ``text`` field of the JSON reply (``""`` if absent), or ``None``
        when the request fails or the reply does not carry ``ok``. The reason
        is written to stderr.
    """
    with open(image_path, "rb") as f:
        body = f.read()

    boundary = "----clacky-vlm-boundary"
    prompt = (
        f"This is page {page_num} of a scanned PDF. Extract every legible text "
        "verbatim, preserving reading order. Render tables as Markdown tables. "
        "Skip decorative elements. Output plain Markdown only — no commentary."
    )
    payload = b"".join([
        f"--{boundary}\r\n".encode(),
        b'Content-Disposition: form-data; name="image"; filename="page.png"\r\n'
        b"Content-Type: image/png\r\n\r\n",
        body,
        f"\r\n--{boundary}\r\n".encode(),
        b'Content-Disposition: form-data; name="prompt"\r\n\r\n',
        prompt.encode(),
        f"\r\n--{boundary}--\r\n".encode(),
    ])

    req = urllib.request.Request(
        server_url(),
        data=payload,
        headers={"Content-Type": f"multipart/form-data; boundary={boundary}"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        # HTTPError subclasses URLError, so it has to be caught first. The
        # server did answer; report its own JSON "message" when there is one
        # instead of claiming it was unreachable.
        try:
            detail = json.loads(e.read().decode("utf-8")).get("message") or e.reason
        except Exception:
            detail = e.reason
        sys.stderr.write(f"page {page_num}: server returned HTTP {e.code} ({detail})\n")
        return None
    except urllib.error.URLError as e:
        sys.stderr.write(f"page {page_num}: server unreachable ({e})\n")
        return None
    except Exception as e:
        sys.stderr.write(f"page {page_num}: {e}\n")
        return None

    if not isinstance(data, dict):
        # Valid JSON but not an object: there is no "ok"/"text" to read.
        sys.stderr.write(f"page {page_num}: unexpected response from server\n")
        return None
    if not data.get("ok"):
        sys.stderr.write(f"page {page_num}: {data.get('message', 'unknown error')}\n")
        return None
    return data.get("text", "")
106
+
107
+
108
def main():
    """Command-line entry point: OCR every page of the PDF named in argv.

    Renders the pages into a temporary directory, transcribes them in order
    and writes the combined text to stdout, each page preceded by
    PAGE_SEPARATOR. Exits with status 1 on a usage error, a missing file, a
    render failure, or the first page that cannot be transcribed.
    """
    args = sys.argv[1:]
    if len(args) != 1:
        sys.stderr.write("Usage: pdf_parser_vlm.py <file_path>\n")
        sys.exit(1)

    path = args[0]
    if not os.path.exists(path):
        sys.stderr.write(f"File not found: {path}\n")
        sys.exit(1)

    with tempfile.TemporaryDirectory(prefix="clacky_vlm_") as workdir:
        images = render_pages(path, workdir)
        if not images:
            sys.stderr.write("Failed to render PDF pages (is poppler installed?)\n")
            sys.exit(1)

        sys.stderr.write(f"VLM OCR: {len(images)} page(s) to transcribe...\n")
        combined = ""
        for number, image in enumerate(images, start=1):
            transcript = transcribe_page(image, number)
            if transcript is None:
                # One failed page aborts the run so the caller can fall back.
                sys.exit(1)
            combined += PAGE_SEPARATOR.format(n=number) + transcript

        sys.stdout.write(combined.strip())
133
+
134
+
135
+ if __name__ == "__main__":
136
+ main()
@@ -59,6 +59,10 @@ module Clacky
59
59
  "or-gpt-image-2" => "GPT Image 2"
60
60
  },
61
61
  "default_image_model" => "or-gpt-image-2",
62
+ # Default OCR sidecar — used when the primary model is text-only.
63
+ # Candidates are derived from the provider's vision-capable models;
64
+ # this just picks the cheap+fast default to surface in "auto" mode.
65
+ "default_ocr_model" => "or-gemini-3-5-flash",
62
66
  # Provider-level default: the Claude family served here is vision-capable.
63
67
  "capabilities" => { "vision" => true }.freeze,
64
68
  # Model-level overrides: DeepSeek models routed through this provider
@@ -145,6 +149,7 @@ module Clacky
145
149
  # until we ship a dedicated client-side adapter for that protocol.
146
150
  "image_models" => [],
147
151
  "default_image_model" => nil,
152
+ "default_ocr_model" => "google/gemini-2.5-flash",
148
153
  "website_url" => "https://openrouter.ai/keys"
149
154
  }.freeze,
150
155
 
@@ -192,6 +197,7 @@ module Clacky
192
197
  "model_capabilities" => {
193
198
  "MiniMax-M3" => { "vision" => true }.freeze
194
199
  }.freeze,
200
+ "default_ocr_model" => "MiniMax-M3",
195
201
  "website_url" => "https://www.minimaxi.com/user-center/basic-information/interface-key"
196
202
  }.freeze,
197
203
 
@@ -218,6 +224,7 @@ module Clacky
218
224
  ].freeze,
219
225
  # k2.5 / k2.6 are multimodal; legacy k2 text-only models need model_capabilities override if added.
220
226
  "capabilities" => { "vision" => true }.freeze,
227
+ "default_ocr_model" => "kimi-k2.5",
221
228
  "website_url" => "https://platform.moonshot.cn/console/api-keys"
222
229
  }.freeze,
223
230
 
@@ -265,6 +272,7 @@ module Clacky
265
272
  "api" => "anthropic-messages",
266
273
  "default_model" => "claude-sonnet-4-6",
267
274
  "models" => ["claude-opus-4-8", "claude-opus-4-7", "claude-opus-4-6", "claude-sonnet-4-6", "claude-haiku-4-5"],
275
+ "default_ocr_model" => "claude-haiku-4-5",
268
276
  "website_url" => "https://console.anthropic.com/settings/keys"
269
277
  }.freeze,
270
278
 
@@ -279,6 +287,7 @@ module Clacky
279
287
  "model_capabilities" => {
280
288
  "mimo-v2-omni" => { "vision" => true }.freeze
281
289
  }.freeze,
290
+ "default_ocr_model" => "mimo-v2-omni",
282
291
  "website_url" => "https://platform.xiaomimimo.com/"
283
292
  }.freeze,
284
293
 
@@ -308,6 +317,7 @@ module Clacky
308
317
  "model_capabilities" => {
309
318
  "glm-5v-turbo" => { "vision" => true }.freeze
310
319
  }.freeze,
320
+ "default_ocr_model" => "glm-5v-turbo",
311
321
  "website_url" => "https://open.bigmodel.cn/usercenter/apikeys"
312
322
  }.freeze,
313
323
 
@@ -338,6 +348,7 @@ module Clacky
338
348
  "gpt-image-2"
339
349
  ],
340
350
  "default_image_model" => "gpt-image-2",
351
+ "default_ocr_model" => "gpt-5.4-mini",
341
352
  "website_url" => "https://platform.openai.com/api-keys"
342
353
  }.freeze,
343
354
 
@@ -363,6 +374,7 @@ module Clacky
363
374
  "model_capabilities" => {
364
375
  "qwen3.7-max" => { "vision" => false }.freeze
365
376
  }.freeze,
377
+ "default_ocr_model" => "qwen3.6-flash",
366
378
  "lite_models" => {
367
379
  "qwen3.7-max" => "qwen3.6-flash",
368
380
  "qwen3.6-plus" => "qwen3.6-flash",
@@ -529,6 +541,31 @@ module Clacky
529
541
  preset&.dig("audio_models") || []
530
542
  end
531
543
 
544
# OCR sidecar candidates for a provider: the entries of the preset's
# "models" list for which `supports?` reports the :vision capability.
# Only "models" is consulted, so names listed under other preset keys
# (such as "image_models") are never returned.
# @param provider_id [String]
# @return [Array<String>] empty when the provider is unknown or lists no models
def ocr_models(provider_id)
  definition = PRESETS[provider_id]
  if definition
    chat_models = definition["models"] || []
    chat_models.select do |model_name|
      supports?(provider_id, :vision, model_name: model_name)
    end
  else
    []
  end
end
556
+
557
# Default OCR sidecar model for a provider. The preset's
# "default_ocr_model" is used only when it is also one of the provider's
# `ocr_models`; otherwise the first entry of `ocr_models` is returned.
# @param provider_id [String]
# @return [String, nil] nil for an unknown provider or one without candidates
def default_ocr_model(provider_id)
  definition = PRESETS[provider_id]
  return unless definition

  candidates = ocr_models(provider_id)
  pinned = definition["default_ocr_model"]
  (pinned && candidates.include?(pinned)) ? pinned : candidates.first
end
568
+
532
569
  # Unified entry for media model lookup by kind.
533
570
  # @param provider_id [String]
534
571
  # @param kind [String] one of "image" / "video" / "audio"
@@ -40,15 +40,13 @@ module Clacky
40
40
  url = f[:data_url] || f["data_url"]
41
41
  name = f[:name] || f["name"]
42
42
  path = f[:path] || f["path"]
43
+ type = f[:type] || f["type"] || ""
43
44
 
44
45
  if url
45
46
  url
46
- elsif path && File.exist?(path.to_s)
47
- # Reconstruct data_url from the tmp file (still present on disk)
47
+ elsif type.to_s == "image" && path && File.exist?(path.to_s)
48
48
  Utils::FileProcessor.image_path_to_data_url(path) rescue "expired:#{name}"
49
49
  elsif name
50
- # File badge for non-image disk files, or image whose tmp file is gone
51
- type = f[:type] || f["type"] || ""
52
50
  type.to_s == "image" ? "expired:#{name}" : "pdf:#{name}"
53
51
  end
54
52
  end
@@ -440,6 +438,10 @@ module Clacky
440
438
  when ["POST", "/api/config/test"] then api_test_config(req, res)
441
439
  when ["POST", "/api/config/media/test"] then api_test_media_config(req, res)
442
440
  when ["GET", "/api/config/media"] then api_get_media_config(res)
441
+ when ["GET", "/api/config/ocr"] then api_get_ocr_config(res)
442
+ when ["PATCH", "/api/config/ocr"] then api_update_ocr_config(req, res)
443
+ when ["POST", "/api/config/ocr/test"] then api_test_ocr_config(req, res)
444
+ when ["POST", "/api/internal/ocr-image"] then api_internal_ocr_image(req, res)
443
445
  when ["GET", "/api/providers"] then api_list_providers(res)
444
446
  when ["GET", "/api/onboard/status"] then api_onboard_status(res)
445
447
  when ["GET", "/api/browser/status"] then api_browser_status(res)
@@ -1103,6 +1105,179 @@ module Clacky
1103
1105
  json_response(res, 422, { error: e.message })
1104
1106
  end
1105
1107
 
1108
# GET /api/config/ocr
# Responds 200 with two objects:
#   ocr              - fields copied from @agent_config.ocr_state, plus the
#                      masked API key of the stored "ocr" model entry
#   default_provider - the provider resolved from the "default" model entry,
#                      its default OCR model and its OCR-capable models
def api_get_ocr_config(res)
  state = @agent_config.ocr_state
  ocr_entry = @agent_config.find_model_by_type("ocr")
  masked_key = mask_api_key(ocr_entry["api_key"]) if ocr_entry

  ocr = {
    source: state["source"],
    model: state["model"],
    base_url: state["base_url"],
    api_key_masked: masked_key,
    provider: state["provider"],
    available: state["available"],
    stale: state["stale"] || false,
    requested_model: state["requested_model"],
    configured: state["configured"],
    primary: state["primary"] || false
  }

  # Preview of what "auto" mode would select, based on the provider behind
  # the current default model.
  primary_model = @agent_config.find_model_by_type("default")
  provider_id = primary_model && Clacky::Providers.resolve_provider(
    base_url: primary_model["base_url"],
    api_key: primary_model["api_key"]
  )
  preview =
    if provider_id
      {
        provider: provider_id,
        model: Clacky::Providers.default_ocr_model(provider_id),
        available: Clacky::Providers.ocr_models(provider_id)
      }
    else
      { provider: provider_id, model: nil, available: [] }
    end

  json_response(res, 200, { ocr: ocr, default_provider: preview })
end
1144
+
1145
# PATCH /api/config/ocr
# Body: { source: "off"|"auto"|"custom", model?, base_url?, api_key?,
#         anthropic_format? }
#
# Replaces the stored model entries of type "ocr" according to `source`:
#   off    - one entry with "disabled" => true
#   auto   - no entry, or an entry carrying only "model" when one is given
#   custom - an entry with model, base_url and api_key (all required)
#
# An api_key containing "****" is treated as the masked placeholder and is
# replaced by the key of the currently stored "ocr" entry.
#
# Responds 200 with { ok: true, state: ... } after saving, or 422 with
# { error: ... } for invalid input or any raised error.
def api_update_ocr_config(req, res)
  body = parse_json_body(req) || {}
  source = body["source"].to_s
  unless %w[off auto custom].include?(source)
    return json_response(res, 422, { error: "invalid source" })
  end

  # Build and validate the replacement entry BEFORE touching the model list.
  # The stored key has to be read while the old "ocr" entry still exists, and
  # a rejected request must leave the in-memory config unchanged.
  entry =
    case source
    when "off"
      { "id" => SecureRandom.uuid, "type" => "ocr", "disabled" => true }
    when "auto"
      override = body["model"].to_s.strip
      unless override.empty?
        { "id" => SecureRandom.uuid, "type" => "ocr", "model" => override }
      end
    when "custom"
      model = body["model"].to_s.strip
      base_url = body["base_url"].to_s.strip
      api_key = body["api_key"].to_s
      if api_key.include?("****")
        existing = @agent_config.models.find { |m| m["type"] == "ocr" && m["api_key"] }
        api_key = existing ? existing["api_key"].to_s : ""
      end
      if model.empty? || base_url.empty? || api_key.empty?
        return json_response(res, 422, { error: "model, base_url, api_key are required" })
      end

      {
        "id" => SecureRandom.uuid,
        "model" => model,
        "base_url" => base_url,
        "api_key" => api_key,
        "anthropic_format" => body["anthropic_format"] || false,
        "type" => "ocr"
      }
    end

  @agent_config.models.reject! { |m| m["type"] == "ocr" }
  @agent_config.models << entry if entry

  @agent_config.save
  json_response(res, 200, { ok: true, state: @agent_config.ocr_state })
rescue => e
  json_response(res, 422, { error: e.message })
end
1201
+
1202
# POST /api/config/ocr/test
# Checks an OCR endpoint by passing model, base_url and api_key from the
# request body to `preflight_media_endpoint`. An api_key that is empty or
# contains "****" is replaced by the key of the stored "ocr" model entry.
# Always responds 200; failures are reported as { ok: false, message: ... }.
def api_test_ocr_config(req, res)
  params = parse_json_body(req) || {}

  key = params["api_key"].to_s
  if key.empty? || key.include?("****")
    stored = @agent_config.find_model_by_type("ocr") || {}
    key = stored["api_key"].to_s
  end

  model_name = params["model"].to_s.strip
  endpoint = params["base_url"].to_s.strip

  if [model_name, endpoint, key].any?(&:empty?)
    return json_response(res, 200, { ok: false, message: "model, base_url, api_key are required" })
  end

  outcome = preflight_media_endpoint(base_url: endpoint, api_key: key, model: model_name)
  json_response(res, 200, outcome)
rescue => error
  json_response(res, 200, { ok: false, message: error.message })
end
1224
+
1225
# POST /api/internal/ocr-image
# Transcribes one image with the stored "ocr" model entry via
# Clacky::Vision::Resolver#describe. Intended for parser scripts such as
# pdf_parser_vlm.py.
# NOTE(review): access control is not enforced in this method; the original
# note says it relies on the server's standard auth path - confirm.
#
# Request:  multipart/form-data with field "image" (binary) and optional
#           "prompt", OR a JSON body { "data_url": "...", "prompt": "..." }
# Response: 200 { ok: true, text: "..." }
#           200 { ok: false, message: ... } when the result is blank
#           400 when no image was supplied
#           503 when no "ocr" model entry exists
#           500 on any raised error
def api_internal_ocr_image(req, res)
  sidecar = @agent_config.find_model_by_type("ocr")
  unless sidecar
    return json_response(res, 503, { ok: false, message: "OCR sidecar not configured" })
  end

  image = nil
  instruction = nil

  if req.content_type.to_s.start_with?("multipart/form-data")
    form = req.query
    upload = form["image"]
    if upload
      raw = upload.respond_to?(:read) ? upload.read : upload.to_s
      declared = upload.respond_to?(:[]) ? upload["content-type"].to_s : ""
      # Fall back to PNG when the part does not declare a content type.
      declared = "image/png" if declared.empty?
      image = { bytes: raw, mime_type: declared } if raw && !raw.empty?
    end
    instruction = form["prompt"].to_s if form["prompt"]
  else
    json = parse_json_body(req) || {}
    inline = json["data_url"].to_s
    image = { data_url: inline } unless inline.empty?
    instruction = json["prompt"].to_s if json["prompt"]
  end

  unless image
    return json_response(res, 400, { ok: false, message: "image or data_url required" })
  end

  transcript = Clacky::Vision::Resolver.new(sidecar).describe(image, prompt: instruction)
  blank = !transcript || transcript.strip.empty?
  return json_response(res, 200, { ok: false, message: "OCR returned empty result" }) if blank

  json_response(res, 200, { ok: true, text: transcript })
rescue => error
  json_response(res, 500, { ok: false, message: error.message })
end
1280
+
1106
1281
  # POST /api/onboard/complete
1107
1282
  # Called after key setup is done (soul_setup is optional/skipped).
1108
1283
  # Creates the default session if none exists yet, returns it.