openclacky 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +39 -0
  3. data/README.md +87 -53
  4. data/lib/clacky/agent/cost_tracker.rb +19 -2
  5. data/lib/clacky/agent/llm_caller.rb +218 -0
  6. data/lib/clacky/agent/message_compressor_helper.rb +32 -2
  7. data/lib/clacky/agent.rb +54 -22
  8. data/lib/clacky/client.rb +44 -5
  9. data/lib/clacky/default_parsers/pdf_parser.rb +58 -17
  10. data/lib/clacky/default_parsers/pdf_parser_ocr.py +103 -0
  11. data/lib/clacky/default_parsers/pdf_parser_plumber.py +62 -0
  12. data/lib/clacky/default_skills/deploy/SKILL.md +201 -77
  13. data/lib/clacky/default_skills/new/SKILL.md +3 -114
  14. data/lib/clacky/default_skills/onboard/SKILL.md +349 -133
  15. data/lib/clacky/default_skills/onboard/scripts/import_external_skills.rb +371 -0
  16. data/lib/clacky/default_skills/onboard/scripts/install_builtin_skills.rb +175 -0
  17. data/lib/clacky/default_skills/skill-add/scripts/install_from_zip.rb +59 -26
  18. data/lib/clacky/message_format/anthropic.rb +72 -8
  19. data/lib/clacky/message_format/bedrock.rb +6 -3
  20. data/lib/clacky/providers.rb +146 -3
  21. data/lib/clacky/server/channel/adapters/feishu/adapter.rb +14 -0
  22. data/lib/clacky/server/channel/adapters/feishu/bot.rb +10 -0
  23. data/lib/clacky/server/channel/adapters/feishu/message_parser.rb +1 -0
  24. data/lib/clacky/server/channel/channel_manager.rb +12 -4
  25. data/lib/clacky/server/channel/channel_ui_controller.rb +8 -2
  26. data/lib/clacky/server/http_server.rb +746 -13
  27. data/lib/clacky/server/session_registry.rb +55 -24
  28. data/lib/clacky/skill.rb +10 -9
  29. data/lib/clacky/skill_loader.rb +23 -11
  30. data/lib/clacky/tools/file_reader.rb +232 -127
  31. data/lib/clacky/tools/security.rb +42 -64
  32. data/lib/clacky/tools/terminal/persistent_session.rb +15 -4
  33. data/lib/clacky/tools/terminal/safe_rm.sh +106 -0
  34. data/lib/clacky/tools/terminal/session_manager.rb +8 -3
  35. data/lib/clacky/tools/terminal.rb +263 -16
  36. data/lib/clacky/ui2/layout_manager.rb +8 -1
  37. data/lib/clacky/ui2/output_buffer.rb +83 -23
  38. data/lib/clacky/ui2/ui_controller.rb +74 -7
  39. data/lib/clacky/utils/file_processor.rb +14 -40
  40. data/lib/clacky/utils/model_pricing.rb +215 -0
  41. data/lib/clacky/utils/parser_manager.rb +70 -6
  42. data/lib/clacky/utils/string_matcher.rb +23 -1
  43. data/lib/clacky/version.rb +1 -1
  44. data/lib/clacky/web/app.css +673 -9
  45. data/lib/clacky/web/app.js +40 -1608
  46. data/lib/clacky/web/i18n.js +209 -0
  47. data/lib/clacky/web/index.html +166 -2
  48. data/lib/clacky/web/onboard.js +77 -1
  49. data/lib/clacky/web/profile.js +442 -0
  50. data/lib/clacky/web/sessions.js +1034 -2
  51. data/lib/clacky/web/settings.js +127 -6
  52. data/lib/clacky/web/sidebar.js +39 -0
  53. data/lib/clacky/web/skills.js +460 -0
  54. data/lib/clacky/web/trash.js +343 -0
  55. data/lib/clacky/web/ws-dispatcher.js +255 -0
  56. data/lib/clacky.rb +5 -3
  57. metadata +16 -17
  58. data/lib/clacky/clacky_auth_client.rb +0 -152
  59. data/lib/clacky/clacky_cloud_config.rb +0 -123
  60. data/lib/clacky/cloud_project_client.rb +0 -169
  61. data/lib/clacky/default_skills/deploy/scripts/rails_deploy.rb +0 -1377
  62. data/lib/clacky/default_skills/deploy/tools/check_health.rb +0 -116
  63. data/lib/clacky/default_skills/deploy/tools/create_database_service.rb +0 -341
  64. data/lib/clacky/default_skills/deploy/tools/execute_deployment.rb +0 -99
  65. data/lib/clacky/default_skills/deploy/tools/fetch_runtime_logs.rb +0 -77
  66. data/lib/clacky/default_skills/deploy/tools/list_services.rb +0 -67
  67. data/lib/clacky/default_skills/deploy/tools/report_deploy_status.rb +0 -67
  68. data/lib/clacky/default_skills/deploy/tools/set_deploy_variables.rb +0 -189
  69. data/lib/clacky/default_skills/new/scripts/cloud_project_init.sh +0 -74
  70. data/lib/clacky/deploy_api_client.rb +0 -484
data/lib/clacky/agent.rb CHANGED
@@ -78,7 +78,6 @@ module Clacky
78
78
  @cost_source = :estimated # Track whether cost is from API or estimated
79
79
  @task_cost_source = :estimated # Track cost source for current task
80
80
  @previous_total_tokens = 0 # Track tokens from previous iteration for delta calculation
81
- @interrupted = false # Flag for user interrupt
82
81
  @latest_latency = nil # Most recent LLM call's latency metrics (see Client#send_messages_with_tools)
83
82
  @ui = ui # UIController for direct UI interaction
84
83
  @debug_logs = [] # Debug logs for troubleshooting
@@ -211,6 +210,7 @@ module Clacky
211
210
  @start_time = Time.now
212
211
  @task_truncation_count = 0 # Reset truncation counter for each task
213
212
  @task_timeout_hint_injected = false # Reset read-timeout hint injection (see LlmCaller)
213
+ @task_upstream_truncation_hint_injected = false # Reset upstream-truncation hint injection (see LlmCaller)
214
214
  @task_cost_source = :estimated # Reset for new task
215
215
  # Note: Do NOT reset @previous_total_tokens here - it should maintain the value from the last iteration
216
216
  # across tasks to correctly calculate delta tokens in each iteration
@@ -360,9 +360,6 @@ module Clacky
360
360
  task_interrupted = false
361
361
 
362
362
  loop do
363
-
364
- break if should_stop?
365
-
366
363
  @iterations += 1
367
364
  @hooks.trigger(:on_iteration, @iterations)
368
365
 
@@ -377,8 +374,58 @@ module Clacky
377
374
  # Skip if compression happened (response is nil)
378
375
  next if response.nil?
379
376
 
380
- # Check if done (no more tool calls needed)
381
- if response[:finish_reason] == "stop" || response[:tool_calls].nil? || response[:tool_calls].empty?
377
+ # [DIAG] Only log when finish_reason=="stop" AND tool_calls non-empty —
378
+ # the suspicious combo that indicates an upstream-truncated tool_use
379
+ # response. Normal responses produce no log line here to avoid noise.
380
+ begin
381
+ tool_calls = response[:tool_calls] || []
382
+ if response[:finish_reason] == "stop" && !tool_calls.empty?
383
+ tc_summary = tool_calls.map do |c|
384
+ args_str = c[:arguments].is_a?(String) ? c[:arguments] : c[:arguments].to_s
385
+ {
386
+ name: c[:name].to_s,
387
+ args_len: args_str.length,
388
+ args_head: args_str[0, 120]
389
+ }
390
+ end
391
+ Clacky::Logger.warn("agent.think_response",
392
+ session_id: @session_id,
393
+ iteration: @iterations,
394
+ finish_reason: response[:finish_reason].to_s,
395
+ tool_calls_count: tool_calls.size,
396
+ tool_calls: tc_summary,
397
+ content_len: response[:content].to_s.length,
398
+ completion_tokens: response.dig(:token_usage, :completion_tokens),
399
+ ttft_ms: response.dig(:latency, :ttft_ms),
400
+ suspicious_truncation: true
401
+ )
402
+ end
403
+ rescue StandardError => e
404
+ Clacky::Logger.warn("agent.think_response.log_failed", error: e.message)
405
+ end
406
+
407
+ # Check if done (no more tool calls needed).
408
+ #
409
+ # Defensive rule: we ONLY exit on empty/missing tool_calls.
410
+ # We used to also short-circuit on finish_reason=="stop", but
411
+ # upstream routers (OpenRouter → Anthropic/Bedrock) can return the
412
+ # contradictory combo `finish_reason=="stop" + non-empty tool_calls
413
+ # with truncated args`, which caused the agent to silently treat a
414
+ # truncated response as "task complete". Truncation is now caught
415
+ # earlier by LlmCaller#detect_upstream_truncation! (which raises
416
+ # UpstreamTruncatedError → RetryableError); this branch stays as
417
+ # a belt-and-braces guard: if that detector ever misses a new
418
+ # truncation pattern, we still won't silently exit while the model
419
+ # is mid-tool_call.
420
+ if response[:tool_calls].nil? || response[:tool_calls].empty?
421
+ # [DIAG] Pin down exactly which sub-condition triggered the task exit.
422
+ Clacky::Logger.info("agent.loop_break_normal",
423
+ session_id: @session_id,
424
+ iteration: @iterations,
425
+ branch: (response[:tool_calls].nil? ? "tool_calls_nil" : "tool_calls_empty"),
426
+ finish_reason: response[:finish_reason].to_s,
427
+ tool_calls_count: (response[:tool_calls] || []).size
428
+ )
382
429
  if response[:content] && !response[:content].empty?
383
430
  emit_assistant_message(response[:content])
384
431
  end
@@ -929,12 +976,6 @@ module Clacky
929
976
  end
930
977
  end
931
978
 
932
- # Interrupt the agent's current run
933
- # Called when user presses Ctrl+C during agent execution
934
- def interrupt!
935
- @interrupted = true
936
- end
937
-
938
979
  # Enqueue an inline skill injection to be flushed after observe().
939
980
  # Called by InvokeSkill#execute to avoid injecting during tool execution,
940
981
  # which would break Bedrock's toolUse/toolResult pairing requirement.
@@ -1001,16 +1042,7 @@ module Clacky
1001
1042
 
1002
1043
  # Check if agent is currently running
1003
1044
  def running?
1004
- @start_time != nil && !should_stop?
1005
- end
1006
-
1007
- private def should_stop?
1008
- if @interrupted
1009
- @interrupted = false # Reset for next run
1010
- return true
1011
- end
1012
-
1013
- false
1045
+ !@start_time.nil?
1014
1046
  end
1015
1047
 
1016
1048
  private def build_result(status = :success, error: nil)
data/lib/clacky/client.rb CHANGED
@@ -12,14 +12,29 @@ module Clacky
12
12
  @api_key = api_key
13
13
  @base_url = base_url
14
14
  @model = model
15
- @use_anthropic_format = anthropic_format
16
15
  # Detect Bedrock: ABSK key prefix (native AWS) or abs- model prefix (Clacky AI proxy)
17
16
  @use_bedrock = MessageFormat::Bedrock.bedrock_api_key?(api_key, model)
18
17
 
18
+ # Resolve provider once — reused for capability + api-type lookups.
19
+ provider_id = Providers.resolve_provider(base_url: @base_url, api_key: @api_key)
20
+
21
+ # Decide anthropic_format dynamically based on provider+model, falling
22
+ # back to the explicit constructor flag for unknown providers / custom
23
+ # base_urls. This lets e.g. OpenRouter's Claude models auto-route to the
24
+ # native /v1/messages endpoint (preserving cache_control byte-for-byte)
25
+ # without requiring any change to user YAML.
26
+ provider_prefers_anthropic = provider_id &&
27
+ Providers.anthropic_format_for_model?(provider_id, @model)
28
+ @use_anthropic_format = provider_prefers_anthropic || anthropic_format
29
+
30
+ # Remember the provider id so we can tune connection headers below
31
+ # (OpenRouter's /v1/messages accepts either Bearer or x-api-key, but
32
+ # some OpenRouter-compatible relays only honour Bearer — send both).
33
+ @provider_id = provider_id
34
+
19
35
  # Determine vision support once at construction time.
20
36
  # Non-vision models (DeepSeek, Kimi, MiniMax, etc.) reject image_url
21
37
  # content blocks; the conversion layer strips them when this is false.
22
- provider_id = Providers.resolve_provider(base_url: @base_url, api_key: @api_key)
23
38
  @vision_supported = Providers.supports?(provider_id, :vision, model_name: @model)
24
39
  end
25
40
 
@@ -47,7 +62,7 @@ module Clacky
47
62
  elsif anthropic_format?
48
63
  minimal_body = { model: model, max_tokens: 16,
49
64
  messages: [{ role: "user", content: "hi" }] }.to_json
50
- response = anthropic_connection.post("v1/messages") { |r| r.body = minimal_body }
65
+ response = anthropic_connection.post(anthropic_messages_path) { |r| r.body = minimal_body }
51
66
  else
52
67
  minimal_body = { model: model, max_tokens: 16,
53
68
  messages: [{ role: "user", content: "hi" }] }.to_json
@@ -77,7 +92,7 @@ module Clacky
77
92
  parse_simple_bedrock_response(response)
78
93
  elsif anthropic_format?
79
94
  body = MessageFormat::Anthropic.build_request_body(messages, model, [], max_tokens, false)
80
- response = anthropic_connection.post("v1/messages") { |r| r.body = body.to_json }
95
+ response = anthropic_connection.post(anthropic_messages_path) { |r| r.body = body.to_json }
81
96
  parse_simple_anthropic_response(response)
82
97
  else
83
98
  body = { model: model, max_tokens: max_tokens, messages: messages }
@@ -206,7 +221,7 @@ module Clacky
206
221
  messages = apply_message_caching(messages) if caching_enabled
207
222
 
208
223
  body = MessageFormat::Anthropic.build_request_body(messages, model, tools, max_tokens, caching_enabled)
209
- response = anthropic_connection.post("v1/messages") { |r| r.body = body.to_json }
224
+ response = anthropic_connection.post(anthropic_messages_path) { |r| r.body = body.to_json }
210
225
 
211
226
  raise_error(response) unless response.status == 200
212
227
  check_html_response(response)
@@ -333,6 +348,14 @@ module Clacky
333
348
  conn.headers["x-api-key"] = @api_key
334
349
  conn.headers["anthropic-version"] = "2023-06-01"
335
350
  conn.headers["anthropic-dangerous-direct-browser-access"] = "true"
351
+ # OpenRouter's /v1/messages endpoint authenticates with a Bearer
352
+ # token (the OpenRouter API key), not Anthropic's x-api-key. We send
353
+ # both so the same connection code works for direct Anthropic and
354
+ # for OpenRouter-proxied Claude — each endpoint ignores the header
355
+ # it doesn't recognise.
356
+ if @provider_id == "openrouter"
357
+ conn.headers["Authorization"] = "Bearer #{@api_key}"
358
+ end
336
359
  conn.options.timeout = 300
337
360
  conn.options.open_timeout = 10
338
361
  conn.ssl.verify = false
@@ -340,6 +363,22 @@ module Clacky
340
363
  end
341
364
  end
342
365
 
366
+ # Correct relative path for the Anthropic /v1/messages endpoint, accounting
367
+ # for whether the configured base_url already includes a "/v1" segment.
368
+ #
369
+ # Examples:
370
+ # base_url = "https://api.anthropic.com" → "v1/messages"
371
+ # base_url = "https://openrouter.ai/api/v1" → "messages"
372
+ # base_url = "https://openrouter.ai/api/v1/" → "messages"
373
+ #
374
+ # Without this, OpenRouter would receive POST /api/v1/v1/messages → 404
375
+ # (HTML error page), which bubbles up as the infamous
376
+ # "Invalid API endpoint or server error (received HTML instead of JSON)".
377
# Relative request path for the Anthropic messages endpoint.
#
# Returns "messages" when @base_url (ignoring one trailing slash) already
# ends in "/v1", and "v1/messages" otherwise, so the version segment is
# never duplicated in the final URL.
private def anthropic_messages_path
  trimmed = @base_url.to_s.chomp("/")
  return "messages" if trimmed.end_with?("/v1")

  "v1/messages"
end
381
+
343
382
  # ── Error handling ────────────────────────────────────────────────────────
344
383
 
345
384
  def handle_test_response(response)
@@ -12,15 +12,33 @@
12
12
  # exit 0 — success
13
13
  # exit 1 — failure
14
14
  #
15
- # This file lives in ~/.clacky/parsers/ and can be modified by the LLM
16
- # to add new capabilities (e.g. OCR for scanned PDFs).
15
+ # This file lives in ~/.clacky/parsers/ and can be modified by the LLM.
17
16
  #
18
- # VERSION: 1
17
+ # Extraction pipeline (first successful step wins):
18
+ # 1. pdftotext (poppler) — fastest, text-based PDFs
19
+ # 2. pdfplumber (Python) — handles more layouts
20
+ # (→ pdf_parser_plumber.py)
21
+ # 3. OCR (tesseract) — scanned / image-only PDFs
22
+ # (→ pdf_parser_ocr.py)
23
+ #
24
+ # Each extractor is a plain, self-contained function. Python-backed steps
25
+ # shell out to a sibling .py script so the LLM can edit them directly
26
+ # (with proper syntax highlighting, linters, and per-file run/debug)
27
+ # instead of wrestling with embedded heredocs.
28
+ #
29
+ # VERSION: 3
19
30
 
20
31
  require "open3"
21
32
 
33
+ # Minimum useful output (in bytes). Below this, a step is considered a
34
+ # miss and the next fallback is tried.
22
35
  MIN_CONTENT_BYTES = 20
23
36
 
37
+ # Script directory — resolve sibling .py helpers relative to this file
38
+ # so it works both from the gem's default_parsers/ dir and from the
39
+ # copied-to-user ~/.clacky/parsers/ dir.
40
+ SCRIPT_DIR = File.dirname(File.expand_path(__FILE__))
41
+
24
42
  def try_pdftotext(path)
25
43
  stdout, _stderr, status = Open3.capture3("pdftotext", "-layout", "-enc", "UTF-8", path, "-")
26
44
  return nil unless status.success?
@@ -32,18 +50,10 @@ rescue Errno::ENOENT
32
50
  end
33
51
 
34
52
  def try_pdfplumber(path)
35
- script = <<~PYTHON
36
- import sys, pdfplumber
37
- with pdfplumber.open(sys.argv[1]) as pdf:
38
- pages = []
39
- for i, page in enumerate(pdf.pages, 1):
40
- t = page.extract_text()
41
- if t and t.strip():
42
- pages.append(f"--- Page {i} ---\\n{t.strip()}")
43
- print("\\n\\n".join(pages))
44
- PYTHON
53
+ script = File.join(SCRIPT_DIR, "pdf_parser_plumber.py")
54
+ return nil unless File.exist?(script)
45
55
 
46
- stdout, _stderr, status = Open3.capture3("python3", "-c", script, path)
56
+ stdout, _stderr, status = Open3.capture3("python3", script, path)
47
57
  return nil unless status.success?
48
58
  text = stdout.strip
49
59
  return nil if text.bytesize < MIN_CONTENT_BYTES
@@ -52,6 +62,34 @@ rescue Errno::ENOENT
52
62
  nil # python3 not available
53
63
  end
54
64
 
65
+ # OCR fallback for scanned/image-only PDFs.
66
+ # See pdf_parser_ocr.py for the actual extraction logic.
67
+ #
68
+ # Installation hints (also printed on final failure):
69
+ # macOS: brew install tesseract tesseract-lang poppler
70
+ # pip3 install pytesseract pdf2image
71
+ # Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils
72
+ # pip3 install pytesseract pdf2image
73
# Third-tier extractor: OCR the PDF at +path+ via the sibling
# pdf_parser_ocr.py script.
#
# Returns the stripped text, or nil when tesseract/python3 cannot be run,
# the helper script is absent, the helper exits non-zero, or the output is
# shorter than MIN_CONTENT_BYTES.
def try_ocr(path)
  # Probe for the tesseract binary before paying for a python3 spawn.
  _probe_out, _probe_err, probe = Open3.capture3("tesseract", "--version")
  return nil unless probe.success?

  ocr_script = File.join(SCRIPT_DIR, "pdf_parser_ocr.py")
  return nil unless File.exist?(ocr_script)

  extracted, diagnostics, result = Open3.capture3("python3", ocr_script, path)
  if result.success?
    body = extracted.strip
    body.bytesize < MIN_CONTENT_BYTES ? nil : body
  else
    # Relay the helper's own explanation (missing deps, poppler, ...).
    message = diagnostics.strip
    warn message unless message.empty?
    nil
  end
rescue Errno::ENOENT
  nil # tesseract or python3 not available
end
92
+
55
93
  # --- main ---
56
94
 
57
95
  path = ARGV[0]
@@ -66,14 +104,17 @@ unless File.exist?(path)
66
104
  exit 1
67
105
  end
68
106
 
69
- text = try_pdftotext(path) || try_pdfplumber(path)
107
+ # Try each extractor in order; first non-nil result wins.
108
+ text = try_pdftotext(path) || try_pdfplumber(path) || try_ocr(path)
70
109
 
71
110
  if text
72
111
  print text
73
112
  exit 0
74
113
  else
75
114
  warn "Could not extract text from PDF."
76
- warn "Tip: install poppler for text-based PDFs: brew install poppler"
77
- warn "For scanned PDFs, consider adding OCR support (e.g. tesseract)."
115
+ warn "For text-based PDFs, install poppler: brew install poppler (macOS) / apt install poppler-utils (Linux)"
116
+ warn "For scanned PDFs (OCR):"
117
+ warn " macOS: brew install tesseract tesseract-lang poppler && pip3 install pytesseract pdf2image"
118
+ warn " Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils && pip3 install pytesseract pdf2image"
78
119
  exit 1
79
120
  end
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ pdf_parser_ocr.py — extract text from a scanned/image-only PDF using OCR.
4
+
5
+ Usage:
6
+ python3 pdf_parser_ocr.py <file_path>
7
+
8
+ Output:
9
+ stdout — extracted text, one block per page, separated by blank lines
10
+ stderr — error messages
11
+ exit 0 — success (text was extracted)
12
+ exit 1 — failure / no text found
13
+ exit 2 — dependency missing (pytesseract or pdf2image)
14
+ exit 3 — pdf2image couldn't rasterise the PDF (usually missing poppler)
15
+
16
+ Called from pdf_parser.rb as the third-tier fallback (after pdftotext and
17
+ pdfplumber). This script is copied into ~/.clacky/parsers/ and can be
18
+ edited freely by the LLM — common tweaks:
19
+ - Change DPI (higher = better accuracy, slower + more memory)
20
+ - Change OCR_LANG to match your document (e.g. "jpn+eng")
21
+ - Add image preprocessing (deskew, contrast, threshold) before OCR
22
+ - Adjust MAX_PAGES for very large scans
23
+
24
+ Environment variable overrides:
25
+ CLACKY_OCR_LANG — override OCR_LANG (e.g. "eng", "jpn+eng")
26
+ CLACKY_OCR_MAX_PAGES — override MAX_PAGES
27
+ CLACKY_OCR_DPI — override DPI
28
+
29
+ Install:
30
+ macOS: brew install tesseract tesseract-lang poppler
31
+ pip3 install pytesseract pdf2image
32
+ Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils
33
+ pip3 install pytesseract pdf2image
34
+ """
35
+
36
+ # VERSION: 1
37
+
38
+ import os
39
+ import sys
40
+
41
# --- Config ---
# Simplified Chinese + English covers most mixed-language documents.
# For pure English scans, "eng" alone is faster and lighter.
OCR_LANG = "chi_sim+eng"

# 200 DPI is a good balance: tesseract's accuracy plateau starts around
# 300 DPI, but memory + time cost scales quadratically. Raise to 300 for
# small fonts or when accuracy matters more than speed.
DPI = 200

# Hard cap on pages to OCR. OCR is slow (~1-3s/page); for huge scans the
# LLM should be told to OCR in chunks instead.
MAX_PAGES = 50

# Language retried on a page when OCR with the requested language fails.
FALLBACK_LANG = "eng"


def _int_from_env(name, default):
    """Return the positive integer held in environment variable *name*.

    Returns *default* when the variable is unset or blank. When it is set
    to something that is not an integer >= 1, a note is written to stderr
    and *default* is returned, so a typo in an override cannot crash the
    script with a traceback.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = int(raw)
    except ValueError:
        value = 0  # funnel into the shared "invalid" branch below
    if value < 1:
        sys.stderr.write(f"Ignoring invalid {name}={raw!r}; using {default}\n")
        return default
    return value


def _ocr_page(pytesseract, image, lang, page_no):
    """OCR one page image and return its raw text ("" if OCR fails).

    Tries *lang* first. On ``TesseractError`` (most commonly a missing
    language pack) the page is retried once with ``FALLBACK_LANG``; if that
    fails too, the page is skipped instead of aborting the whole document.
    """
    try:
        return pytesseract.image_to_string(image, lang=lang)
    except pytesseract.TesseractError as e:
        sys.stderr.write(f"tesseract error on page {page_no}: {e}\n")

    if lang == FALLBACK_LANG:
        # The fallback would repeat the exact call that just failed.
        return ""
    try:
        return pytesseract.image_to_string(image, lang=FALLBACK_LANG)
    except pytesseract.TesseractError as e:
        sys.stderr.write(
            f"tesseract {FALLBACK_LANG} fallback failed on page {page_no}: {e}\n"
        )
        return ""


def _force_utf8_stdout():
    """Switch stdout to UTF-8 when the stream supports reconfiguring."""
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(encoding="utf-8")


def main():
    """OCR the PDF named by ``sys.argv[1]`` and print the text to stdout.

    Settings come from OCR_LANG / MAX_PAGES / DPI, each overridable through
    CLACKY_OCR_LANG / CLACKY_OCR_MAX_PAGES / CLACKY_OCR_DPI.

    Exit codes:
        1 -- no path argument, or OCR produced no text
        2 -- pytesseract or pdf2image is not importable
        3 -- pdf2image could not rasterise the PDF
    """
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: pdf_parser_ocr.py <file_path>\n")
        sys.exit(1)

    path = sys.argv[1]

    # Imported lazily so a missing dependency yields exit code 2 with an
    # install hint rather than an import-time traceback.
    try:
        import pytesseract
        from pdf2image import convert_from_path
    except ImportError as e:
        sys.stderr.write(f"OCR dependencies missing: {e}\n")
        sys.stderr.write("Install with: pip3 install pytesseract pdf2image\n")
        sys.exit(2)

    # A blank override means "not set" rather than "OCR with no language".
    lang = os.environ.get("CLACKY_OCR_LANG", "").strip() or OCR_LANG
    max_pages = _int_from_env("CLACKY_OCR_MAX_PAGES", MAX_PAGES)
    dpi = _int_from_env("CLACKY_OCR_DPI", DPI)

    try:
        images = convert_from_path(path, dpi=dpi, last_page=max_pages)
    except Exception as e:  # top-level boundary: report and exit
        sys.stderr.write(f"pdf2image failed: {e}\n")
        sys.stderr.write("Is poppler installed? (brew install poppler / apt install poppler-utils)\n")
        sys.exit(3)

    pages = []
    for i, image in enumerate(images, 1):
        text = _ocr_page(pytesseract, image, lang, i).strip()
        if text:
            pages.append(f"--- Page {i} (OCR) ---\n{text}")

    if not pages:
        sys.stderr.write("OCR produced no text — PDF may be blank or unreadable.\n")
        sys.exit(1)

    # Emit UTF-8 regardless of locale so non-ASCII OCR output (the default
    # language is chi_sim) cannot raise UnicodeEncodeError on print.
    _force_utf8_stdout()
    print("\n\n".join(pages))
100
+
101
+
102
+ if __name__ == "__main__":
103
+ main()
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ pdf_parser_plumber.py — extract text from a PDF using pdfplumber.
4
+
5
+ Usage:
6
+ python3 pdf_parser_plumber.py <file_path>
7
+
8
+ Output:
9
+ stdout — extracted text, one block per page, separated by blank lines
10
+ stderr — error messages
11
+ exit 0 — success (text was extracted)
12
+ exit 1 — failure / no text found
13
+ exit 2 — dependency missing
14
+
15
+ Called from pdf_parser.rb as the second-tier extractor (after pdftotext).
16
+ This script is copied into ~/.clacky/parsers/ and can be edited freely by
17
+ the LLM — e.g. to tune table extraction, layout heuristics, or filter out
18
+ boilerplate headers/footers. Edit, then re-run to test.
19
+
20
+ Install:
21
+ pip3 install pdfplumber
22
+ """
23
+
24
+ # VERSION: 1
25
+
26
+ import sys
27
+
28
+
29
def main():
    """Extract text from the PDF named by ``sys.argv[1]`` with pdfplumber.

    Prints one ``--- Page N ---`` block per page that has text, separated
    by blank lines.

    Exit codes:
        1 -- no path argument, pdfplumber raised, or no page had text
        2 -- pdfplumber is not importable
    """
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: pdf_parser_plumber.py <file_path>\n")
        sys.exit(1)

    path = sys.argv[1]

    # Imported lazily so a missing dependency yields exit code 2 with an
    # install hint rather than an import-time traceback.
    try:
        import pdfplumber
    except ImportError as e:
        sys.stderr.write(f"pdfplumber missing: {e}\n")
        sys.stderr.write("Install with: pip3 install pdfplumber\n")
        sys.exit(2)

    pages = []
    try:
        with pdfplumber.open(path) as pdf:
            for i, page in enumerate(pdf.pages, 1):
                # extract_text() may return None; normalise before stripping.
                text = (page.extract_text() or "").strip()
                if text:
                    pages.append(f"--- Page {i} ---\n{text}")
    except Exception as e:  # top-level boundary: report and exit
        sys.stderr.write(f"pdfplumber failed: {e}\n")
        sys.exit(1)

    if not pages:
        sys.stderr.write("pdfplumber produced no text.\n")
        sys.exit(1)

    # Emit UTF-8 regardless of locale so non-ASCII text cannot raise
    # UnicodeEncodeError on print after extraction already succeeded.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(encoding="utf-8")
    print("\n\n".join(pages))
59
+
60
+
61
+ if __name__ == "__main__":
62
+ main()