openclacky 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +39 -0
- data/README.md +87 -53
- data/lib/clacky/agent/cost_tracker.rb +19 -2
- data/lib/clacky/agent/llm_caller.rb +218 -0
- data/lib/clacky/agent/message_compressor_helper.rb +32 -2
- data/lib/clacky/agent.rb +54 -22
- data/lib/clacky/client.rb +44 -5
- data/lib/clacky/default_parsers/pdf_parser.rb +58 -17
- data/lib/clacky/default_parsers/pdf_parser_ocr.py +103 -0
- data/lib/clacky/default_parsers/pdf_parser_plumber.py +62 -0
- data/lib/clacky/default_skills/deploy/SKILL.md +201 -77
- data/lib/clacky/default_skills/new/SKILL.md +3 -114
- data/lib/clacky/default_skills/onboard/SKILL.md +349 -133
- data/lib/clacky/default_skills/onboard/scripts/import_external_skills.rb +371 -0
- data/lib/clacky/default_skills/onboard/scripts/install_builtin_skills.rb +175 -0
- data/lib/clacky/default_skills/skill-add/scripts/install_from_zip.rb +59 -26
- data/lib/clacky/message_format/anthropic.rb +72 -8
- data/lib/clacky/message_format/bedrock.rb +6 -3
- data/lib/clacky/providers.rb +146 -3
- data/lib/clacky/server/channel/adapters/feishu/adapter.rb +14 -0
- data/lib/clacky/server/channel/adapters/feishu/bot.rb +10 -0
- data/lib/clacky/server/channel/adapters/feishu/message_parser.rb +1 -0
- data/lib/clacky/server/channel/channel_manager.rb +12 -4
- data/lib/clacky/server/channel/channel_ui_controller.rb +8 -2
- data/lib/clacky/server/http_server.rb +746 -13
- data/lib/clacky/server/session_registry.rb +55 -24
- data/lib/clacky/skill.rb +10 -9
- data/lib/clacky/skill_loader.rb +23 -11
- data/lib/clacky/tools/file_reader.rb +232 -127
- data/lib/clacky/tools/security.rb +42 -64
- data/lib/clacky/tools/terminal/persistent_session.rb +15 -4
- data/lib/clacky/tools/terminal/safe_rm.sh +106 -0
- data/lib/clacky/tools/terminal/session_manager.rb +8 -3
- data/lib/clacky/tools/terminal.rb +263 -16
- data/lib/clacky/ui2/layout_manager.rb +8 -1
- data/lib/clacky/ui2/output_buffer.rb +83 -23
- data/lib/clacky/ui2/ui_controller.rb +74 -7
- data/lib/clacky/utils/file_processor.rb +14 -40
- data/lib/clacky/utils/model_pricing.rb +215 -0
- data/lib/clacky/utils/parser_manager.rb +70 -6
- data/lib/clacky/utils/string_matcher.rb +23 -1
- data/lib/clacky/version.rb +1 -1
- data/lib/clacky/web/app.css +673 -9
- data/lib/clacky/web/app.js +40 -1608
- data/lib/clacky/web/i18n.js +209 -0
- data/lib/clacky/web/index.html +166 -2
- data/lib/clacky/web/onboard.js +77 -1
- data/lib/clacky/web/profile.js +442 -0
- data/lib/clacky/web/sessions.js +1034 -2
- data/lib/clacky/web/settings.js +127 -6
- data/lib/clacky/web/sidebar.js +39 -0
- data/lib/clacky/web/skills.js +460 -0
- data/lib/clacky/web/trash.js +343 -0
- data/lib/clacky/web/ws-dispatcher.js +255 -0
- data/lib/clacky.rb +5 -3
- metadata +16 -17
- data/lib/clacky/clacky_auth_client.rb +0 -152
- data/lib/clacky/clacky_cloud_config.rb +0 -123
- data/lib/clacky/cloud_project_client.rb +0 -169
- data/lib/clacky/default_skills/deploy/scripts/rails_deploy.rb +0 -1377
- data/lib/clacky/default_skills/deploy/tools/check_health.rb +0 -116
- data/lib/clacky/default_skills/deploy/tools/create_database_service.rb +0 -341
- data/lib/clacky/default_skills/deploy/tools/execute_deployment.rb +0 -99
- data/lib/clacky/default_skills/deploy/tools/fetch_runtime_logs.rb +0 -77
- data/lib/clacky/default_skills/deploy/tools/list_services.rb +0 -67
- data/lib/clacky/default_skills/deploy/tools/report_deploy_status.rb +0 -67
- data/lib/clacky/default_skills/deploy/tools/set_deploy_variables.rb +0 -189
- data/lib/clacky/default_skills/new/scripts/cloud_project_init.sh +0 -74
- data/lib/clacky/deploy_api_client.rb +0 -484
data/lib/clacky/agent.rb
CHANGED
|
@@ -78,7 +78,6 @@ module Clacky
|
|
|
78
78
|
@cost_source = :estimated # Track whether cost is from API or estimated
|
|
79
79
|
@task_cost_source = :estimated # Track cost source for current task
|
|
80
80
|
@previous_total_tokens = 0 # Track tokens from previous iteration for delta calculation
|
|
81
|
-
@interrupted = false # Flag for user interrupt
|
|
82
81
|
@latest_latency = nil # Most recent LLM call's latency metrics (see Client#send_messages_with_tools)
|
|
83
82
|
@ui = ui # UIController for direct UI interaction
|
|
84
83
|
@debug_logs = [] # Debug logs for troubleshooting
|
|
@@ -211,6 +210,7 @@ module Clacky
|
|
|
211
210
|
@start_time = Time.now
|
|
212
211
|
@task_truncation_count = 0 # Reset truncation counter for each task
|
|
213
212
|
@task_timeout_hint_injected = false # Reset read-timeout hint injection (see LlmCaller)
|
|
213
|
+
@task_upstream_truncation_hint_injected = false # Reset upstream-truncation hint injection (see LlmCaller)
|
|
214
214
|
@task_cost_source = :estimated # Reset for new task
|
|
215
215
|
# Note: Do NOT reset @previous_total_tokens here - it should maintain the value from the last iteration
|
|
216
216
|
# across tasks to correctly calculate delta tokens in each iteration
|
|
@@ -360,9 +360,6 @@ module Clacky
|
|
|
360
360
|
task_interrupted = false
|
|
361
361
|
|
|
362
362
|
loop do
|
|
363
|
-
|
|
364
|
-
break if should_stop?
|
|
365
|
-
|
|
366
363
|
@iterations += 1
|
|
367
364
|
@hooks.trigger(:on_iteration, @iterations)
|
|
368
365
|
|
|
@@ -377,8 +374,58 @@ module Clacky
|
|
|
377
374
|
# Skip if compression happened (response is nil)
|
|
378
375
|
next if response.nil?
|
|
379
376
|
|
|
380
|
-
#
|
|
381
|
-
|
|
377
|
+
# [DIAG] Only log when finish_reason=="stop" AND tool_calls non-empty —
|
|
378
|
+
# the suspicious combo that indicates an upstream-truncated tool_use
|
|
379
|
+
# response. Normal responses produce no log line here to avoid noise.
|
|
380
|
+
begin
|
|
381
|
+
tool_calls = response[:tool_calls] || []
|
|
382
|
+
if response[:finish_reason] == "stop" && !tool_calls.empty?
|
|
383
|
+
tc_summary = tool_calls.map do |c|
|
|
384
|
+
args_str = c[:arguments].is_a?(String) ? c[:arguments] : c[:arguments].to_s
|
|
385
|
+
{
|
|
386
|
+
name: c[:name].to_s,
|
|
387
|
+
args_len: args_str.length,
|
|
388
|
+
args_head: args_str[0, 120]
|
|
389
|
+
}
|
|
390
|
+
end
|
|
391
|
+
Clacky::Logger.warn("agent.think_response",
|
|
392
|
+
session_id: @session_id,
|
|
393
|
+
iteration: @iterations,
|
|
394
|
+
finish_reason: response[:finish_reason].to_s,
|
|
395
|
+
tool_calls_count: tool_calls.size,
|
|
396
|
+
tool_calls: tc_summary,
|
|
397
|
+
content_len: response[:content].to_s.length,
|
|
398
|
+
completion_tokens: response.dig(:token_usage, :completion_tokens),
|
|
399
|
+
ttft_ms: response.dig(:latency, :ttft_ms),
|
|
400
|
+
suspicious_truncation: true
|
|
401
|
+
)
|
|
402
|
+
end
|
|
403
|
+
rescue StandardError => e
|
|
404
|
+
Clacky::Logger.warn("agent.think_response.log_failed", error: e.message)
|
|
405
|
+
end
|
|
406
|
+
|
|
407
|
+
# Check if done (no more tool calls needed).
|
|
408
|
+
#
|
|
409
|
+
# Defensive rule: we ONLY exit on empty/missing tool_calls.
|
|
410
|
+
# We used to also short-circuit on finish_reason=="stop", but
|
|
411
|
+
# upstream routers (OpenRouter → Anthropic/Bedrock) can return the
|
|
412
|
+
# contradictory combo `finish_reason=="stop" + non-empty tool_calls
|
|
413
|
+
# with truncated args`, which caused the agent to silently treat a
|
|
414
|
+
# truncated response as "task complete". Truncation is now caught
|
|
415
|
+
# earlier by LlmCaller#detect_upstream_truncation! (which raises
|
|
416
|
+
# UpstreamTruncatedError → RetryableError); this branch stays as
|
|
417
|
+
# a belt-and-braces guard: if that detector ever misses a new
|
|
418
|
+
# truncation pattern, we still won't silently exit while the model
|
|
419
|
+
# is mid-tool_call.
|
|
420
|
+
if response[:tool_calls].nil? || response[:tool_calls].empty?
|
|
421
|
+
# [DIAG] Pin down exactly which sub-condition triggered the task exit.
|
|
422
|
+
Clacky::Logger.info("agent.loop_break_normal",
|
|
423
|
+
session_id: @session_id,
|
|
424
|
+
iteration: @iterations,
|
|
425
|
+
branch: (response[:tool_calls].nil? ? "tool_calls_nil" : "tool_calls_empty"),
|
|
426
|
+
finish_reason: response[:finish_reason].to_s,
|
|
427
|
+
tool_calls_count: (response[:tool_calls] || []).size
|
|
428
|
+
)
|
|
382
429
|
if response[:content] && !response[:content].empty?
|
|
383
430
|
emit_assistant_message(response[:content])
|
|
384
431
|
end
|
|
@@ -929,12 +976,6 @@ module Clacky
|
|
|
929
976
|
end
|
|
930
977
|
end
|
|
931
978
|
|
|
932
|
-
# Interrupt the agent's current run
|
|
933
|
-
# Called when user presses Ctrl+C during agent execution
|
|
934
|
-
def interrupt!
|
|
935
|
-
@interrupted = true
|
|
936
|
-
end
|
|
937
|
-
|
|
938
979
|
# Enqueue an inline skill injection to be flushed after observe().
|
|
939
980
|
# Called by InvokeSkill#execute to avoid injecting during tool execution,
|
|
940
981
|
# which would break Bedrock's toolUse/toolResult pairing requirement.
|
|
@@ -1001,16 +1042,7 @@ module Clacky
|
|
|
1001
1042
|
|
|
1002
1043
|
# Check if agent is currently running
|
|
1003
1044
|
def running?
|
|
1004
|
-
|
|
1005
|
-
end
|
|
1006
|
-
|
|
1007
|
-
private def should_stop?
|
|
1008
|
-
if @interrupted
|
|
1009
|
-
@interrupted = false # Reset for next run
|
|
1010
|
-
return true
|
|
1011
|
-
end
|
|
1012
|
-
|
|
1013
|
-
false
|
|
1045
|
+
!@start_time.nil?
|
|
1014
1046
|
end
|
|
1015
1047
|
|
|
1016
1048
|
private def build_result(status = :success, error: nil)
|
data/lib/clacky/client.rb
CHANGED
|
@@ -12,14 +12,29 @@ module Clacky
|
|
|
12
12
|
@api_key = api_key
|
|
13
13
|
@base_url = base_url
|
|
14
14
|
@model = model
|
|
15
|
-
@use_anthropic_format = anthropic_format
|
|
16
15
|
# Detect Bedrock: ABSK key prefix (native AWS) or abs- model prefix (Clacky AI proxy)
|
|
17
16
|
@use_bedrock = MessageFormat::Bedrock.bedrock_api_key?(api_key, model)
|
|
18
17
|
|
|
18
|
+
# Resolve provider once — reused for capability + api-type lookups.
|
|
19
|
+
provider_id = Providers.resolve_provider(base_url: @base_url, api_key: @api_key)
|
|
20
|
+
|
|
21
|
+
# Decide anthropic_format dynamically based on provider+model, falling
|
|
22
|
+
# back to the explicit constructor flag for unknown providers / custom
|
|
23
|
+
# base_urls. This lets e.g. OpenRouter's Claude models auto-route to the
|
|
24
|
+
# native /v1/messages endpoint (preserving cache_control byte-for-byte)
|
|
25
|
+
# without requiring any change to user YAML.
|
|
26
|
+
provider_prefers_anthropic = provider_id &&
|
|
27
|
+
Providers.anthropic_format_for_model?(provider_id, @model)
|
|
28
|
+
@use_anthropic_format = provider_prefers_anthropic || anthropic_format
|
|
29
|
+
|
|
30
|
+
# Remember the provider id so we can tune connection headers below
|
|
31
|
+
# (OpenRouter's /v1/messages accepts either Bearer or x-api-key, but
|
|
32
|
+
# some OpenRouter-compatible relays only honour Bearer — send both).
|
|
33
|
+
@provider_id = provider_id
|
|
34
|
+
|
|
19
35
|
# Determine vision support once at construction time.
|
|
20
36
|
# Non-vision models (DeepSeek, Kimi, MiniMax, etc.) reject image_url
|
|
21
37
|
# content blocks; the conversion layer strips them when this is false.
|
|
22
|
-
provider_id = Providers.resolve_provider(base_url: @base_url, api_key: @api_key)
|
|
23
38
|
@vision_supported = Providers.supports?(provider_id, :vision, model_name: @model)
|
|
24
39
|
end
|
|
25
40
|
|
|
@@ -47,7 +62,7 @@ module Clacky
|
|
|
47
62
|
elsif anthropic_format?
|
|
48
63
|
minimal_body = { model: model, max_tokens: 16,
|
|
49
64
|
messages: [{ role: "user", content: "hi" }] }.to_json
|
|
50
|
-
response = anthropic_connection.post(
|
|
65
|
+
response = anthropic_connection.post(anthropic_messages_path) { |r| r.body = minimal_body }
|
|
51
66
|
else
|
|
52
67
|
minimal_body = { model: model, max_tokens: 16,
|
|
53
68
|
messages: [{ role: "user", content: "hi" }] }.to_json
|
|
@@ -77,7 +92,7 @@ module Clacky
|
|
|
77
92
|
parse_simple_bedrock_response(response)
|
|
78
93
|
elsif anthropic_format?
|
|
79
94
|
body = MessageFormat::Anthropic.build_request_body(messages, model, [], max_tokens, false)
|
|
80
|
-
response = anthropic_connection.post(
|
|
95
|
+
response = anthropic_connection.post(anthropic_messages_path) { |r| r.body = body.to_json }
|
|
81
96
|
parse_simple_anthropic_response(response)
|
|
82
97
|
else
|
|
83
98
|
body = { model: model, max_tokens: max_tokens, messages: messages }
|
|
@@ -206,7 +221,7 @@ module Clacky
|
|
|
206
221
|
messages = apply_message_caching(messages) if caching_enabled
|
|
207
222
|
|
|
208
223
|
body = MessageFormat::Anthropic.build_request_body(messages, model, tools, max_tokens, caching_enabled)
|
|
209
|
-
response = anthropic_connection.post(
|
|
224
|
+
response = anthropic_connection.post(anthropic_messages_path) { |r| r.body = body.to_json }
|
|
210
225
|
|
|
211
226
|
raise_error(response) unless response.status == 200
|
|
212
227
|
check_html_response(response)
|
|
@@ -333,6 +348,14 @@ module Clacky
|
|
|
333
348
|
conn.headers["x-api-key"] = @api_key
|
|
334
349
|
conn.headers["anthropic-version"] = "2023-06-01"
|
|
335
350
|
conn.headers["anthropic-dangerous-direct-browser-access"] = "true"
|
|
351
|
+
# OpenRouter's /v1/messages endpoint authenticates with a Bearer
|
|
352
|
+
# token (the OpenRouter API key), not Anthropic's x-api-key. We send
|
|
353
|
+
# both so the same connection code works for direct Anthropic and
|
|
354
|
+
# for OpenRouter-proxied Claude — each endpoint ignores the header
|
|
355
|
+
# it doesn't recognise.
|
|
356
|
+
if @provider_id == "openrouter"
|
|
357
|
+
conn.headers["Authorization"] = "Bearer #{@api_key}"
|
|
358
|
+
end
|
|
336
359
|
conn.options.timeout = 300
|
|
337
360
|
conn.options.open_timeout = 10
|
|
338
361
|
conn.ssl.verify = false
|
|
@@ -340,6 +363,22 @@ module Clacky
|
|
|
340
363
|
end
|
|
341
364
|
end
|
|
342
365
|
|
|
366
|
+
# Correct relative path for the Anthropic /v1/messages endpoint, accounting
|
|
367
|
+
# for whether the configured base_url already includes a "/v1" segment.
|
|
368
|
+
#
|
|
369
|
+
# Examples:
|
|
370
|
+
# base_url = "https://api.anthropic.com" → "v1/messages"
|
|
371
|
+
# base_url = "https://openrouter.ai/api/v1" → "messages"
|
|
372
|
+
# base_url = "https://openrouter.ai/api/v1/" → "messages"
|
|
373
|
+
#
|
|
374
|
+
# Without this, OpenRouter would receive POST /api/v1/v1/messages → 404
|
|
375
|
+
# (HTML error page), which bubbles up as the infamous
|
|
376
|
+
# "Invalid API endpoint or server error (received HTML instead of JSON)".
|
|
377
|
+
# Relative request path for the Anthropic Messages endpoint.
#
# Returns "messages" when @base_url already ends in a "/v1" segment and
# "v1/messages" otherwise (including when @base_url is nil or empty).
#
# Every trailing slash is ignored, not just one: "https://host/api/v1//"
# must be treated like "https://host/api/v1", otherwise the request goes
# to ".../v1/v1/messages".
private def anthropic_messages_path
  base = @base_url.to_s.sub(%r{/+\z}, "")
  base.end_with?("/v1") ? "messages" : "v1/messages"
end
|
|
381
|
+
|
|
343
382
|
# ── Error handling ────────────────────────────────────────────────────────
|
|
344
383
|
|
|
345
384
|
def handle_test_response(response)
|
|
@@ -12,15 +12,33 @@
|
|
|
12
12
|
# exit 0 — success
|
|
13
13
|
# exit 1 — failure
|
|
14
14
|
#
|
|
15
|
-
# This file lives in ~/.clacky/parsers/ and can be modified by the LLM
|
|
16
|
-
# to add new capabilities (e.g. OCR for scanned PDFs).
|
|
15
|
+
# This file lives in ~/.clacky/parsers/ and can be modified by the LLM.
|
|
17
16
|
#
|
|
18
|
-
#
|
|
17
|
+
# Extraction pipeline (first successful step wins):
|
|
18
|
+
# 1. pdftotext (poppler) — fastest, text-based PDFs
|
|
19
|
+
# 2. pdfplumber (Python) — handles more layouts
|
|
20
|
+
# (→ pdf_parser_plumber.py)
|
|
21
|
+
# 3. OCR (tesseract) — scanned / image-only PDFs
|
|
22
|
+
# (→ pdf_parser_ocr.py)
|
|
23
|
+
#
|
|
24
|
+
# Each extractor is a plain, self-contained function. Python-backed steps
|
|
25
|
+
# shell out to a sibling .py script so the LLM can edit them directly
|
|
26
|
+
# (with proper syntax highlighting, linters, and per-file run/debug)
|
|
27
|
+
# instead of wrestling with embedded heredocs.
|
|
28
|
+
#
|
|
29
|
+
# VERSION: 3
|
|
19
30
|
|
|
20
31
|
require "open3"
|
|
21
32
|
|
|
33
|
+
# Minimum useful output (in bytes). Below this, a step is considered a
|
|
34
|
+
# miss and the next fallback is tried.
|
|
22
35
|
MIN_CONTENT_BYTES = 20
|
|
23
36
|
|
|
37
|
+
# Script directory — resolve sibling .py helpers relative to this file
|
|
38
|
+
# so it works both from the gem's default_parsers/ dir and from the
|
|
39
|
+
# copied-to-user ~/.clacky/parsers/ dir.
|
|
40
|
+
SCRIPT_DIR = File.dirname(File.expand_path(__FILE__))
|
|
41
|
+
|
|
24
42
|
def try_pdftotext(path)
|
|
25
43
|
stdout, _stderr, status = Open3.capture3("pdftotext", "-layout", "-enc", "UTF-8", path, "-")
|
|
26
44
|
return nil unless status.success?
|
|
@@ -32,18 +50,10 @@ rescue Errno::ENOENT
|
|
|
32
50
|
end
|
|
33
51
|
|
|
34
52
|
def try_pdfplumber(path)
|
|
35
|
-
script =
|
|
36
|
-
|
|
37
|
-
with pdfplumber.open(sys.argv[1]) as pdf:
|
|
38
|
-
pages = []
|
|
39
|
-
for i, page in enumerate(pdf.pages, 1):
|
|
40
|
-
t = page.extract_text()
|
|
41
|
-
if t and t.strip():
|
|
42
|
-
pages.append(f"--- Page {i} ---\\n{t.strip()}")
|
|
43
|
-
print("\\n\\n".join(pages))
|
|
44
|
-
PYTHON
|
|
53
|
+
script = File.join(SCRIPT_DIR, "pdf_parser_plumber.py")
|
|
54
|
+
return nil unless File.exist?(script)
|
|
45
55
|
|
|
46
|
-
stdout, _stderr, status = Open3.capture3("python3",
|
|
56
|
+
stdout, _stderr, status = Open3.capture3("python3", script, path)
|
|
47
57
|
return nil unless status.success?
|
|
48
58
|
text = stdout.strip
|
|
49
59
|
return nil if text.bytesize < MIN_CONTENT_BYTES
|
|
@@ -52,6 +62,34 @@ rescue Errno::ENOENT
|
|
|
52
62
|
nil # python3 not available
|
|
53
63
|
end
|
|
54
64
|
|
|
65
|
+
# OCR fallback for scanned/image-only PDFs.
|
|
66
|
+
# See pdf_parser_ocr.py for the actual extraction logic.
|
|
67
|
+
#
|
|
68
|
+
# Installation hints (also printed on final failure):
|
|
69
|
+
# macOS: brew install tesseract tesseract-lang poppler
|
|
70
|
+
# pip3 install pytesseract pdf2image
|
|
71
|
+
# Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils
|
|
72
|
+
# pip3 install pytesseract pdf2image
|
|
73
|
+
# Run the sibling pdf_parser_ocr.py helper on +path+.
#
# Returns the stripped helper output, or nil when tesseract / python3 /
# the helper script is unavailable, the helper exits non-zero (its stderr
# is forwarded via warn), or the output is shorter than MIN_CONTENT_BYTES.
def try_ocr(path)
  # Probe for the tesseract binary first so python is never spawned needlessly.
  _probe_out, _probe_err, probe = Open3.capture3("tesseract", "--version")
  return nil unless probe.success?

  ocr_script = File.join(SCRIPT_DIR, "pdf_parser_ocr.py")
  return nil unless File.exist?(ocr_script)

  out, err, result = Open3.capture3("python3", ocr_script, path)
  if result.success?
    extracted = out.strip
    extracted.bytesize < MIN_CONTENT_BYTES ? nil : extracted
  else
    diagnostics = err.strip
    warn diagnostics unless diagnostics.empty?
    nil
  end
rescue Errno::ENOENT
  nil # tesseract or python3 not available
end
|
|
92
|
+
|
|
55
93
|
# --- main ---
|
|
56
94
|
|
|
57
95
|
path = ARGV[0]
|
|
@@ -66,14 +104,17 @@ unless File.exist?(path)
|
|
|
66
104
|
exit 1
|
|
67
105
|
end
|
|
68
106
|
|
|
69
|
-
|
|
107
|
+
# Try each extractor in order; first non-nil result wins.
|
|
108
|
+
text = try_pdftotext(path) || try_pdfplumber(path) || try_ocr(path)
|
|
70
109
|
|
|
71
110
|
if text
|
|
72
111
|
print text
|
|
73
112
|
exit 0
|
|
74
113
|
else
|
|
75
114
|
warn "Could not extract text from PDF."
|
|
76
|
-
warn "
|
|
77
|
-
warn "For scanned PDFs
|
|
115
|
+
warn "For text-based PDFs, install poppler: brew install poppler (macOS) / apt install poppler-utils (Linux)"
|
|
116
|
+
warn "For scanned PDFs (OCR):"
|
|
117
|
+
warn " macOS: brew install tesseract tesseract-lang poppler && pip3 install pytesseract pdf2image"
|
|
118
|
+
warn " Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils && pip3 install pytesseract pdf2image"
|
|
78
119
|
exit 1
|
|
79
120
|
end
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
pdf_parser_ocr.py — extract text from a scanned/image-only PDF using OCR.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
python3 pdf_parser_ocr.py <file_path>
|
|
7
|
+
|
|
8
|
+
Output:
|
|
9
|
+
stdout — extracted text, one block per page, separated by blank lines
|
|
10
|
+
stderr — error messages
|
|
11
|
+
exit 0 — success (text was extracted)
|
|
12
|
+
exit 1 — failure / no text found
|
|
13
|
+
exit 2 — dependency missing (pytesseract or pdf2image)
|
|
14
|
+
exit 3 — pdf2image couldn't rasterise the PDF (usually missing poppler)
|
|
15
|
+
|
|
16
|
+
Called from pdf_parser.rb as the third-tier fallback (after pdftotext and
|
|
17
|
+
pdfplumber). This script is copied into ~/.clacky/parsers/ and can be
|
|
18
|
+
edited freely by the LLM — common tweaks:
|
|
19
|
+
- Change DPI (higher = better accuracy, slower + more memory)
|
|
20
|
+
- Change OCR_LANG to match your document (e.g. "jpn+eng")
|
|
21
|
+
- Add image preprocessing (deskew, contrast, threshold) before OCR
|
|
22
|
+
- Adjust MAX_PAGES for very large scans
|
|
23
|
+
|
|
24
|
+
Environment variable overrides:
|
|
25
|
+
CLACKY_OCR_LANG — override OCR_LANG (e.g. "eng", "jpn+eng")
|
|
26
|
+
CLACKY_OCR_MAX_PAGES — override MAX_PAGES
|
|
27
|
+
CLACKY_OCR_DPI — override DPI
|
|
28
|
+
|
|
29
|
+
Install:
|
|
30
|
+
macOS: brew install tesseract tesseract-lang poppler
|
|
31
|
+
pip3 install pytesseract pdf2image
|
|
32
|
+
Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils
|
|
33
|
+
pip3 install pytesseract pdf2image
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
# VERSION: 1
|
|
37
|
+
|
|
38
|
+
import os
|
|
39
|
+
import sys
|
|
40
|
+
|
|
41
|
+
# --- Config ---
# Simplified Chinese + English covers most mixed-language documents.
# For pure English scans, "eng" alone is faster and lighter.
OCR_LANG = "chi_sim+eng"

# 200 DPI is a good balance: tesseract's accuracy plateau starts around
# 300 DPI, but memory + time cost scales quadratically. Raise to 300 for
# small fonts or when accuracy matters more than speed.
DPI = 200

# Hard cap on pages to OCR. OCR is slow (~1-3s/page); for huge scans the
# LLM should be told to OCR in chunks instead.
MAX_PAGES = 50


def _env_int(name, default):
    """Return the positive integer held in env var *name*, else *default*.

    An unset or blank variable silently yields *default*. A non-numeric or
    non-positive value also yields *default*, after a one-line warning on
    stderr, instead of crashing with ValueError or being passed on to
    pdf2image.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = int(raw)
    except ValueError:
        value = 0  # funnel into the shared "invalid" branch below
    if value < 1:
        sys.stderr.write(f"Ignoring invalid {name}={raw!r}; using {default}\n")
        return default
    return value


def _ocr_page(pytesseract, image, lang, page_no):
    """OCR one page image and return its stripped text ("" on failure).

    *pytesseract* is the already-imported module, passed in because the
    import happens lazily inside main(). If OCR with *lang* raises
    TesseractError, the page is retried once with English only. If that
    retry fails too (or *lang* already was "eng"), the error is reported
    on stderr and "" is returned so the remaining pages are still
    processed.
    """
    try:
        return pytesseract.image_to_string(image, lang=lang).strip()
    except pytesseract.TesseractError as e:
        # Most common cause: requested language pack not installed.
        sys.stderr.write(f"tesseract error on page {page_no}: {e}\n")

    if lang == "eng":
        # Retrying the identical call would just fail the same way.
        return ""

    try:
        return pytesseract.image_to_string(image, lang="eng").strip()
    except pytesseract.TesseractError as e:
        sys.stderr.write(
            f"tesseract fallback (eng) failed on page {page_no}: {e}\n"
        )
        return ""


def main():
    """Rasterise the PDF named in sys.argv[1], OCR each page, print the text.

    Exit codes:
        0 - text was extracted and printed to stdout
        1 - missing argument, or OCR produced no text
        2 - pytesseract / pdf2image not importable
        3 - pdf2image could not rasterise the PDF
    """
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: pdf_parser_ocr.py <file_path>\n")
        sys.exit(1)

    path = sys.argv[1]

    # Imported lazily so a missing dependency maps to exit code 2 with an
    # install hint rather than an import-time traceback.
    try:
        import pytesseract
        from pdf2image import convert_from_path
    except ImportError as e:
        sys.stderr.write(f"OCR dependencies missing: {e}\n")
        sys.stderr.write("Install with: pip3 install pytesseract pdf2image\n")
        sys.exit(2)

    # A blank CLACKY_OCR_LANG is treated as unset.
    lang = os.environ.get("CLACKY_OCR_LANG") or OCR_LANG
    max_pages = _env_int("CLACKY_OCR_MAX_PAGES", MAX_PAGES)
    dpi = _env_int("CLACKY_OCR_DPI", DPI)

    try:
        images = convert_from_path(path, dpi=dpi, last_page=max_pages)
    except Exception as e:
        sys.stderr.write(f"pdf2image failed: {e}\n")
        sys.stderr.write("Is poppler installed? (brew install poppler / apt install poppler-utils)\n")
        sys.exit(3)

    pages = []
    for i, image in enumerate(images, 1):
        text = _ocr_page(pytesseract, image, lang, i)
        if text:
            pages.append(f"--- Page {i} (OCR) ---\n{text}")

    if not pages:
        sys.stderr.write("OCR produced no text — PDF may be blank or unreadable.\n")
        sys.exit(1)

    print("\n\n".join(pages))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
if __name__ == "__main__":
|
|
103
|
+
main()
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
pdf_parser_plumber.py — extract text from a PDF using pdfplumber.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
python3 pdf_parser_plumber.py <file_path>
|
|
7
|
+
|
|
8
|
+
Output:
|
|
9
|
+
stdout — extracted text, one block per page, separated by blank lines
|
|
10
|
+
stderr — error messages
|
|
11
|
+
exit 0 — success (text was extracted)
|
|
12
|
+
exit 1 — failure / no text found
|
|
13
|
+
exit 2 — dependency missing
|
|
14
|
+
|
|
15
|
+
Called from pdf_parser.rb as the second-tier extractor (after pdftotext).
|
|
16
|
+
This script is copied into ~/.clacky/parsers/ and can be edited freely by
|
|
17
|
+
the LLM — e.g. to tune table extraction, layout heuristics, or filter out
|
|
18
|
+
boilerplate headers/footers. Edit, then re-run to test.
|
|
19
|
+
|
|
20
|
+
Install:
|
|
21
|
+
pip3 install pdfplumber
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
# VERSION: 1
|
|
25
|
+
|
|
26
|
+
import sys
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def main():
    """Print the text of the PDF named in sys.argv[1], one block per page.

    Pages whose extracted text is missing or whitespace-only are skipped.

    Exit codes:
        0 - at least one page yielded text (printed to stdout)
        1 - missing argument, pdfplumber raised, or no page yielded text
        2 - pdfplumber is not importable
    """
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: pdf_parser_plumber.py <file_path>\n")
        sys.exit(1)

    pdf_path = sys.argv[1]

    # Deferred import: a missing package becomes exit code 2 plus a hint.
    try:
        import pdfplumber
    except ImportError as exc:
        sys.stderr.write(f"pdfplumber missing: {exc}\n")
        sys.stderr.write("Install with: pip3 install pdfplumber\n")
        sys.exit(2)

    blocks = []
    try:
        with pdfplumber.open(pdf_path) as document:
            for number, page in enumerate(document.pages, start=1):
                body = (page.extract_text() or "").strip()
                if body:
                    blocks.append(f"--- Page {number} ---\n{body}")
    except Exception as exc:
        sys.stderr.write(f"pdfplumber failed: {exc}\n")
        sys.exit(1)

    if blocks:
        print("\n\n".join(blocks))
        return

    sys.stderr.write("pdfplumber produced no text.\n")
    sys.exit(1)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
if __name__ == "__main__":
|
|
62
|
+
main()
|