openclacky 1.2.12 → 1.2.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.clacky/skills/gem-release/SKILL.md +5 -1
- data/.clacky/skills/gem-release/scripts/release.sh +4 -1
- data/CHANGELOG.md +39 -0
- data/lib/clacky/agent/llm_caller.rb +40 -25
- data/lib/clacky/agent/memory_updater.rb +12 -0
- data/lib/clacky/agent/session_serializer.rb +1 -0
- data/lib/clacky/agent/skill_auto_creator.rb +7 -4
- data/lib/clacky/agent/skill_evolution.rb +23 -5
- data/lib/clacky/agent/skill_manager.rb +86 -1
- data/lib/clacky/agent/skill_reflector.rb +18 -23
- data/lib/clacky/agent.rb +132 -15
- data/lib/clacky/agent_config.rb +183 -22
- data/lib/clacky/cli.rb +55 -0
- data/lib/clacky/client.rb +11 -1
- data/lib/clacky/default_parsers/pdf_parser.rb +70 -86
- data/lib/clacky/default_parsers/pdf_parser_vlm.py +136 -0
- data/lib/clacky/default_skills/persist-memory/SKILL.md +4 -3
- data/lib/clacky/default_skills/search-skills/SKILL.md +61 -0
- data/lib/clacky/idle_compression_timer.rb +1 -1
- data/lib/clacky/message_format/open_ai.rb +7 -1
- data/lib/clacky/openai_stream_aggregator.rb +4 -1
- data/lib/clacky/providers.rb +77 -12
- data/lib/clacky/server/http_server.rb +296 -7
- data/lib/clacky/server/session_registry.rb +30 -8
- data/lib/clacky/server/web_ui_controller.rb +24 -1
- data/lib/clacky/session_manager.rb +120 -0
- data/lib/clacky/tools/web_search.rb +59 -8
- data/lib/clacky/ui2/layout_manager.rb +15 -5
- data/lib/clacky/ui2/progress_handle.rb +18 -8
- data/lib/clacky/ui2/ui_controller.rb +27 -0
- data/lib/clacky/ui_interface.rb +22 -0
- data/lib/clacky/utils/model_pricing.rb +96 -0
- data/lib/clacky/version.rb +1 -1
- data/lib/clacky/vision/resolver.rb +157 -0
- data/lib/clacky/web/app.css +209 -4
- data/lib/clacky/web/app.js +6 -5
- data/lib/clacky/web/i18n.js +22 -6
- data/lib/clacky/web/index.html +2 -1
- data/lib/clacky/web/sessions.js +408 -80
- data/lib/clacky/web/settings.js +241 -60
- data/lib/clacky/web/skills.js +5 -14
- data/lib/clacky/web/utils.js +57 -0
- data/lib/clacky/web/ws-dispatcher.js +136 -0
- data/lib/clacky.rb +1 -0
- metadata +6 -2
|
@@ -8,113 +8,97 @@
|
|
|
8
8
|
#
|
|
9
9
|
# Output:
|
|
10
10
|
# stdout — extracted text content (UTF-8)
|
|
11
|
-
# stderr — error messages
|
|
11
|
+
# stderr — error / progress messages
|
|
12
12
|
# exit 0 — success
|
|
13
|
-
# exit 1 — failure
|
|
13
|
+
# exit 1 — hard failure (file unreadable, pdftotext missing, etc.)
|
|
14
14
|
#
|
|
15
|
-
#
|
|
15
|
+
# Strategy
|
|
16
|
+
# --------
|
|
17
|
+
# PDF pages naturally fall into two kinds: pages with a real text layer,
|
|
18
|
+
# and scanned-image pages. The right tool is a per-page property, not a
|
|
19
|
+
# document-level one. So:
|
|
16
20
|
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
# (→ pdf_parser_ocr.py)
|
|
21
|
+
# 1. Run pdftotext once over the whole file (`-layout`), split by `\f`.
|
|
22
|
+
# 2. Pages with enough bytes → emit text directly.
|
|
23
|
+
# 3. Pages below threshold → list page numbers in a Notice section
|
|
24
|
+
# with a shell command template the agent can run on demand to
|
|
25
|
+
# render a specific page to PNG, then file_reader that PNG.
|
|
23
26
|
#
|
|
24
|
-
#
|
|
25
|
-
#
|
|
26
|
-
#
|
|
27
|
-
#
|
|
27
|
+
# The parser does NOT pre-render images. Most weak pages will never be
|
|
28
|
+
# read (the answer is often already in the text-layer pages). Rendering
|
|
29
|
+
# all of them up front is wasteful — 55 pages takes ~14s and most goes
|
|
30
|
+
# to waste. The agent decides when (and which page) to OCR based on the
|
|
31
|
+
# user's actual question.
|
|
28
32
|
#
|
|
29
|
-
# VERSION:
|
|
33
|
+
# VERSION: 6
|
|
30
34
|
|
|
31
35
|
require "open3"
|
|
32
36
|
|
|
33
|
-
|
|
34
|
-
# miss and the next fallback is tried.
|
|
35
|
-
MIN_CONTENT_BYTES = 20
|
|
37
|
+
MIN_PAGE_BYTES = 20
|
|
36
38
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
SCRIPT_DIR = File.dirname(File.expand_path(__FILE__))
|
|
41
|
-
|
|
42
|
-
def try_pdftotext(path)
|
|
43
|
-
stdout, _stderr, status = Open3.capture3("pdftotext", "-layout", "-enc", "UTF-8", path, "-")
|
|
44
|
-
return nil unless status.success?
|
|
45
|
-
text = stdout.strip
|
|
46
|
-
return nil if text.bytesize < MIN_CONTENT_BYTES
|
|
47
|
-
text
|
|
48
|
-
rescue Errno::ENOENT
|
|
49
|
-
nil # pdftotext not installed
|
|
50
|
-
end
|
|
51
|
-
|
|
52
|
-
def try_pdfplumber(path)
|
|
53
|
-
script = File.join(SCRIPT_DIR, "pdf_parser_plumber.py")
|
|
54
|
-
return nil unless File.exist?(script)
|
|
55
|
-
|
|
56
|
-
stdout, _stderr, status = Open3.capture3("python3", script, path)
|
|
57
|
-
return nil unless status.success?
|
|
58
|
-
text = stdout.strip
|
|
59
|
-
return nil if text.bytesize < MIN_CONTENT_BYTES
|
|
60
|
-
text
|
|
61
|
-
rescue Errno::ENOENT
|
|
62
|
-
nil # python3 not available
|
|
39
|
+
def die(msg)
|
|
40
|
+
warn msg
|
|
41
|
+
exit 1
|
|
63
42
|
end
|
|
64
43
|
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
# macOS: brew install tesseract tesseract-lang poppler
|
|
70
|
-
# pip3 install pytesseract pdf2image
|
|
71
|
-
# Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils
|
|
72
|
-
# pip3 install pytesseract pdf2image
|
|
73
|
-
def try_ocr(path)
|
|
74
|
-
# Quick capability check — avoid spawning python if tesseract is missing.
|
|
75
|
-
_stdout, _stderr, status = Open3.capture3("tesseract", "--version")
|
|
76
|
-
return nil unless status.success?
|
|
77
|
-
|
|
78
|
-
script = File.join(SCRIPT_DIR, "pdf_parser_ocr.py")
|
|
79
|
-
return nil unless File.exist?(script)
|
|
80
|
-
|
|
81
|
-
stdout, stderr, status = Open3.capture3("python3", script, path)
|
|
44
|
+
def pdftotext_pages(path)
|
|
45
|
+
stdout, stderr, status = Open3.capture3(
|
|
46
|
+
"pdftotext", "-layout", "-enc", "UTF-8", path, "-"
|
|
47
|
+
)
|
|
82
48
|
unless status.success?
|
|
83
|
-
warn
|
|
49
|
+
warn "pdftotext failed: #{stderr.strip}"
|
|
84
50
|
return nil
|
|
85
51
|
end
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
52
|
+
pages = stdout.split("\f", -1)
|
|
53
|
+
pages.pop if pages.last && pages.last.strip.empty?
|
|
54
|
+
pages.map(&:strip)
|
|
89
55
|
rescue Errno::ENOENT
|
|
90
|
-
|
|
56
|
+
warn "pdftotext not found. Install poppler (`brew install poppler` / `apt install poppler-utils`)."
|
|
57
|
+
nil
|
|
91
58
|
end
|
|
92
59
|
|
|
93
|
-
|
|
60
|
+
def main(argv)
|
|
61
|
+
die "Usage: pdf_parser.rb <file_path>" if argv.empty?
|
|
62
|
+
path = argv[0]
|
|
63
|
+
die "File not found: #{path}" unless File.file?(path)
|
|
94
64
|
|
|
95
|
-
|
|
65
|
+
pages = pdftotext_pages(path)
|
|
66
|
+
die "Could not extract text from PDF." if pages.nil?
|
|
96
67
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
68
|
+
weak = []
|
|
69
|
+
body_chunks = []
|
|
70
|
+
pages.each_with_index do |text, idx|
|
|
71
|
+
n = idx + 1
|
|
72
|
+
if text.bytesize >= MIN_PAGE_BYTES
|
|
73
|
+
body_chunks << "--- Page #{n} ---\n\n#{text}"
|
|
74
|
+
else
|
|
75
|
+
body_chunks << "--- Page #{n} ---\n\n[no extractable text layer]"
|
|
76
|
+
weak << n
|
|
77
|
+
end
|
|
78
|
+
end
|
|
101
79
|
|
|
102
|
-
|
|
103
|
-
warn "File not found: #{path}"
|
|
104
|
-
exit 1
|
|
105
|
-
end
|
|
80
|
+
output = body_chunks.join("\n\n")
|
|
106
81
|
|
|
107
|
-
|
|
108
|
-
|
|
82
|
+
if weak.any?
|
|
83
|
+
abs_path = File.expand_path(path)
|
|
84
|
+
notice = +"\n\n--- Notice ---\n\n"
|
|
85
|
+
notice << "#{weak.size} of #{pages.size} pages have no extractable text layer "
|
|
86
|
+
notice << "(likely scanned images).\n"
|
|
87
|
+
notice << "Pages without text: #{weak.join(', ')}\n\n"
|
|
88
|
+
notice << "To OCR a specific page, render it to PNG via shell, then "
|
|
89
|
+
notice << "file_reader the PNG (it will be transcribed via the "
|
|
90
|
+
notice << "vision/OCR pipeline):\n\n"
|
|
91
|
+
notice << " pdftoppm -r 150 -f <N> -l <N> -png -singlefile "
|
|
92
|
+
notice << "#{abs_path.inspect} /tmp/clacky-pdf-page-<N>\n"
|
|
93
|
+
notice << " # produces /tmp/clacky-pdf-page-<N>.png\n\n"
|
|
94
|
+
notice << "Only render pages you actually need. If the user's question "
|
|
95
|
+
notice << "is already answered by the extracted text above, skip OCR.\n"
|
|
96
|
+
output << notice
|
|
97
|
+
end
|
|
109
98
|
|
|
110
|
-
|
|
111
|
-
|
|
99
|
+
$stdout.write(output)
|
|
100
|
+
$stdout.write("\n") unless output.end_with?("\n")
|
|
112
101
|
exit 0
|
|
113
|
-
else
|
|
114
|
-
warn "Could not extract text from PDF."
|
|
115
|
-
warn "For text-based PDFs, install poppler: brew install poppler (macOS) / apt install poppler-utils (Linux)"
|
|
116
|
-
warn "For scanned PDFs (OCR):"
|
|
117
|
-
warn " macOS: brew install tesseract tesseract-lang poppler && pip3 install pytesseract pdf2image"
|
|
118
|
-
warn " Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils && pip3 install pytesseract pdf2image"
|
|
119
|
-
exit 1
|
|
120
102
|
end
|
|
103
|
+
|
|
104
|
+
main(ARGV) if __FILE__ == $PROGRAM_NAME
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Clacky PDF Parser — VLM (Vision Language Model) extractor
|
|
5
|
+
|
|
6
|
+
Renders each PDF page to PNG via pdftoppm (poppler), then asks the
|
|
7
|
+
configured OCR sidecar (e.g. gemini-3-5-flash, gpt-4o-mini) to transcribe
|
|
8
|
+
each page through the local Clacky server's internal OCR endpoint.
|
|
9
|
+
|
|
10
|
+
Why through HTTP and not direct API call?
|
|
11
|
+
The OCR sidecar config (model, base_url, api_key) lives in the agent's
|
|
12
|
+
~/.clacky/config.yml. We don't re-implement that lookup here — instead
|
|
13
|
+
the local Clacky server exposes /api/internal/ocr-image which already
|
|
14
|
+
has the agent_config in scope. This parser stays a thin client.
|
|
15
|
+
|
|
16
|
+
Usage:
|
|
17
|
+
python3 pdf_parser_vlm.py <file_path>
|
|
18
|
+
|
|
19
|
+
Stdout: extracted text (UTF-8), pages separated by `\\n\\n--- Page N ---\\n\\n`
|
|
20
|
+
Stderr: progress + error messages
|
|
21
|
+
Exit: 0 on success, 1 on failure (server unavailable, no sidecar, etc.)
|
|
22
|
+
|
|
23
|
+
Environment:
|
|
24
|
+
CLACKY_SERVER_HOST default 127.0.0.1
|
|
25
|
+
CLACKY_SERVER_PORT default 7070
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
import json
|
|
29
|
+
import os
|
|
30
|
+
import subprocess
|
|
31
|
+
import sys
|
|
32
|
+
import tempfile
|
|
33
|
+
import urllib.error
|
|
34
|
+
import urllib.request
|
|
35
|
+
|
|
36
|
+
PAGE_SEPARATOR = "\n\n--- Page {n} ---\n\n"
|
|
37
|
+
RENDER_DPI = 150
|
|
38
|
+
REQUEST_TIMEOUT = 120 # seconds; VLMs can be slow
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def server_url():
|
|
42
|
+
host = os.environ.get("CLACKY_SERVER_HOST", "127.0.0.1")
|
|
43
|
+
port = os.environ.get("CLACKY_SERVER_PORT", "7070")
|
|
44
|
+
return f"http://{host}:{port}/api/internal/ocr-image"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def render_pages(pdf_path, out_dir):
|
|
48
|
+
prefix = os.path.join(out_dir, "page")
|
|
49
|
+
cmd = ["pdftoppm", "-r", str(RENDER_DPI), "-png", pdf_path, prefix]
|
|
50
|
+
proc = subprocess.run(cmd, capture_output=True, text=True)
|
|
51
|
+
if proc.returncode != 0:
|
|
52
|
+
sys.stderr.write(f"pdftoppm failed: {proc.stderr.strip()}\n")
|
|
53
|
+
return []
|
|
54
|
+
pages = sorted(
|
|
55
|
+
os.path.join(out_dir, f) for f in os.listdir(out_dir)
|
|
56
|
+
if f.startswith("page-") and f.endswith(".png")
|
|
57
|
+
)
|
|
58
|
+
return pages
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def transcribe_page(image_path, page_num):
|
|
62
|
+
with open(image_path, "rb") as f:
|
|
63
|
+
body = f.read()
|
|
64
|
+
|
|
65
|
+
boundary = "----clacky-vlm-boundary"
|
|
66
|
+
parts = []
|
|
67
|
+
parts.append(f"--{boundary}\r\n".encode())
|
|
68
|
+
parts.append(
|
|
69
|
+
b'Content-Disposition: form-data; name="image"; filename="page.png"\r\n'
|
|
70
|
+
b"Content-Type: image/png\r\n\r\n"
|
|
71
|
+
)
|
|
72
|
+
parts.append(body)
|
|
73
|
+
parts.append(f"\r\n--{boundary}\r\n".encode())
|
|
74
|
+
parts.append(
|
|
75
|
+
b'Content-Disposition: form-data; name="prompt"\r\n\r\n'
|
|
76
|
+
)
|
|
77
|
+
parts.append(
|
|
78
|
+
f"This is page {page_num} of a scanned PDF. Extract every legible text "
|
|
79
|
+
"verbatim, preserving reading order. Render tables as Markdown tables. "
|
|
80
|
+
"Skip decorative elements. Output plain Markdown only — no commentary."
|
|
81
|
+
.encode()
|
|
82
|
+
)
|
|
83
|
+
parts.append(f"\r\n--{boundary}--\r\n".encode())
|
|
84
|
+
payload = b"".join(parts)
|
|
85
|
+
|
|
86
|
+
req = urllib.request.Request(
|
|
87
|
+
server_url(),
|
|
88
|
+
data=payload,
|
|
89
|
+
headers={"Content-Type": f"multipart/form-data; boundary={boundary}"},
|
|
90
|
+
method="POST",
|
|
91
|
+
)
|
|
92
|
+
try:
|
|
93
|
+
with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as resp:
|
|
94
|
+
data = json.loads(resp.read().decode("utf-8"))
|
|
95
|
+
except urllib.error.URLError as e:
|
|
96
|
+
sys.stderr.write(f"page {page_num}: server unreachable ({e})\n")
|
|
97
|
+
return None
|
|
98
|
+
except Exception as e:
|
|
99
|
+
sys.stderr.write(f"page {page_num}: {e}\n")
|
|
100
|
+
return None
|
|
101
|
+
|
|
102
|
+
if not data.get("ok"):
|
|
103
|
+
sys.stderr.write(f"page {page_num}: {data.get('message', 'unknown error')}\n")
|
|
104
|
+
return None
|
|
105
|
+
return data.get("text", "")
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def main():
|
|
109
|
+
if len(sys.argv) != 2:
|
|
110
|
+
sys.stderr.write("Usage: pdf_parser_vlm.py <file_path>\n")
|
|
111
|
+
sys.exit(1)
|
|
112
|
+
path = sys.argv[1]
|
|
113
|
+
if not os.path.exists(path):
|
|
114
|
+
sys.stderr.write(f"File not found: {path}\n")
|
|
115
|
+
sys.exit(1)
|
|
116
|
+
|
|
117
|
+
with tempfile.TemporaryDirectory(prefix="clacky_vlm_") as tmp:
|
|
118
|
+
pages = render_pages(path, tmp)
|
|
119
|
+
if not pages:
|
|
120
|
+
sys.stderr.write("Failed to render PDF pages (is poppler installed?)\n")
|
|
121
|
+
sys.exit(1)
|
|
122
|
+
|
|
123
|
+
sys.stderr.write(f"VLM OCR: {len(pages)} page(s) to transcribe...\n")
|
|
124
|
+
chunks = []
|
|
125
|
+
for i, page in enumerate(pages, 1):
|
|
126
|
+
text = transcribe_page(page, i)
|
|
127
|
+
if text is None:
|
|
128
|
+
# Server unreachable / no sidecar — bail so caller falls back.
|
|
129
|
+
sys.exit(1)
|
|
130
|
+
chunks.append(PAGE_SEPARATOR.format(n=i) + text)
|
|
131
|
+
|
|
132
|
+
sys.stdout.write("".join(chunks).strip())
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
if __name__ == "__main__":
|
|
136
|
+
main()
|
|
@@ -48,10 +48,11 @@ Scan the list above:
|
|
|
48
48
|
|
|
49
49
|
Use the `write` tool. Always include the YAML frontmatter shown above.
|
|
50
50
|
|
|
51
|
-
##
|
|
51
|
+
## Guidelines
|
|
52
52
|
|
|
53
|
-
-
|
|
54
|
-
- If
|
|
53
|
+
- Aim for around 4000 characters of content (after the frontmatter). This is a soft target — moderate overshoot is fine, do NOT iterate writes just to shave characters.
|
|
54
|
+
- If a file grows much larger than that (say, well past 8000), trim the least important information rather than splitting one topic across multiple files.
|
|
55
|
+
- Prefer merging into an existing file over creating a new one. Only create a new file when no existing topic genuinely covers the area.
|
|
55
56
|
- Write concise, factual Markdown — no fluff, no redundant headings.
|
|
56
57
|
- One topic per file. Don't bundle unrelated facts together.
|
|
57
58
|
- Do NOT use `terminal` or `file_reader` to list the memories directory — the list above is authoritative.
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: search-skills
|
|
3
|
+
description: 'Search ALL installed skills (including ones not shown in AVAILABLE SKILLS) by keyword. Use this whenever you suspect a fitting skill might exist but is not listed in your system prompt — for example before building a new skill, when the user mentions a domain not covered by visible skills, or after seeing the (N more skills installed) hint. Triggers on phrases like search skills, find a skill for, is there a skill that, 查找skill, 有没有skill做.'
|
|
4
|
+
disable-model-invocation: false
|
|
5
|
+
user-invocable: true
|
|
6
|
+
fork_agent: true
|
|
7
|
+
auto_summarize: true
|
|
8
|
+
forbidden_tools:
|
|
9
|
+
- write
|
|
10
|
+
- edit
|
|
11
|
+
- terminal
|
|
12
|
+
- web_search
|
|
13
|
+
- web_fetch
|
|
14
|
+
- browser
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
# Search Skills Subagent
|
|
18
|
+
|
|
19
|
+
You are a Skill Search Subagent. Given a keyword or topic from the parent agent, scan the complete list of installed skills below and return the best matches.
|
|
20
|
+
|
|
21
|
+
The AVAILABLE SKILLS section in the parent agent system prompt is capped (~30 entries). The list below is the FULL list — pre-loaded for you, no scanning required. Your job is to look beyond that cap so the parent does not redundantly create a new skill when one already exists.
|
|
22
|
+
|
|
23
|
+
## Complete Skill Inventory
|
|
24
|
+
|
|
25
|
+
This list was pre-loaded — do NOT re-scan the filesystem or call any tools.
|
|
26
|
+
|
|
27
|
+
<%= all_skills_meta %>
|
|
28
|
+
|
|
29
|
+
## Workflow
|
|
30
|
+
|
|
31
|
+
### Step 1 — Extract keywords
|
|
32
|
+
|
|
33
|
+
Pull 2-4 keywords from the input task. Both English and Chinese terms are valid (skill descriptions are bilingual).
|
|
34
|
+
|
|
35
|
+
### Step 2 — Match against the inventory above
|
|
36
|
+
|
|
37
|
+
For each skill in the inventory, judge relevance against the keywords:
|
|
38
|
+
- Strong match: keyword appears in the skill `name` or clearly in the `description`'s purpose statement
|
|
39
|
+
- Weak match: keyword appears only in the trigger examples or peripheral mentions
|
|
40
|
+
|
|
41
|
+
### Step 3 — Return a ranked summary
|
|
42
|
+
|
|
43
|
+
Return at most 5 results, strongest matches first:
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
Found N matching skill(s) for: <keywords>
|
|
47
|
+
|
|
48
|
+
1. <name> (<source>)
|
|
49
|
+
<description trimmed to ~200 chars>
|
|
50
|
+
|
|
51
|
+
2. ...
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
If nothing genuinely matches, return exactly: `No installed skill matches: <task>`
|
|
55
|
+
|
|
56
|
+
## Rules
|
|
57
|
+
|
|
58
|
+
- Do NOT invoke any tool. The inventory above is authoritative; just match and return.
|
|
59
|
+
- Do NOT recommend creating a new skill — that is the parent agent's call.
|
|
60
|
+
- If the task is vague, return what genuinely matched, do not invent relevance.
|
|
61
|
+
- Default skills (built-in) are part of the inventory but typically also visible to the parent — flagging them is still useful as a reminder.
|
|
@@ -17,7 +17,7 @@ module Clacky
|
|
|
17
17
|
# Seconds of inactivity before idle compression is triggered.
|
|
18
18
|
# Kept under the 5-minute prompt cache TTL so the compression call itself
|
|
19
19
|
# still hits the existing prefix cache.
|
|
20
|
-
IDLE_DELAY =
|
|
20
|
+
IDLE_DELAY = 266
|
|
21
21
|
|
|
22
22
|
# @param agent [Clacky::Agent] the agent whose messages will be compressed
|
|
23
23
|
# @param session_manager [Clacky::SessionManager, nil] used to persist session after compression
|
|
@@ -206,7 +206,13 @@ module Clacky
|
|
|
206
206
|
# Skip malformed tool calls where name or arguments is nil (broken API response)
|
|
207
207
|
next if name.nil? || arguments.nil?
|
|
208
208
|
|
|
209
|
-
{ id: call["id"], type: call["type"], name: name, arguments: arguments }
|
|
209
|
+
tc = { id: call["id"], type: call["type"], name: name, arguments: arguments }
|
|
210
|
+
# Vertex Gemini's OpenAI shim returns thought_signature inside
|
|
211
|
+
# tool_calls[i].extra_content.google and requires it echoed back on
|
|
212
|
+
# replay, otherwise the next turn 400s with "Function call is missing
|
|
213
|
+
# a thought_signature". Preserve it through the canonical layer.
|
|
214
|
+
tc[:extra_content] = call["extra_content"] if call["extra_content"]
|
|
215
|
+
tc
|
|
210
216
|
end
|
|
211
217
|
end
|
|
212
218
|
|
|
@@ -72,7 +72,7 @@ module Clacky
|
|
|
72
72
|
def to_h
|
|
73
73
|
tool_calls = @tool_calls.keys.sort.map do |idx|
|
|
74
74
|
tc = @tool_calls[idx]
|
|
75
|
-
{
|
|
75
|
+
out = {
|
|
76
76
|
"id" => tc[:id],
|
|
77
77
|
"type" => tc[:type] || "function",
|
|
78
78
|
"function" => {
|
|
@@ -80,6 +80,8 @@ module Clacky
|
|
|
80
80
|
"arguments" => tc[:arguments].to_s
|
|
81
81
|
}
|
|
82
82
|
}
|
|
83
|
+
out["extra_content"] = tc[:extra_content] if tc[:extra_content]
|
|
84
|
+
out
|
|
83
85
|
end
|
|
84
86
|
|
|
85
87
|
message = {
|
|
@@ -104,6 +106,7 @@ module Clacky
|
|
|
104
106
|
slot[:name] ||= fn["name"] if fn["name"]
|
|
105
107
|
slot[:arguments] << fn["arguments"].to_s if fn["arguments"]
|
|
106
108
|
end
|
|
109
|
+
slot[:extra_content] = tc["extra_content"] if tc["extra_content"]
|
|
107
110
|
end
|
|
108
111
|
|
|
109
112
|
private def parse_or_nil(s)
|
data/lib/clacky/providers.rb
CHANGED
|
@@ -39,19 +39,30 @@ module Clacky
|
|
|
39
39
|
"abs-claude-haiku-4-5",
|
|
40
40
|
"dsk-deepseek-v4-pro",
|
|
41
41
|
"dsk-deepseek-v4-flash",
|
|
42
|
-
"or-gemini-3-1-pro"
|
|
42
|
+
"or-gemini-3-1-pro",
|
|
43
|
+
"or-gemini-3-5-flash"
|
|
43
44
|
],
|
|
44
45
|
# Image generation models served by the openclacky platform
|
|
45
46
|
# gateway. The gateway exposes a standard OpenAI-compatible
|
|
46
47
|
# /v1/images/generations endpoint, so the same OpenAICompat
|
|
47
|
-
# provider class handles them. `or-` prefix
|
|
48
|
-
#
|
|
49
|
-
#
|
|
48
|
+
# provider class handles them. `or-` prefix is a routing alias
|
|
49
|
+
# only — the platform may dispatch to OpenRouter or Vertex AI
|
|
50
|
+
# (Gemini Nano Banana family) depending on the model.
|
|
50
51
|
"image_models" => [
|
|
51
52
|
"or-gemini-3-pro-image",
|
|
53
|
+
"or-gemini-3-1-flash-image",
|
|
52
54
|
"or-gpt-image-2"
|
|
53
55
|
],
|
|
56
|
+
"image_model_aliases" => {
|
|
57
|
+
"or-gemini-3-pro-image" => "Nano Banana Pro",
|
|
58
|
+
"or-gemini-3-1-flash-image" => "Nano Banana 2",
|
|
59
|
+
"or-gpt-image-2" => "GPT Image 2"
|
|
60
|
+
},
|
|
54
61
|
"default_image_model" => "or-gpt-image-2",
|
|
62
|
+
# Default OCR sidecar — used when the primary model is text-only.
|
|
63
|
+
# Candidates are derived from the provider's vision-capable models;
|
|
64
|
+
# this just picks the cheap+fast default to surface in "auto" mode.
|
|
65
|
+
"default_ocr_model" => "or-gemini-3-5-flash",
|
|
55
66
|
# Provider-level default: the Claude family served here is vision-capable.
|
|
56
67
|
"capabilities" => { "vision" => true }.freeze,
|
|
57
68
|
# Model-level overrides: DeepSeek models routed through this provider
|
|
@@ -65,20 +76,17 @@ module Clacky
|
|
|
65
76
|
# Per-primary lite pairing: keys are "strong" primary models, values
|
|
66
77
|
# are the lite sidekick to auto-inject when that primary is the
|
|
67
78
|
# default. Lite is consumed by some subagents for cheap/fast work;
|
|
68
|
-
# weak models (haiku / v4-flash) ARE the lite tier
|
|
69
|
-
# they're intentionally not listed here
|
|
70
|
-
# the default model is already lite-class.
|
|
71
|
-
#
|
|
72
|
-
# or-gemini-3-1-pro is intentionally absent: Gemini has no lite
|
|
73
|
-
# sibling wired up (yet) on this provider; subagents using the
|
|
74
|
-
# Gemini default will just reuse it for lite work until we add one.
|
|
79
|
+
# weak models (haiku / v4-flash / 3-5-flash) ARE the lite tier
|
|
80
|
+
# themselves, so they're intentionally not listed here as keys —
|
|
81
|
+
# no injection happens when the default model is already lite-class.
|
|
75
82
|
"lite_models" => {
|
|
76
83
|
"abs-claude-opus-4-8" => "abs-claude-haiku-4-5",
|
|
77
84
|
"abs-claude-opus-4-7" => "abs-claude-haiku-4-5",
|
|
78
85
|
"abs-claude-opus-4-6" => "abs-claude-haiku-4-5",
|
|
79
86
|
"abs-claude-sonnet-4-6" => "abs-claude-haiku-4-5",
|
|
80
87
|
"abs-claude-sonnet-4-5" => "abs-claude-haiku-4-5",
|
|
81
|
-
"dsk-deepseek-v4-pro" => "dsk-deepseek-v4-flash"
|
|
88
|
+
"dsk-deepseek-v4-pro" => "dsk-deepseek-v4-flash",
|
|
89
|
+
"or-gemini-3-1-pro" => "or-gemini-3-5-flash"
|
|
82
90
|
},
|
|
83
91
|
# Fallback chain: if a model is unavailable, try the next one in order.
|
|
84
92
|
# Keys are primary model names; values are the fallback model to use instead.
|
|
@@ -141,6 +149,7 @@ module Clacky
|
|
|
141
149
|
# until we ship a dedicated client-side adapter for that protocol.
|
|
142
150
|
"image_models" => [],
|
|
143
151
|
"default_image_model" => nil,
|
|
152
|
+
"default_ocr_model" => "google/gemini-2.5-flash",
|
|
144
153
|
"website_url" => "https://openrouter.ai/keys"
|
|
145
154
|
}.freeze,
|
|
146
155
|
|
|
@@ -188,6 +197,7 @@ module Clacky
|
|
|
188
197
|
"model_capabilities" => {
|
|
189
198
|
"MiniMax-M3" => { "vision" => true }.freeze
|
|
190
199
|
}.freeze,
|
|
200
|
+
"default_ocr_model" => "MiniMax-M3",
|
|
191
201
|
"website_url" => "https://www.minimaxi.com/user-center/basic-information/interface-key"
|
|
192
202
|
}.freeze,
|
|
193
203
|
|
|
@@ -214,6 +224,7 @@ module Clacky
|
|
|
214
224
|
].freeze,
|
|
215
225
|
# k2.5 / k2.6 are multimodal; legacy k2 text-only models need model_capabilities override if added.
|
|
216
226
|
"capabilities" => { "vision" => true }.freeze,
|
|
227
|
+
"default_ocr_model" => "kimi-k2.5",
|
|
217
228
|
"website_url" => "https://platform.moonshot.cn/console/api-keys"
|
|
218
229
|
}.freeze,
|
|
219
230
|
|
|
@@ -261,6 +272,7 @@ module Clacky
|
|
|
261
272
|
"api" => "anthropic-messages",
|
|
262
273
|
"default_model" => "claude-sonnet-4-6",
|
|
263
274
|
"models" => ["claude-opus-4-8", "claude-opus-4-7", "claude-opus-4-6", "claude-sonnet-4-6", "claude-haiku-4-5"],
|
|
275
|
+
"default_ocr_model" => "claude-haiku-4-5",
|
|
264
276
|
"website_url" => "https://console.anthropic.com/settings/keys"
|
|
265
277
|
}.freeze,
|
|
266
278
|
|
|
@@ -275,6 +287,7 @@ module Clacky
|
|
|
275
287
|
"model_capabilities" => {
|
|
276
288
|
"mimo-v2-omni" => { "vision" => true }.freeze
|
|
277
289
|
}.freeze,
|
|
290
|
+
"default_ocr_model" => "mimo-v2-omni",
|
|
278
291
|
"website_url" => "https://platform.xiaomimimo.com/"
|
|
279
292
|
}.freeze,
|
|
280
293
|
|
|
@@ -304,6 +317,7 @@ module Clacky
|
|
|
304
317
|
"model_capabilities" => {
|
|
305
318
|
"glm-5v-turbo" => { "vision" => true }.freeze
|
|
306
319
|
}.freeze,
|
|
320
|
+
"default_ocr_model" => "glm-5v-turbo",
|
|
307
321
|
"website_url" => "https://open.bigmodel.cn/usercenter/apikeys"
|
|
308
322
|
}.freeze,
|
|
309
323
|
|
|
@@ -334,6 +348,7 @@ module Clacky
|
|
|
334
348
|
"gpt-image-2"
|
|
335
349
|
],
|
|
336
350
|
"default_image_model" => "gpt-image-2",
|
|
351
|
+
"default_ocr_model" => "gpt-5.4-mini",
|
|
337
352
|
"website_url" => "https://platform.openai.com/api-keys"
|
|
338
353
|
}.freeze,
|
|
339
354
|
|
|
@@ -359,6 +374,7 @@ module Clacky
|
|
|
359
374
|
"model_capabilities" => {
|
|
360
375
|
"qwen3.7-max" => { "vision" => false }.freeze
|
|
361
376
|
}.freeze,
|
|
377
|
+
"default_ocr_model" => "qwen3.6-flash",
|
|
362
378
|
"lite_models" => {
|
|
363
379
|
"qwen3.7-max" => "qwen3.6-flash",
|
|
364
380
|
"qwen3.6-plus" => "qwen3.6-flash",
|
|
@@ -487,6 +503,30 @@ module Clacky
|
|
|
487
503
|
preset&.dig("image_models") || []
|
|
488
504
|
end
|
|
489
505
|
|
|
506
|
+
def image_model_aliases(provider_id)
|
|
507
|
+
preset = PRESETS[provider_id]
|
|
508
|
+
preset&.dig("image_model_aliases") || {}
|
|
509
|
+
end
|
|
510
|
+
|
|
511
|
+
def video_model_aliases(provider_id)
|
|
512
|
+
preset = PRESETS[provider_id]
|
|
513
|
+
preset&.dig("video_model_aliases") || {}
|
|
514
|
+
end
|
|
515
|
+
|
|
516
|
+
def audio_model_aliases(provider_id)
|
|
517
|
+
preset = PRESETS[provider_id]
|
|
518
|
+
preset&.dig("audio_model_aliases") || {}
|
|
519
|
+
end
|
|
520
|
+
|
|
521
|
+
def media_model_aliases(provider_id, kind)
|
|
522
|
+
case kind.to_s
|
|
523
|
+
when "image" then image_model_aliases(provider_id)
|
|
524
|
+
when "video" then video_model_aliases(provider_id)
|
|
525
|
+
when "audio" then audio_model_aliases(provider_id)
|
|
526
|
+
else {}
|
|
527
|
+
end
|
|
528
|
+
end
|
|
529
|
+
|
|
490
530
|
# Video generation models — placeholder. No provider supports video
|
|
491
531
|
# via Clacky yet; once they do, declare "video_models" alongside
|
|
492
532
|
# "image_models" in the relevant PRESETS entry and this returns it.
|
|
@@ -501,6 +541,31 @@ module Clacky
|
|
|
501
541
|
preset&.dig("audio_models") || []
|
|
502
542
|
end
|
|
503
543
|
|
|
544
|
+
# OCR sidecar candidates: every chat model under this provider that's
|
|
545
|
+
# vision-capable. Derived from `vision` capability so we don't have
|
|
546
|
+
# to maintain a parallel list — a model that can see is by definition
|
|
547
|
+
# a candidate for "describe an image as text". Image-generation models
|
|
548
|
+
# are excluded (they take prompts and return pixels, not the other way).
|
|
549
|
+
# @param provider_id [String]
|
|
550
|
+
# @return [Array<String>]
|
|
551
|
+
def ocr_models(provider_id)
|
|
552
|
+
preset = PRESETS[provider_id]
|
|
553
|
+
return [] unless preset
|
|
554
|
+
(preset["models"] || []).select { |m| supports?(provider_id, :vision, model_name: m) }
|
|
555
|
+
end
|
|
556
|
+
|
|
557
|
+
# Default OCR sidecar model for a provider. Falls back to the first
|
|
558
|
+
# vision-capable model if the preset doesn't pin an explicit default.
|
|
559
|
+
# @param provider_id [String]
|
|
560
|
+
# @return [String, nil] nil when the provider has zero vision-capable models
|
|
561
|
+
def default_ocr_model(provider_id)
|
|
562
|
+
preset = PRESETS[provider_id]
|
|
563
|
+
return nil unless preset
|
|
564
|
+
explicit = preset["default_ocr_model"]
|
|
565
|
+
return explicit if explicit && ocr_models(provider_id).include?(explicit)
|
|
566
|
+
ocr_models(provider_id).first
|
|
567
|
+
end
|
|
568
|
+
|
|
504
569
|
# Unified entry for media model lookup by kind.
|
|
505
570
|
# @param provider_id [String]
|
|
506
571
|
# @param kind [String] one of "image" / "video" / "audio"
|