RubyGems - openclacky - Versions diffs - 1.2.13 → 1.2.15 - Mend

openclacky 1.2.13 → 1.2.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

checksums.yaml +4 -4
data/.clacky/skills/gem-release/SKILL.md +4 -0
data/CHANGELOG.md +28 -0
data/lib/clacky/agent/session_serializer.rb +1 -0
data/lib/clacky/agent.rb +123 -14
data/lib/clacky/agent_config.rb +136 -10
data/lib/clacky/client.rb +59 -46
data/lib/clacky/default_parsers/pdf_parser.rb +70 -86
data/lib/clacky/default_parsers/pdf_parser_vlm.py +136 -0
data/lib/clacky/providers.rb +37 -0
data/lib/clacky/proxy_config.rb +65 -0
data/lib/clacky/server/http_server.rb +202 -5
data/lib/clacky/server/scheduler.rb +13 -10
data/lib/clacky/ui2/progress_handle.rb +17 -13
data/lib/clacky/version.rb +1 -1
data/lib/clacky/vision/resolver.rb +157 -0
data/lib/clacky/web/app.css +56 -6
data/lib/clacky/web/i18n.js +24 -2
data/lib/clacky/web/index.html +21 -0
data/lib/clacky/web/notify.js +154 -0
data/lib/clacky/web/notify.mp3 +0 -0
data/lib/clacky/web/settings.js +88 -12
data/lib/clacky/web/ws-dispatcher.js +8 -0
data/lib/clacky.rb +4 -0
metadata +7 -2

data/lib/clacky/default_parsers/pdf_parser.rb CHANGED Viewed

@@ -8,113 +8,97 @@
 #
 # Output:
 #   stdout — extracted text content (UTF-8)
-#   stderr — error messages
+#   stderr — error / progress messages
 #   exit 0 — success
-#   exit 1 — failure
+#   exit 1 — hard failure (file unreadable, pdftotext missing, etc.)
 #
-# This file lives in ~/.clacky/parsers/ and can be modified by the LLM.
+# Strategy
+# --------
+# PDF pages naturally fall into two kinds: pages with a real text layer,
+# and scanned-image pages. The right tool is a per-page property, not a
+# document-level one. So:
 #
-# Extraction pipeline (first successful step wins):
-#   1. pdftotext (poppler)     — fastest, text-based PDFs
-#   2. pdfplumber (Python)     — handles more layouts
-#                                (→ pdf_parser_plumber.py)
-#   3. OCR (tesseract)         — scanned / image-only PDFs
-#                                (→ pdf_parser_ocr.py)
+#   1. Run pdftotext once over the whole file (`-layout`), split by `\f`.
+#   2. Pages with enough bytes → emit text directly.
+#   3. Pages below threshold → list page numbers in a Notice section
+#      with a shell command template the agent can run on demand to
+#      render a specific page to PNG, then file_reader that PNG.
 #
-# Each extractor is a plain, self-contained function. Python-backed steps
-# shell out to a sibling .py script so the LLM can edit them directly
-# (with proper syntax highlighting, linters, and per-file run/debug)
-# instead of wrestling with embedded heredocs.
+# The parser does NOT pre-render images. Most weak pages will never be
+# read (the answer is often already in the text-layer pages). Rendering
+# all of them up front is wasteful — 55 pages takes ~14s and most goes
+# to waste. The agent decides when (and which page) to OCR based on the
+# user's actual question.
 #
-# VERSION: 3
+# VERSION: 6
 require "open3"
-# Minimum useful output (in bytes). Below this, a step is considered a
-# miss and the next fallback is tried.
-MIN_CONTENT_BYTES = 20
+MIN_PAGE_BYTES = 20
-# Script directory — resolve sibling .py helpers relative to this file
-# so it works both from the gem's default_parsers/ dir and from the
-# copied-to-user ~/.clacky/parsers/ dir.
-SCRIPT_DIR = File.dirname(File.expand_path(__FILE__))
-def try_pdftotext(path)
-  stdout, _stderr, status = Open3.capture3("pdftotext", "-layout", "-enc", "UTF-8", path, "-")
-  return nil unless status.success?
-  text = stdout.strip
-  return nil if text.bytesize < MIN_CONTENT_BYTES
-  text
-rescue Errno::ENOENT
-  nil # pdftotext not installed
-end
-def try_pdfplumber(path)
-  script = File.join(SCRIPT_DIR, "pdf_parser_plumber.py")
-  return nil unless File.exist?(script)
-  stdout, _stderr, status = Open3.capture3("python3", script, path)
-  return nil unless status.success?
-  text = stdout.strip
-  return nil if text.bytesize < MIN_CONTENT_BYTES
-  text
-rescue Errno::ENOENT
-  nil # python3 not available
+# Report a hard failure: print msg via Kernel#warn, then terminate the
+# process with exit status 1.
+#
+# @param msg [String] message to print before exiting
+def die(msg)
+  warn msg
+  exit 1
 end
-# OCR fallback for scanned/image-only PDFs.
-# See pdf_parser_ocr.py for the actual extraction logic.
-#
-# Installation hints (also printed on final failure):
-#   macOS:   brew install tesseract tesseract-lang poppler
-#            pip3 install pytesseract pdf2image
-#   Linux:   apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils
-#            pip3 install pytesseract pdf2image
-def try_ocr(path)
-  # Quick capability check — avoid spawning python if tesseract is missing.
-  _stdout, _stderr, status = Open3.capture3("tesseract", "--version")
-  return nil unless status.success?
-  script = File.join(SCRIPT_DIR, "pdf_parser_ocr.py")
-  return nil unless File.exist?(script)
-  stdout, stderr, status = Open3.capture3("python3", script, path)
+# Run pdftotext (`-layout`, UTF-8) once over the whole file and return the
+# text of each page.
+#
+# @param path [String] path to the PDF, handed to pdftotext unchanged
+# @return [Array<String>, nil] one stripped string per page, or nil when
+#   pdftotext exits non-zero or cannot be found (a message is written to
+#   stderr in both cases)
+def pdftotext_pages(path)
+  stdout, stderr, status = Open3.capture3(
+    "pdftotext", "-layout", "-enc", "UTF-8", path, "-"
+  )
   unless status.success?
-    warn stderr.strip unless stderr.strip.empty?
+    warn "pdftotext failed: #{stderr.strip}"
     return nil
   end
-  text = stdout.strip
-  return nil if text.bytesize < MIN_CONTENT_BYTES
-  text
+  # A limit of -1 keeps empty fields, so pages without text still occupy a
+  # slot and array index + 1 stays equal to the page number.
+  pages = stdout.split("\f", -1)
+  # Drop a whitespace-only final element — presumably the remainder after
+  # the form feed that ends the last page (TODO confirm against poppler).
+  pages.pop if pages.last && pages.last.strip.empty?
+  pages.map(&:strip)
 rescue Errno::ENOENT
-  nil # tesseract or python3 not available
+  warn "pdftotext not found. Install poppler (`brew install poppler` / `apt install poppler-utils`)."
+  nil
 end
-# --- main ---
+def main(argv)
+  die "Usage: pdf_parser.rb <file_path>" if argv.empty?
+  path = argv[0]
+  die "File not found: #{path}" unless File.file?(path)
-path = ARGV[0]
+  pages = pdftotext_pages(path)
+  die "Could not extract text from PDF." if pages.nil?
-if path.nil? || path.empty?
-  warn "Usage: ruby pdf_parser.rb <file_path>"
-  exit 1
-end
+  weak = []
+  body_chunks = []
+  pages.each_with_index do |text, idx|
+    n = idx + 1
+    if text.bytesize >= MIN_PAGE_BYTES
+      body_chunks << "--- Page #{n} ---\n\n#{text}"
+    else
+      body_chunks << "--- Page #{n} ---\n\n[no extractable text layer]"
+      weak << n
+    end
+  end
-unless File.exist?(path)
-  warn "File not found: #{path}"
-  exit 1
-end
+  output = body_chunks.join("\n\n")
-# Try each extractor in order; first non-nil result wins.
-text = try_pdftotext(path) || try_pdfplumber(path) || try_ocr(path)
+  if weak.any?
+    abs_path = File.expand_path(path)
+    notice = +"\n\n--- Notice ---\n\n"
+    notice << "#{weak.size} of #{pages.size} pages have no extractable text layer "
+    notice << "(likely scanned images).\n"
+    notice << "Pages without text: #{weak.join(', ')}\n\n"
+    notice << "To OCR a specific page, render it to PNG via shell, then "
+    notice << "file_reader the PNG (it will be transcribed via the "
+    notice << "vision/OCR pipeline):\n\n"
+    notice << "  pdftoppm -r 150 -f <N> -l <N> -png -singlefile "
+    notice << "#{abs_path.inspect} /tmp/clacky-pdf-page-<N>\n"
+    notice << "  # produces /tmp/clacky-pdf-page-<N>.png\n\n"
+    notice << "Only render pages you actually need. If the user's question "
+    notice << "is already answered by the extracted text above, skip OCR.\n"
+    output << notice
+  end
-if text
-  print text
+  $stdout.write(output)
+  $stdout.write("\n") unless output.end_with?("\n")
   exit 0
-else
-  warn "Could not extract text from PDF."
-  warn "For text-based PDFs, install poppler: brew install poppler (macOS) / apt install poppler-utils (Linux)"
-  warn "For scanned PDFs (OCR):"
-  warn "  macOS: brew install tesseract tesseract-lang poppler && pip3 install pytesseract pdf2image"
-  warn "  Linux: apt install tesseract-ocr tesseract-ocr-chi-sim poppler-utils && pip3 install pytesseract pdf2image"
-  exit 1
 end
+main(ARGV) if __FILE__ == $PROGRAM_NAME

data/lib/clacky/default_parsers/pdf_parser_vlm.py ADDED Viewed

@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Clacky PDF Parser — VLM (Vision Language Model) extractor
+Renders each PDF page to PNG via pdftoppm (poppler), then asks the
+configured OCR sidecar (e.g. gemini-3-5-flash, gpt-4o-mini) to transcribe
+each page through the local Clacky server's internal OCR endpoint.
+Why through HTTP and not direct API call?
+  The OCR sidecar config (model, base_url, api_key) lives in the agent's
+  ~/.clacky/config.yml. We don't re-implement that lookup here — instead
+  the local Clacky server exposes /api/internal/ocr-image which already
+  has the agent_config in scope. This parser stays a thin client.
+Usage:
+    python3 pdf_parser_vlm.py <file_path>
+Stdout: extracted text (UTF-8), pages separated by `\\n\\n--- Page N ---\\n\\n`
+Stderr: progress + error messages
+Exit:   0 on success, 1 on failure (server unavailable, no sidecar, etc.)
+Environment:
+    CLACKY_SERVER_HOST  default 127.0.0.1
+    CLACKY_SERVER_PORT  default 7070
+"""
+import json
+import os
+import subprocess
+import sys
+import tempfile
+import urllib.error
+import urllib.request
+PAGE_SEPARATOR = "\n\n--- Page {n} ---\n\n"
+RENDER_DPI = 150
+REQUEST_TIMEOUT = 120  # seconds; VLMs can be slow
+def server_url():
+    host = os.environ.get("CLACKY_SERVER_HOST", "127.0.0.1")
+    port = os.environ.get("CLACKY_SERVER_PORT", "7070")
+    return f"http://{host}:{port}/api/internal/ocr-image"
+def render_pages(pdf_path, out_dir):
+    prefix = os.path.join(out_dir, "page")
+    cmd = ["pdftoppm", "-r", str(RENDER_DPI), "-png", pdf_path, prefix]
+    proc = subprocess.run(cmd, capture_output=True, text=True)
+    if proc.returncode != 0:
+        sys.stderr.write(f"pdftoppm failed: {proc.stderr.strip()}\n")
+        return []
+    pages = sorted(
+        os.path.join(out_dir, f) for f in os.listdir(out_dir)
+        if f.startswith("page-") and f.endswith(".png")
+    )
+    return pages
+def transcribe_page(image_path, page_num):
+    with open(image_path, "rb") as f:
+        body = f.read()
+    boundary = "----clacky-vlm-boundary"
+    parts = []
+    parts.append(f"--{boundary}\r\n".encode())
+    parts.append(
+        b'Content-Disposition: form-data; name="image"; filename="page.png"\r\n'
+        b"Content-Type: image/png\r\n\r\n"
+    )
+    parts.append(body)
+    parts.append(f"\r\n--{boundary}\r\n".encode())
+    parts.append(
+        b'Content-Disposition: form-data; name="prompt"\r\n\r\n'
+    )
+    parts.append(
+        f"This is page {page_num} of a scanned PDF. Extract every legible text "
+        "verbatim, preserving reading order. Render tables as Markdown tables. "
+        "Skip decorative elements. Output plain Markdown only — no commentary."
+        .encode()
+    )
+    parts.append(f"\r\n--{boundary}--\r\n".encode())
+    payload = b"".join(parts)
+    req = urllib.request.Request(
+        server_url(),
+        data=payload,
+        headers={"Content-Type": f"multipart/form-data; boundary={boundary}"},
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=REQUEST_TIMEOUT) as resp:
+            data = json.loads(resp.read().decode("utf-8"))
+    except urllib.error.URLError as e:
+        sys.stderr.write(f"page {page_num}: server unreachable ({e})\n")
+        return None
+    except Exception as e:
+        sys.stderr.write(f"page {page_num}: {e}\n")
+        return None
+    if not data.get("ok"):
+        sys.stderr.write(f"page {page_num}: {data.get('message', 'unknown error')}\n")
+        return None
+    return data.get("text", "")
+def main():
+    if len(sys.argv) != 2:
+        sys.stderr.write("Usage: pdf_parser_vlm.py <file_path>\n")
+        sys.exit(1)
+    path = sys.argv[1]
+    if not os.path.exists(path):
+        sys.stderr.write(f"File not found: {path}\n")
+        sys.exit(1)
+    with tempfile.TemporaryDirectory(prefix="clacky_vlm_") as tmp:
+        pages = render_pages(path, tmp)
+        if not pages:
+            sys.stderr.write("Failed to render PDF pages (is poppler installed?)\n")
+            sys.exit(1)
+        sys.stderr.write(f"VLM OCR: {len(pages)} page(s) to transcribe...\n")
+        chunks = []
+        for i, page in enumerate(pages, 1):
+            text = transcribe_page(page, i)
+            if text is None:
+                # Server unreachable / no sidecar — bail so caller falls back.
+                sys.exit(1)
+            chunks.append(PAGE_SEPARATOR.format(n=i) + text)
+        sys.stdout.write("".join(chunks).strip())
+if __name__ == "__main__":
+    main()

data/lib/clacky/providers.rb CHANGED Viewed

@@ -59,6 +59,10 @@ module Clacky
           "or-gpt-image-2"             => "GPT Image 2"
         },
         "default_image_model" => "or-gpt-image-2",
+        # Default OCR sidecar — used when the primary model is text-only.
+        # Candidates are derived from the provider's vision-capable models;
+        # this just picks the cheap+fast default to surface in "auto" mode.
+        "default_ocr_model" => "or-gemini-3-5-flash",
         # Provider-level default: the Claude family served here is vision-capable.
         "capabilities" => { "vision" => true }.freeze,
         # Model-level overrides: DeepSeek models routed through this provider
@@ -145,6 +149,7 @@ module Clacky
         # until we ship a dedicated client-side adapter for that protocol.
         "image_models" => [],
         "default_image_model" => nil,
+        "default_ocr_model" => "google/gemini-2.5-flash",
         "website_url" => "https://openrouter.ai/keys"
       }.freeze,
@@ -192,6 +197,7 @@ module Clacky
         "model_capabilities" => {
           "MiniMax-M3" => { "vision" => true }.freeze
         }.freeze,
+        "default_ocr_model" => "MiniMax-M3",
         "website_url" => "https://www.minimaxi.com/user-center/basic-information/interface-key"
       }.freeze,
@@ -218,6 +224,7 @@ module Clacky
         ].freeze,
         # k2.5 / k2.6 are multimodal; legacy k2 text-only models need model_capabilities override if added.
         "capabilities" => { "vision" => true }.freeze,
+        "default_ocr_model" => "kimi-k2.5",
         "website_url" => "https://platform.moonshot.cn/console/api-keys"
       }.freeze,
@@ -265,6 +272,7 @@ module Clacky
         "api" => "anthropic-messages",
         "default_model" => "claude-sonnet-4-6",
         "models" => ["claude-opus-4-8", "claude-opus-4-7", "claude-opus-4-6", "claude-sonnet-4-6", "claude-haiku-4-5"],
+        "default_ocr_model" => "claude-haiku-4-5",
         "website_url" => "https://console.anthropic.com/settings/keys"
       }.freeze,
@@ -279,6 +287,7 @@ module Clacky
         "model_capabilities" => {
           "mimo-v2-omni" => { "vision" => true }.freeze
         }.freeze,
+        "default_ocr_model" => "mimo-v2-omni",
         "website_url" => "https://platform.xiaomimimo.com/"
       }.freeze,
@@ -308,6 +317,7 @@ module Clacky
         "model_capabilities" => {
           "glm-5v-turbo" => { "vision" => true }.freeze
         }.freeze,
+        "default_ocr_model" => "glm-5v-turbo",
         "website_url" => "https://open.bigmodel.cn/usercenter/apikeys"
       }.freeze,
@@ -338,6 +348,7 @@ module Clacky
           "gpt-image-2"
         ],
         "default_image_model" => "gpt-image-2",
+        "default_ocr_model" => "gpt-5.4-mini",
         "website_url" => "https://platform.openai.com/api-keys"
       }.freeze,
@@ -363,6 +374,7 @@ module Clacky
         "model_capabilities" => {
           "qwen3.7-max" => { "vision" => false }.freeze
         }.freeze,
+        "default_ocr_model" => "qwen3.6-flash",
         "lite_models" => {
           "qwen3.7-max"      => "qwen3.6-flash",
           "qwen3.6-plus"     => "qwen3.6-flash",
@@ -529,6 +541,31 @@ module Clacky
         preset&.dig("audio_models") || []
       end
+      # OCR sidecar candidates: every chat model under this provider that's
+      # vision-capable. Derived from `vision` capability so we don't have
+      # to maintain a parallel list — a model that can see is by definition
+      # a candidate for "describe an image as text". Image-generation models
+      # are excluded (they take prompts and return pixels, not the other way).
+      # @param provider_id [String]
+      # @return [Array<String>]
+      def ocr_models(provider_id)
+        preset = PRESETS[provider_id]
+        return [] unless preset
+        (preset["models"] || []).select { |m| supports?(provider_id, :vision, model_name: m) }
+      end
+      # Default OCR sidecar model for a provider. Falls back to the first
+      # vision-capable model if the preset doesn't pin an explicit default.
+      # @param provider_id [String]
+      # @return [String, nil] nil when the provider has zero vision-capable models
+      def default_ocr_model(provider_id)
+        preset = PRESETS[provider_id]
+        return nil unless preset
+        explicit = preset["default_ocr_model"]
+        return explicit if explicit && ocr_models(provider_id).include?(explicit)
+        ocr_models(provider_id).first
+      end
       # Unified entry for media model lookup by kind.
       # @param provider_id [String]
       # @param kind [String] one of "image" / "video" / "audio"

data/lib/clacky/proxy_config.rb ADDED Viewed

@@ -0,0 +1,65 @@
+# frozen_string_literal: true
+module Clacky
+  # Centralized HTTP proxy policy for the current process.
+  #
+  # Single source of truth: AgentConfig#proxy_url. We never honour the user's
+  # shell ENV (HTTP_PROXY etc.) — it's stripped on every install! so a stale
+  # proxy in the launching shell can't poison Clacky.
+  #
+  # epoch increments on every actual change so that long-lived consumers
+  # (e.g. Faraday connections cached on Client instances) can detect when
+  # their cached state is stale and rebuild.
+  module ProxyConfig
+    PROXY_ENV_KEYS = %w[
+      http_proxy HTTP_PROXY
+      https_proxy HTTPS_PROXY
+      all_proxy ALL_PROXY
+    ].freeze
+    @installed_signature = nil
+    @epoch = 0
+    class << self
+      attr_reader :epoch
+      def install!
+        url = load_proxy_url
+        sig = url
+        return if sig == @installed_signature
+        strip_env_proxy
+        assign_env_proxy(url) if url && !url.empty?
+        ensure_faraday_reads_env
+        @installed_signature = sig
+        @epoch += 1
+      end
+      def reset_cache!
+        @installed_signature = nil
+        install!
+      end
+      private def assign_env_proxy(url)
+        %w[http_proxy HTTP_PROXY https_proxy HTTPS_PROXY].each { |k| ENV[k] = url }
+      end
+      private def strip_env_proxy
+        PROXY_ENV_KEYS.each { |k| ENV.delete(k) }
+      end
+      private def ensure_faraday_reads_env
+        return unless defined?(Faraday)
+        Faraday.ignore_env_proxy = false if Faraday.respond_to?(:ignore_env_proxy=)
+      end
+      private def load_proxy_url
+        cfg = Clacky::AgentConfig.load
+        cfg.respond_to?(:proxy_url) ? cfg.proxy_url.to_s.strip : ""
+      rescue StandardError
+        ""
+      end
+    end
+  end
+end