RubyGems - openclacky - Version diff - 1.0.0 → 1.0.2 - Mend

openclacky 1.0.0 → 1.0.2

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (70)

checksums.yaml +4 -4
data/CHANGELOG.md +39 -0
data/README.md +87 -53
data/lib/clacky/agent/cost_tracker.rb +19 -2
data/lib/clacky/agent/llm_caller.rb +218 -0
data/lib/clacky/agent/message_compressor_helper.rb +32 -2
data/lib/clacky/agent.rb +54 -22
data/lib/clacky/client.rb +44 -5
data/lib/clacky/default_parsers/pdf_parser.rb +58 -17
data/lib/clacky/default_parsers/pdf_parser_ocr.py +103 -0
data/lib/clacky/default_parsers/pdf_parser_plumber.py +62 -0
data/lib/clacky/default_skills/deploy/SKILL.md +201 -77
data/lib/clacky/default_skills/new/SKILL.md +3 -114
data/lib/clacky/default_skills/onboard/SKILL.md +349 -133
data/lib/clacky/default_skills/onboard/scripts/import_external_skills.rb +371 -0
data/lib/clacky/default_skills/onboard/scripts/install_builtin_skills.rb +175 -0
data/lib/clacky/default_skills/skill-add/scripts/install_from_zip.rb +59 -26
data/lib/clacky/message_format/anthropic.rb +72 -8
data/lib/clacky/message_format/bedrock.rb +6 -3
data/lib/clacky/providers.rb +146 -3
data/lib/clacky/server/channel/adapters/feishu/adapter.rb +14 -0
data/lib/clacky/server/channel/adapters/feishu/bot.rb +10 -0
data/lib/clacky/server/channel/adapters/feishu/message_parser.rb +1 -0
data/lib/clacky/server/channel/channel_manager.rb +12 -4
data/lib/clacky/server/channel/channel_ui_controller.rb +8 -2
data/lib/clacky/server/http_server.rb +746 -13
data/lib/clacky/server/session_registry.rb +55 -24
data/lib/clacky/skill.rb +10 -9
data/lib/clacky/skill_loader.rb +23 -11
data/lib/clacky/tools/file_reader.rb +232 -127
data/lib/clacky/tools/security.rb +42 -64
data/lib/clacky/tools/terminal/persistent_session.rb +15 -4
data/lib/clacky/tools/terminal/safe_rm.sh +106 -0
data/lib/clacky/tools/terminal/session_manager.rb +8 -3
data/lib/clacky/tools/terminal.rb +263 -16
data/lib/clacky/ui2/layout_manager.rb +8 -1
data/lib/clacky/ui2/output_buffer.rb +83 -23
data/lib/clacky/ui2/ui_controller.rb +74 -7
data/lib/clacky/utils/file_processor.rb +14 -40
data/lib/clacky/utils/model_pricing.rb +215 -0
data/lib/clacky/utils/parser_manager.rb +70 -6
data/lib/clacky/utils/string_matcher.rb +23 -1
data/lib/clacky/version.rb +1 -1
data/lib/clacky/web/app.css +673 -9
data/lib/clacky/web/app.js +40 -1608
data/lib/clacky/web/i18n.js +209 -0
data/lib/clacky/web/index.html +166 -2
data/lib/clacky/web/onboard.js +77 -1
data/lib/clacky/web/profile.js +442 -0
data/lib/clacky/web/sessions.js +1034 -2
data/lib/clacky/web/settings.js +127 -6
data/lib/clacky/web/sidebar.js +39 -0
data/lib/clacky/web/skills.js +460 -0
data/lib/clacky/web/trash.js +343 -0
data/lib/clacky/web/ws-dispatcher.js +255 -0
data/lib/clacky.rb +5 -3
metadata +16 -17
data/lib/clacky/clacky_auth_client.rb +0 -152
data/lib/clacky/clacky_cloud_config.rb +0 -123
data/lib/clacky/cloud_project_client.rb +0 -169
data/lib/clacky/default_skills/deploy/scripts/rails_deploy.rb +0 -1377
data/lib/clacky/default_skills/deploy/tools/check_health.rb +0 -116
data/lib/clacky/default_skills/deploy/tools/create_database_service.rb +0 -341
data/lib/clacky/default_skills/deploy/tools/execute_deployment.rb +0 -99
data/lib/clacky/default_skills/deploy/tools/fetch_runtime_logs.rb +0 -77
data/lib/clacky/default_skills/deploy/tools/list_services.rb +0 -67
data/lib/clacky/default_skills/deploy/tools/report_deploy_status.rb +0 -67
data/lib/clacky/default_skills/deploy/tools/set_deploy_variables.rb +0 -189
data/lib/clacky/default_skills/new/scripts/cloud_project_init.sh +0 -74
data/lib/clacky/deploy_api_client.rb +0 -484

data/lib/clacky/ui2/ui_controller.rb (changed)

@@ -598,7 +598,7 @@ module Clacky
           @progress_stack.push(handle)
           entry_id = append_output(render_for(handle))
-          update_sessionbar(status: 'working') if handle.style == :primary
+          recompute_sessionbar_status
           entry_id
         end
       end
@@ -625,13 +625,12 @@ module Clacky
           if (restored = @progress_stack.last)
             new_id = append_output(render_for(restored))
             restored.__reattach_entry!(new_id)
-          else
-            # No more progress handles — clear the "working" sessionbar.
-            # We only flip to idle if the handle that just finished was
-            # the one that brought us to working (style :primary). A quiet
-            # handle finishing never touches the sessionbar.
-            update_sessionbar(status: 'idle') if handle.style == :primary
           end
+          # Recompute sessionbar status from whatever remains on the stack.
+          # This handles: (a) empty stack → idle, (b) mixed stack (e.g. a
+          # long-running quiet tool still active underneath) → working.
+          recompute_sessionbar_status
         end
       end
@@ -654,6 +653,11 @@ module Clacky
             @renderer.render_working(decorated) :
             @renderer.render_progress(decorated)
           update_entry(handle.entry_id, painted)
+          # Re-evaluate sessionbar: a quiet handle that crosses the fast-finish
+          # threshold should upgrade the status bar to "working" so long-running
+          # tools (terminal running a build, web_fetch) visibly reflect activity.
+          recompute_sessionbar_status
         end
       end
@@ -672,6 +676,41 @@ module Clacky
           @renderer.render_progress(decorated)
       end
+      # Derive the sessionbar workspace status from the live progress stack.
+      #
+      # Rules:
+      #   - Any :primary handle alive  → "working" (fast path for LLM thinking)
+      #   - Any :quiet handle that has been alive longer than
+      #     FAST_FINISH_THRESHOLD_SECONDS → "working" (so long tools like
+      #     `terminal` running a build or test suite correctly flip the bar
+      #     to working instead of staying on "idle" for minutes)
+      #   - Otherwise → "idle"
+      #
+      # Must be called with @progress_mutex held. Emits update_sessionbar
+      # only when the computed status differs from the last one we wrote,
+      # avoiding pointless re-renders on every tick.
+      private def recompute_sessionbar_status
+        new_status = compute_sessionbar_status
+        return if @last_sessionbar_status == new_status
+        @last_sessionbar_status = new_status
+        update_sessionbar(status: new_status)
+      end
+      private def compute_sessionbar_status
+        return 'idle' if @progress_stack.empty?
+        threshold = ProgressHandle::FAST_FINISH_THRESHOLD_SECONDS
+        now       = Time.now
+        @progress_stack.each do |h|
+          return 'working' if h.style == :primary
+          # Quiet handles only "count" once they've been alive long enough
+          # that a user would naturally expect a busy indicator.
+          start = h.start_time
+          return 'working' if start && (now - start) >= threshold
+        end
+        'idle'
+      end
       # ---------------------------------------------------------------------
       # Legacy shim: show_progress(message, phase:, progress_type:, ...)
       #
@@ -771,12 +810,40 @@ module Clacky
       # Set workspace status to idle (called when agent stops working)
       def set_idle_status
+        # Safety net: close any legacy progress slots that were opened via
+        # show_progress(progress_type: X, phase: "active") but never paired
+        # with a corresponding phase: "done" call. Historically the
+        # "retrying" slot in LlmCaller was leaked on every successful
+        # recovery, leaving the user with a stale "Network failed ... (NNN s)"
+        # line ticking forever. LlmCaller now closes its own slot (see the
+        # ensure in call_llm), but we mirror that defense here so any
+        # future code path that forgets to close a slot still gets cleaned
+        # up at the well-defined idle boundary.
+        close_leaked_legacy_progress_handles
         update_sessionbar(status: 'idle')
+        @last_sessionbar_status = 'idle'
         # Clear user tip when agent stops working
         @input_area.clear_user_tip
         @layout.render_input
       end
+      # Finish every ProgressHandle still registered in the legacy
+      # (show_progress) handle map. Called from set_idle_status as a
+      # defense-in-depth against unpaired active/done calls.
+      private def close_leaked_legacy_progress_handles
+        return unless @legacy_progress_handles
+        leaked = @legacy_progress_handles.reject { |_type, h| h.nil? || !h.running? }
+        return if leaked.empty?
+        # Finish top-down so each handle is the one currently rendering
+        # when it closes (matches the invariant in interrupt_all_progress).
+        leaked.values.reverse_each(&:finish)
+        @legacy_progress_handles.clear
+      end
       # Set workspace status to working (called when agent starts working)
       def set_working_status
         update_sessionbar(status: 'working')

data/lib/clacky/utils/file_processor.rb (changed)

@@ -141,26 +141,14 @@ module Clacky
         FileRef.new(name: name, type: :image, original_path: path)
       when ".csv"
-        # CSV is plain text — read directly, no external parser needed.
-        # Try UTF-8 first, then GBK (common in Chinese-origin CSV), then binary with replacement.
-        begin
-          text         = read_text_with_encoding_fallback(path)
-          preview_path = save_preview(text, path)
-          FileRef.new(name: name, type: :csv, original_path: path, preview_path: preview_path)
-        rescue => e
-          FileRef.new(name: name, type: :csv, original_path: path, parse_error: e.message)
-        end
+        # CSV is plain text — the file itself IS the preview. No parser, no copy.
+        # FileReader handles encoding fallback via safe_utf8 when it reads the file.
+        FileRef.new(name: name, type: :csv, original_path: path, preview_path: path)
       when *TEXT_PREVIEW_EXTENSIONS
-        # Markdown / plain text: the file itself IS the preview.
-        # No parser needed — just copy through (with encoding normalisation).
-        begin
-          text         = read_text_with_encoding_fallback(path)
-          preview_path = save_preview(text, path)
-          FileRef.new(name: name, type: :text, original_path: path, preview_path: preview_path)
-        rescue => e
-          FileRef.new(name: name, type: :text, original_path: path, parse_error: e.message)
-        end
+        # Markdown / plain text / log: the file itself IS the preview.
+        # No parser needed, no tmpdir copy — just point preview_path at the original.
+        FileRef.new(name: name, type: :text, original_path: path, preview_path: path)
       else
         result = Utils::ParserManager.parse(path)
@@ -397,7 +385,14 @@ module Clacky
     end
     def self.save_preview(content, original_path)
-      dest = "#{original_path}.preview.md"
+      # Always write previews to a tmpdir-based path to avoid polluting the
+      # user's working directory with .preview.md sidecar files.
+      # Use the same UPLOAD_DIR that uploaded files live in; for on-disk files
+      # outside that dir (e.g. project files opened by file_reader), we still
+      # land in UPLOAD_DIR so the user's tree stays clean.
+      FileUtils.mkdir_p(UPLOAD_DIR)
+      safe_name = File.basename(original_path.to_s).gsub(/[\/\:\*?"<>|\x00]/, "_")
+      dest = File.join(UPLOAD_DIR, "#{SecureRandom.hex(8)}_#{safe_name}.preview.md")
       File.write(dest, content)
       dest
     end
@@ -413,26 +408,6 @@ module Clacky
       base.empty? ? 'upload' : base
     end
-    # Read a text file with automatic encoding detection.
-    # Tries UTF-8, then GBK (common for Chinese-origin CSV/text files), then
-    # falls back to binary read with invalid byte replacement.
-    def self.read_text_with_encoding_fallback(path)
-      # Try UTF-8 first (most common, fastest path)
-      raw = File.binread(path)
-      utf8 = raw.dup.force_encoding("UTF-8")
-      return utf8.encode("UTF-8") if utf8.valid_encoding?
-      # Try GBK (GB2312 superset — common in Chinese Windows/Excel exports)
-      begin
-        return raw.encode("UTF-8", "GBK", invalid: :replace, undef: :replace, replace: "?")
-      rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
-        # fall through
-      end
-      # Last resort: binary read with replacement characters
-      raw.encode("UTF-8", "binary", invalid: :replace, undef: :replace, replace: "?")
-    end
     # Detect the actual image MIME type from raw binary data by inspecting
     # magic bytes, ignoring the file extension. Falls back to extension-based
     # detection when magic bytes don't match any known format.
@@ -549,7 +524,6 @@ module Clacky
     end
     private_class_method :parse_zip_listing, :parse_tar_listing, :save_preview, :sanitize_filename,
-                         :read_text_with_encoding_fallback,
                          :downscale_png_chunky, :downscale_via_cli
   end
   end

data/lib/clacky/utils/model_pricing.rb (changed)

@@ -177,9 +177,186 @@ module Clacky
         }
       },
+      # OpenAI GPT-5.5 / GPT-5.4 — breakpoint at 272K input tokens
+      # Source: https://openai.com/api/pricing/ (USD / 1M tokens)
+      # Note: OpenAI's actual tiered-pricing threshold is 272K, not the
+      # global 200K below.  Prompts between 200K–272K will slightly
+      # over-estimate costs until a per-model threshold is implemented.
+      "gpt-5.5" => {
+        input: {
+          default: 5.00,              # $5/MTok for prompts ≤ 272K tokens
+          over_200k: 10.00            # $10/MTok for prompts > 272K tokens
+        },
+        output: {
+          default: 30.00,             # $30/MTok for prompts ≤ 272K tokens
+          over_200k: 45.00            # $45/MTok for prompts > 272K tokens
+        },
+        cache: {
+          write_default: 5.00,        # $5/MTok cache write (≤ 272K)
+          write_over_200k: 10.00,     # $10/MTok cache write (> 272K)
+          read_default: 0.50,         # $0.50/MTok cache read (≤ 272K)
+          read_over_200k: 1.00        # $1.00/MTok cache read (> 272K)
+        }
+      },
+      "gpt-5.4" => {
+        input: {
+          default: 2.50,              # $2.50/MTok for prompts ≤ 272K tokens
+          over_200k: 5.00             # $5/MTok for prompts > 272K tokens
+        },
+        output: {
+          default: 15.00,             # $15/MTok for prompts ≤ 272K tokens
+          over_200k: 22.50           # $22.50/MTok for prompts > 272K tokens
+        },
+        cache: {
+          write_default: 2.50,        # $2.50/MTok cache write (≤ 272K)
+          write_over_200k: 5.00,      # $5/MTok cache write (> 272K)
+          read_default: 0.25,         # $0.25/MTok cache read (≤ 272K)
+          read_over_200k: 0.50        # $0.50/MTok cache read (> 272K)
+        }
+      },
+      # GPT-5.4 flat-rate models (no breakpoint, single rate regardless of context)
+      "gpt-5.4-mini" => {
+        input: {
+          default: 0.75,              # $0.75/MTok
+          over_200k: 0.75
+        },
+        output: {
+          default: 4.50,              # $4.50/MTok
+          over_200k: 4.50
+        },
+        cache: {
+          write: 0.75,                # $0.75/MTok cache write
+          read: 0.075                 # $0.075/MTok cache read (10% of input)
+        }
+      },
+      "gpt-5.4-nano" => {
+        input: {
+          default: 0.20,              # $0.20/MTok
+          over_200k: 0.20
+        },
+        output: {
+          default: 1.25,              # $1.25/MTok
+          over_200k: 1.25
+        },
+        cache: {
+          write: 0.20,                # $0.20/MTok cache write
+          read: 0.02                  # $0.02/MTok cache read (10% of input)
+        }
+      },
+      # O-series reasoning models — flat-rate (200K context window)
+      # Source: https://openai.com/api/pricing/
+      "o3" => {
+        input: {
+          default: 2.00,              # $2/MTok
+          over_200k: 2.00             # flat rate
+        },
+        output: {
+          default: 8.00,              # $8/MTok
+          over_200k: 8.00
+        },
+        cache: {
+          write: 2.00,                # $2/MTok cache write (same as input)
+          read: 0.50                  # $0.50/MTok cache read (25% of input)
+        }
+      },
+      "o4-mini" => {
+        input: {
+          default: 1.10,              # $1.10/MTok
+          over_200k: 1.10             # flat rate
+        },
+        output: {
+          default: 4.40,              # $4.40/MTok
+          over_200k: 4.40
+        },
+        cache: {
+          write: 1.10,                # $1.10/MTok cache write (same as input)
+          read: 0.275                 # $0.275/MTok cache read (25% of input)
+        }
+      },
+      # GLM (Zhipu / Z.ai) — USD per 1M tokens.
+      # Source: https://docs.z.ai/guides/overview/pricing (Z.ai international).
+      # Pricing policy: we always bill at the Z.ai international flat rate,
+      # regardless of which endpoint (mainland bigmodel.cn vs intl z.ai) the
+      # user configured. Rationale:
+      #   1. Mainland GLM uses tiered pricing (≤32K / >32K / >128K) where the
+      #      >32K tier is hit by the vast majority of real requests, and is
+      #      actually a few RMB cheaper than Z.ai's flat rate — displaying the
+      #      (slightly higher) Z.ai rate gives users a "displayed ≤ actual"
+      #      experience which is psychologically safer than the reverse.
+      #   2. Single flat rate keeps the table shape consistent with every
+      #      other provider here (no special-case tier logic for just GLM).
+      # Cache-write: same convention as DeepSeek/Kimi — OpenAI-compatible
+      # endpoints don't charge separately for cache writes (Z.ai's page lists
+      # "Cached Input Storage: Limited-time Free"), so bill writes at the
+      # regular input miss rate for safe "displayed ≤ actual" behaviour.
+      "glm-5.1" => {
+        input:  { default: 1.40, over_200k: 1.40 },
+        output: { default: 4.40, over_200k: 4.40 },
+        cache:  { write: 1.40, read: 0.26 }
+      },
+      "glm-5" => {
+        input:  { default: 1.00, over_200k: 1.00 },
+        output: { default: 3.20, over_200k: 3.20 },
+        cache:  { write: 1.00, read: 0.20 }
+      },
+      "glm-5-turbo" => {
+        input:  { default: 1.20, over_200k: 1.20 },
+        output: { default: 4.00, over_200k: 4.00 },
+        cache:  { write: 1.20, read: 0.24 }
+      },
+      # GLM-5V-Turbo is the multimodal sibling of GLM-5-Turbo (vision capable,
+      # see providers.rb model_capabilities override). Same input/output rate
+      # as 5-Turbo per Z.ai's Vision Models table.
+      "glm-5v-turbo" => {
+        input:  { default: 1.20, over_200k: 1.20 },
+        output: { default: 4.00, over_200k: 4.00 },
+        cache:  { write: 1.20, read: 0.24 }
+      },
+      "glm-4.7" => {
+        input:  { default: 0.60, over_200k: 0.60 },
+        output: { default: 2.20, over_200k: 2.20 },
+        cache:  { write: 0.60, read: 0.11 }
+      },
+      # MiniMax — USD per 1M tokens.
+      # Source: https://platform.minimaxi.com (Pay-as-You-Go).
+      # MiniMax pricing is identical across mainland (.com) and international
+      # (.io) endpoints, verified by the team. Same cache-write convention as
+      # DeepSeek/Kimi/GLM: bill writes at the input miss rate (OpenAI-compatible
+      # usage responses from MiniMax don't reliably carry a separate
+      # cache_creation_input_tokens field, so a distinct write rate would be
+      # dead code in practice).
+      # Note: providers.rb uses the capitalised "MiniMax-M2.x" model id, but
+      # the pricing table keys are lowercased to stay consistent with the
+      # rest of this file; normalize_model_name() lowercases incoming model
+      # names before lookup.
+      "minimax-m2.5" => {
+        input:  { default: 0.30, over_200k: 0.30 },
+        output: { default: 1.20, over_200k: 1.20 },
+        cache:  { write: 0.30, read: 0.03 }
+      },
+      "minimax-m2.7" => {
+        input:  { default: 0.30, over_200k: 0.30 },
+        output: { default: 1.20, over_200k: 1.20 },
+        cache:  { write: 0.30, read: 0.06 }
+      },
     }.freeze
     # Threshold for tiered pricing (200K tokens)
+    # NOTE: OpenAI GPT-5.5/GPT-5.4 use a 272K breakpoint, not 200K.
+    # Costs for prompts between 200K–272K will be slightly over-estimated.
     TIERED_PRICING_THRESHOLD = 200_000
     class << self
@@ -314,6 +491,44 @@ module Clacky
           "kimi-k2.5"
         when /^kimi-k2\.?6$/i
           "kimi-k2.6"
+        # GLM (Zhipu / Z.ai) — the five models registered in providers.rb.
+        # GLM-5V-Turbo is the vision variant; all five share the same Z.ai
+        # international flat-rate pricing regardless of which endpoint
+        # (mainland bigmodel.cn vs intl z.ai) the user configured.
+        # Strict anchored match so unrelated strings like "glm-5-x-foo"
+        # don't silently borrow a nearby model's rate.
+        when /^glm-5\.1$/i
+          "glm-5.1"
+        when /^glm-5v-turbo$/i
+          "glm-5v-turbo"
+        when /^glm-5-turbo$/i
+          "glm-5-turbo"
+        when /^glm-5$/i
+          "glm-5"
+        when /^glm-4\.7$/i
+          "glm-4.7"
+        # MiniMax — model ids in providers.rb use capitalised "MiniMax-M2.x"
+        # but we match case-insensitively and map to the lowercased table key.
+        when /^minimax-m2\.5$/i
+          "minimax-m2.5"
+        when /^minimax-m2\.7$/i
+          "minimax-m2.7"
+        # OpenAI GPT-5.x models — match various dashed/dotted/compact forms
+        # (e.g. "gpt-5.5", "gpt-5-5", "gpt5.5", "gpt55")
+        when /^gpt-?5\.?5$/i, /^gpt-?5[\.-]?5$/i
+          "gpt-5.5"
+        when /^gpt-?5\.?4[^.]*mini$/i, /^gpt-?5\.?4[\.-]?mini$/i
+          "gpt-5.4-mini"
+        when /^gpt-?5\.?4[^.]*nano$/i, /^gpt-?5\.?4[\.-]?nano$/i
+          "gpt-5.4-nano"
+        when /^gpt-?5\.?4$/i, /^gpt-?5[\.-]?4$/i
+          "gpt-5.4"
+        # O-series reasoning models
+        when /^o4[\.-]?mini$/i
+          "o4-mini"
+        when /^o3$/i
+          "o3"
         else
           nil  # No pricing available for this model — cost will show as N/A
         end

data/lib/clacky/utils/parser_manager.rb (changed)

@@ -33,19 +33,83 @@ module Clacky
       }.freeze
       # Ensure ~/.clacky/parsers/ exists and all default parsers are present.
-      # Called once at startup.
+      # Called at Agent startup (idempotent — safe to run every time).
+      #
+      # Copies every file from default_parsers/ (not just the entry-point .rb
+      # scripts listed in PARSER_FOR). A parser may ship companion helper
+      # scripts — e.g. pdf_parser_ocr.py sits next to pdf_parser.rb and is
+      # invoked by relative path — so those helpers must be distributed too.
+      #
+      # Version upgrade policy:
+      #   Each bundled parser declares `VERSION: <n>` in a header comment
+      #   (works for Ruby `# VERSION: 2` and Python `# VERSION: 2` alike,
+      #   scanned in the first 40 lines of the file).
+      #
+      #   On startup, per-file:
+      #     - If the file does NOT exist in ~/.clacky/parsers/ → copy it.
+      #     - If it exists:
+      #         * bundled has no VERSION → never touch (bundled file
+      #           is opting out of managed upgrades).
+      #         * installed has no VERSION → treat it as legacy v0 and
+      #           upgrade (lenient mode — covers users who installed before
+      #           the VERSION scheme existed). The old file is backed up.
+      #         * both have VERSION, bundled > installed → upgrade, backing
+      #           up the old copy as `<script>.v<old>.bak`.
+      #         * bundled ≤ installed → leave the user's copy alone
+      #           (preserves LLM/user modifications).
+      #
+      #   Backups live alongside the parser so the user can inspect
+      #   their own edits after an upgrade. They are never removed
+      #   automatically.
       def self.setup!
         FileUtils.mkdir_p(PARSERS_DIR)
-        PARSER_FOR.values.uniq.each do |script|
-          dest = File.join(PARSERS_DIR, script)
-          next if File.exist?(dest)
+        Dir.glob(File.join(DEFAULT_PARSERS_DIR, "**", "*")).each do |src|
+          next unless File.file?(src)
+          basename = File.basename(src)
+          next if basename.start_with?(".") || basename.end_with?(".bak")
+          rel  = src.sub(/^#{Regexp.escape(DEFAULT_PARSERS_DIR)}\/?/, "")
+          dest = File.join(PARSERS_DIR, rel)
+          if !File.exist?(dest)
+            FileUtils.mkdir_p(File.dirname(dest))
+            FileUtils.cp(src, dest)
+            # Preserve executable bit so sibling scripts can be run directly.
+            FileUtils.chmod(File.stat(src).mode, dest)
+            next
+          end
+          bundled_version = extract_version(src)
+          # Bundled file opts out of managed upgrades — never touch user copy.
+          next unless bundled_version
-          src = File.join(DEFAULT_PARSERS_DIR, script)
-          if File.exist?(src)
+          installed_version = extract_version(dest) || 0
+          if bundled_version > installed_version
+            backup = "#{dest}.v#{installed_version}.bak"
+            FileUtils.cp(dest, backup) unless File.exist?(backup)
             FileUtils.cp(src, dest)
+            FileUtils.chmod(File.stat(src).mode, dest)
+          end
+        end
+      end
+      # Read the VERSION marker from a parser script (e.g. "# VERSION: 2").
+      # Works for any script language that uses `#` for comments
+      # (Ruby, Python, shell). Returns Integer or nil.
+      def self.extract_version(path)
+        return nil unless File.exist?(path)
+        # Only scan the first 40 lines — the marker lives in the header.
+        File.foreach(path).with_index do |line, i|
+          break if i >= 40
+          if (m = line.match(/^\s*#\s*VERSION:\s*(\d+)/i))
+            return m[1].to_i
           end
         end
+        nil
+      rescue StandardError
+        nil
       end
       # Run the appropriate parser for the given file path.

data/lib/clacky/utils/string_matcher.rb (changed)

@@ -20,6 +20,14 @@ module Clacky
       # @return [Hash, nil] { matched_string: String, occurrences: Integer }
       #   or nil when nothing matches
       def self.find_match(content, old_string)
+        # Defensive: if either side contains invalid UTF-8 bytes (binary files,
+        # mixed-encoding content, etc.), Regexp#scan / String#include? with a
+        # UTF-8-tagged candidate can raise `ArgumentError: invalid byte sequence
+        # in UTF-8`. Scrub once at the entry point so every matching layer —
+        # including callers like the edit preview — is safe.
+        content    = Clacky::Utils::Encoding.to_utf8(content)    unless content.nil?
+        old_string = Clacky::Utils::Encoding.to_utf8(old_string) unless old_string.nil?
         candidates = generate_candidates(old_string)
         # Simple string matching for each candidate
@@ -29,7 +37,7 @@ module Clacky
           if content.include?(candidate)
             return {
               matched_string: candidate,
-              occurrences: content.scan(Regexp.quote(candidate)).length
+              occurrences: count_occurrences(content, candidate)
             }
           end
         end
@@ -38,6 +46,20 @@ module Clacky
         try_smart_match(content, old_string)
       end
+      # Count non-overlapping occurrences of `needle` in `haystack` without
+      # going through Regexp (safer on mixed-encoding strings and avoids an
+      # extra escape step).
+      def self.count_occurrences(haystack, needle)
+        return 0 if needle.empty?
+        count = 0
+        offset = 0
+        while (idx = haystack.index(needle, offset))
+          count += 1
+          offset = idx + needle.length
+        end
+        count
+      end
       # Generate candidate strings by applying different transformations.
       #
       # @param old_string [String]

data/lib/clacky/version.rb (changed)

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Clacky
-  VERSION = "1.0.0"
+  VERSION = "1.0.2"
 end