RubyGems - pikuri-workspace - Versions diffs - 0.0.4 → 0.0.6 - Mend

pikuri-workspace 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

checksums.yaml +4 -4
data/lib/pikuri/workspace/filesystem.rb +17 -14
data/lib/pikuri/workspace/read.rb +87 -163
data/lib/pikuri-workspace.rb +3 -2
metadata +4 -4

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4be40e803f487eb130b8ee758b86efb7a744c92691a1dd103aae68d91e59a2f9
-  data.tar.gz: 981478770e9e7f8acef1fb1ca2b1896cf6a25c7eb73c01c1d57592f81583cf0f
+  metadata.gz: 52db0d4ce078507aa4a72cbc84d54a9433788f2a2d9a90ab883a9e469e6dea99
+  data.tar.gz: 107b34e1c3b387b4b68d542ff8ab2345885383a461c227a6b7aeb29973a4292e
 SHA512:
-  metadata.gz: a211df218e4270acbe737f624cc8a77cc145d8a051c3500131036692168996c800a0110a27bd714a7cad3fb5cb9dd3e25fdfdeefb917fb73ccead021cd807c94
-  data.tar.gz: fe2ff7510012d619caa166252ccef80148750e35fa58a68cd1722cec372758dc17e5605cbe380c4f106538b301134402da32e0fb13c6f23d6acec7c4aceb301e
+  metadata.gz: 432af6cfc0a0f3555666e9c88accb0e9b6162af2c5f041c9ff71b10443f1681b8e70b9e46aa6c75ed12344357a286df087869b889f0a40aeb9635aa6c9a1e651
+  data.tar.gz: 362eb9437127e8734f15e09919ae7fb928e0d1b71fc9bb90a78f419cb7b4f52f29aebd323dd32e7bd491b0a73a3de60cff54a91e1756f24bcdc4e91d52de5fc2

data/lib/pikuri/workspace/filesystem.rb CHANGED Viewed

@@ -37,9 +37,9 @@ module Pikuri
     # +~/.cache/pikuri/workspace-XXX/+ ({#internal_temp}). It is minted
     # lazily on first access — workspaces that never touch it (most
     # specs, hosts that don't want a playground and don't use the
-    # bubblewrap overlay) pay nothing — and removed by a single
-    # +at_exit+ handler when the process exits. Everything ephemeral
-    # this workspace produces lives inside the umbrella, so one
+    # bubblewrap overlay) pay nothing — and removed at process exit via
+    # a {Pikuri::Finalizers} registration. Everything ephemeral this
+    # workspace produces lives inside the umbrella, so one
     # +remove_entry+ at process exit cleans the lot:
     #
     # * {#temp} — the LLM-visible playground subdir, present only when
@@ -60,9 +60,9 @@ module Pikuri
     #
     # At gem load, {.sweep_stale_internal_temps!} prunes umbrella dirs
     # older than seven days — a safety net for sessions that died
-    # before +at_exit+ could run (SIGKILL, OOM). Recent umbrellas are
-    # left alone so a concurrent pikuri-code in another shell isn't
-    # disturbed.
+    # before the {Pikuri::Finalizers} sweep could run (SIGKILL, OOM).
+    # Recent umbrellas are left alone so a concurrent pikuri-code in
+    # another shell isn't disturbed.
     #
     # == Optional temp playground
     #
@@ -279,8 +279,8 @@ module Pikuri
       end
       # Per-workspace ephemeral umbrella. Minted lazily on first call
-      # under {CACHE_BASE}. Registered for +at_exit+ removal the
-      # moment it's minted, so anything subsequently placed inside
+      # under {CACHE_BASE}. Registered with {Pikuri::Finalizers} for
+      # removal the moment it's minted, so anything subsequently placed inside
       # (the playground, {Pikuri::Code::Bash::Sandbox::Bubblewrap}'s
       # overlay state) gets wiped together. Callers that want
       # ephemeral state owned by the workspace should put it under
@@ -291,13 +291,16 @@ module Pikuri
         @internal_temp ||= Filesystem.mint_internal_temp
       end
-      # @api private — minting helper shared with {AllowAll}. The
-      # +FileUtils.remove_entry+ +at_exit+ guards against the dir
-      # being already gone (test cleanup, manual rm).
+      # @api private — minting helper shared with {AllowAll}.
+      # Registers umbrella removal with {Pikuri::Finalizers} (block
+      # form — the umbrella is a dir to wipe, not a closeable object)
+      # rather than its own +at_exit+, so process teardown stays on the
+      # one registry. The +path.exist?+ guard makes the removal a no-op
+      # when the dir is already gone (test cleanup, manual rm).
       def self.mint_internal_temp
         FileUtils.mkdir_p(CACHE_BASE)
         path = Pathname.new(Dir.mktmpdir('workspace-', CACHE_BASE)).realpath
-        at_exit { FileUtils.remove_entry(path.to_s) if path.exist? }
+        Pikuri::Finalizers.register { FileUtils.remove_entry(path.to_s) if path.exist? }
         path
       end
@@ -305,8 +308,8 @@ module Pikuri
       # {INTERNAL_TEMP_STALE_SECONDS}. Called once at gem load via
       # {Pikuri::Workspace} so each process boot inherits a tidy
       # {CACHE_BASE}. Failures (permission denied, racing concurrent
-      # sweeper) are swallowed — best-effort cleanup, the real
-      # +at_exit+ path is the load-bearing one.
+      # sweeper) are swallowed — best-effort cleanup; the
+      # {Pikuri::Finalizers} removal is the load-bearing path.
       #
       # @return [void]
       def self.sweep_stale_internal_temps!

data/lib/pikuri/workspace/read.rb CHANGED Viewed

@@ -1,6 +1,5 @@
 # frozen_string_literal: true
-require 'pdf-reader'
 require 'ruby_llm'
 module Pikuri
@@ -26,7 +25,11 @@ module Pikuri
     #
     # == Truncation rules
     #
-    # Two independent limits, whichever fires first wins:
+    # The line/byte windowing is delegated to
+    # {Pikuri::FileType.read_as_text_paged}, which returns a
+    # {Pikuri::Extractor::Page} this tool renders; the same windower
+    # backs +VectorDb::Tools::Read+. Two independent limits, whichever fires
+    # first wins:
     #
     # * *Line limit* — {DEFAULT_LIMIT} lines (overridable via +limit+).
     # * *Byte cap* — {MAX_BYTES} bytes of input content; not exposed as a
@@ -34,25 +37,30 @@ module Pikuri
     #
     # Additionally, individual lines longer than {MAX_LINE_LENGTH} chars
     # are truncated with {LINE_TRUNCATION_MARKER} appended; the model is
-    # told to reach for +grep+ to find content inside such files.
+    # told to reach for +grep+ to find content inside such files. (These
+    # constants alias the +PAGE_*+ ones on {Pikuri::Extractor} — one
+    # source of truth, shared with +VectorDb::Tools::Read+.)
     #
-    # == PDF extraction
+    # == PDF (and other extracted formats)
     #
-    # PDFs are detected by their +%PDF-+ magic prefix in the sample bytes
-    # and routed to {.format_pdf} instead of the binary-refusal path. The
-    # extractor walks pages lazily via +pdf-reader+, emitting one synthetic
-    # +"--- Page N ---"+ header line per page followed by that page's text.
-    # The offset / limit / MAX_BYTES contract is identical to the text
-    # path — extraction stops as soon as the line or byte cap is hit, so
-    # reading the first window of a 500-page PDF only parses the few pages
-    # needed. Line numbers in PDF output are for citation back to the user
-    # only; PDFs are not editable through {Edit}.
+    # Which formats read as text is the {Pikuri::Extractor} registry's
+    # business, not this tool's: with pikuri-pdf's extractor
+    # registered, PDFs are claimed by their +%PDF-+ magic prefix ahead
+    # of the binary refusal and extracted with one synthetic
+    # +"--- Page N ---"+ header line per page (see
+    # +Pikuri::Extractors::PDF+); a gem plugging another extractor
+    # into the registry extends this tool for free. Extraction is lazy
+    # where the format allows (+extract_lines+): reading the first
+    # window of a 500-page PDF parses only the pages the window needs.
+    # Formats without a lazy line shape (HTML) are extracted in full
+    # and then windowed. Line numbers in PDF output are for citation
+    # back to the user only; PDFs are not editable through {Edit}.
     #
     # PDFs with no extractable text (scanned images, empty documents) come
     # back with an LLM-actionable hint string rather than an empty
     # observation. Encrypted / malformed / XFA-form PDFs surface as
-    # +"Error: cannot extract PDF text: ..."+ — same convention as other
-    # tool errors the model can react to. No OCR.
+    # +"Error: ..."+ — same convention as other tool errors the model
+    # can react to. No OCR.
     #
     # == Image attachments
     #
@@ -90,32 +98,38 @@ module Pikuri
     # * Image larger than {MAX_IMAGE_BYTES} → +"Error: image too large…"+,
     #   leaving the model to pick a different file or ask the user to
     #   resize.
-    # * Binary content → {Pikuri::FileType.binary?} on the sample; any
-    #   +NUL+ byte or a sample dense in control characters triggers
-    #   refusal. Catches archives and compiled artifacts without an
-    #   extension list to maintain. PDFs and supported images are
-    #   intercepted by their respective magic-byte checks via
-    #   {Pikuri::FileType.detect_mime} before the binary sniff — see
-    #   above.
+    # * Binary content → nothing in the {Pikuri::Extractor} registry
+    #   claims it ({Pikuri::Extractor::Passthrough} declines on the
+    #   {Pikuri::FileType.binary?} heuristic: any +NUL+ byte or a
+    #   sample dense in control characters). Catches archives and
+    #   compiled artifacts without an extension list to maintain.
+    #   Registered extractors (pikuri-pdf's PDF, pikuri-extractors'
+    #   office formats) claim their bytes ahead of that refusal;
+    #   images are intercepted here via {Pikuri::FileType.detect_mime}
+    #   before extraction is attempted — see above.
     # * Offset past EOF → +"Error: offset N is beyond end of file (M lines total)"+.
     class Read < Pikuri::Tool
+      # The windowing constants live on {Pikuri::Extractor} (shared
+      # with +VectorDb::Tools::Read+); these aliases keep the names this tool's
+      # description and specs reference pointing at the single source.
       # @return [Integer] default value of the +limit+ parameter (number
       #   of lines to read per call).
-      DEFAULT_LIMIT = 2000
+      DEFAULT_LIMIT = Pikuri::Extractor::PAGE_DEFAULT_LIMIT
       # @return [Integer] per-line character cap; longer lines are
       #   truncated with {LINE_TRUNCATION_MARKER}.
-      MAX_LINE_LENGTH = 2000
+      MAX_LINE_LENGTH = Pikuri::Extractor::PAGE_MAX_LINE_LENGTH
       # @return [String] suffix appended to lines truncated by
       #   {MAX_LINE_LENGTH}.
-      LINE_TRUNCATION_MARKER = "... (line truncated to #{MAX_LINE_LENGTH} chars)"
+      LINE_TRUNCATION_MARKER = Pikuri::Extractor::PAGE_LINE_TRUNCATION_MARKER
       # @return [Integer] hard byte cap on input content collected per
       #   call. Counted on the line bytes (plus one for the joining
       #   newline); the rendered output is slightly larger due to the
       #   per-line +"%6d\t"+ prefix.
-      MAX_BYTES = 50 * 1024
+      MAX_BYTES = Pikuri::Extractor::PAGE_MAX_BYTES
       # @return [String] human-readable form of {MAX_BYTES} for the
       #   continuation marker.
@@ -210,168 +224,78 @@ module Pikuri
         return "Error: #{path} is a directory; use the glob tool to list files." if resolved.directory?
         mime = Pikuri::FileType.detect_mime(resolved)
-        return format_pdf(path: path, resolved: resolved, offset: offset, limit: limit) if mime == 'application/pdf'
         return format_image(path: path, resolved: resolved, mime: mime) if mime&.start_with?('image/')
-        return "Error: cannot read binary file: #{path}" if Pikuri::FileType.binary?(resolved)
-        format_slice(path: path, resolved: resolved, offset: offset, limit: limit)
+        page = Pikuri::FileType.read_as_text_paged(
+          resolved, offset: offset, limit: limit,
+          max_bytes: MAX_BYTES, max_line_length: MAX_LINE_LENGTH
+        )
+        render_page(page)
       rescue Filesystem::Error => e
         "Error: #{e.message}"
       rescue Errno::EACCES => e
         "Error: cannot read #{path}: #{e.message}"
+      rescue ArgumentError
+        # Nothing in the Extractor registry claimed the content —
+        # read_as_text_paged's binary refusal (directories and images
+        # were already handled above).
+        "Error: cannot read binary file: #{path}"
+      rescue RuntimeError => e
+        # Extraction failure (malformed / unsupported PDF, ...)
+        # surfaced by read_as_text_paged.
+        "Error: #{e.message}"
       end
-      # Stream the file line-by-line, collect at most +limit+ lines
-      # starting at +offset+, and stop early if {MAX_BYTES} is reached.
-      # We keep counting lines past the collection window so the trailer
-      # can report total line count when the line limit (not the byte
-      # cap) was the stopping criterion — same trick opencode uses.
+      # Render a {Pikuri::Extractor::Page} as the cat-n observation: a
+      # six-column line number, a tab, then the (already-truncated)
+      # content, followed by a trailer that tells the model whether to
+      # page on. PDF pages carry +"--- Page N ---"+ marker lines from
+      # the extractor; the +kind+ only changes trailer wording here.
       #
+      # @param page [Pikuri::Extractor::Page]
       # @return [String]
-      def self.format_slice(path:, resolved:, offset:, limit:)
-        start_index   = offset - 1
-        collected     = []
-        total_lines   = 0
-        bytes         = 0
-        byte_cap_hit  = false
-        has_more      = false
-        resolved.each_line do |raw|
-          total_lines += 1
-          next if total_lines <= start_index
-          if collected.length >= limit
-            has_more = true
-            next
-          end
-          line = raw.chomp
-          if line.length > MAX_LINE_LENGTH
-            line = line[0, MAX_LINE_LENGTH] + LINE_TRUNCATION_MARKER
-          end
-          size = line.bytesize + 1 # +1 for the joining newline
-          if bytes + size > MAX_BYTES
-            byte_cap_hit = true
-            has_more = true
-            break
-          end
-          collected << line
-          bytes += size
-        end
-        return '(Empty file)' if total_lines.zero?
+      def self.render_page(page)
+        if page.lines.empty?
+          return empty_message(page) if page.total_lines.zero?
-        if start_index >= total_lines
-          return "Error: offset #{offset} is beyond end of file (#{total_lines} lines total)"
+          return "Error: offset #{page.start_line} is beyond end of file " \
+                 "(#{page.total_lines} lines total)"
         end
-        last_line = offset + collected.length - 1
-        body = collected.each_with_index.map { |line, i| format("%6d\t%s", i + offset, line) }.join("\n")
+        noun = page.kind == :pdf ? 'PDF ' : ''
+        last = page.start_line + page.lines.length - 1
+        body = page.lines.each_with_index.map { |line, i| format("%6d\t%s", i + page.start_line, line) }.join("\n")
         trailer =
-          if byte_cap_hit
-            "(Output capped at #{MAX_BYTES_LABEL}. Showing lines #{offset}-#{last_line}. " \
-              "Use offset=#{last_line + 1} to continue.)"
-          elsif has_more
-            "(Showing lines #{offset}-#{last_line} of #{total_lines}. " \
-              "Use offset=#{last_line + 1} to continue.)"
+          if page.byte_capped
+            "(Output capped at #{MAX_BYTES_LABEL}. Showing #{noun}lines #{page.start_line}-#{last}. " \
+              "Use offset=#{last + 1} to continue.)"
+          elsif page.more
+            total = page.total_lines ? " of #{page.total_lines}" : ''
+            "(Showing #{noun}lines #{page.start_line}-#{last}#{total}. " \
+              "Use offset=#{last + 1} to continue.)"
           else
-            "(End of file - total #{total_lines} lines)"
+            "(End of #{page.kind == :pdf ? 'PDF' : 'file'} - total #{page.total_lines} lines)"
           end
         "#{body}\n\n#{trailer}"
       end
-      private_class_method :format_slice
+      private_class_method :render_page
-      # PDF counterpart to {.format_slice}: walk +pdf-reader+'s lazy page
-      # iterator, emit a +"--- Page N ---"+ header followed by each line
-      # of the page's extracted text, and apply the same offset / limit /
-      # MAX_BYTES contract. Stops parsing as soon as the cap is hit so a
-      # 500-page PDF only touches the few pages needed for the requested
-      # window — the file handle stays open inside the +throw :done+
-      # block, which short-circuits both the inner line loop and the
-      # outer page loop in one move.
-      #
-      # The +has_more+ trailer here cannot quote a total line count the
-      # way text files do — we'd have to parse every page just to count.
-      # Instead we drop the "of N lines" claim and stick to the
-      # next-offset hint, which is all the model needs to page.
+      # The empty-document message, worded by content kind: a scanned /
+      # text-free PDF gets an LLM-actionable hint rather than the
+      # plain-file "(Empty file)".
       #
+      # @param page [Pikuri::Extractor::Page]
       # @return [String]
-      def self.format_pdf(path:, resolved:, offset:, limit:)
-        start_index   = offset - 1
-        collected     = []
-        total_lines   = 0
-        bytes         = 0
-        byte_cap_hit  = false
-        has_more      = false
-        catch(:done) do
-          resolved.open('rb') do |io|
-            reader = ::PDF::Reader.new(io)
-            reader.pages.each_with_index do |page, idx|
-              text = page.text.strip
-              next if text.empty?
-              page_lines = ["--- Page #{idx + 1} ---", *text.split("\n")]
-              page_lines.each do |raw|
-                total_lines += 1
-                next if total_lines <= start_index
-                if collected.length >= limit
-                  has_more = true
-                  throw :done
-                end
-                line = raw
-                if line.length > MAX_LINE_LENGTH
-                  line = line[0, MAX_LINE_LENGTH] + LINE_TRUNCATION_MARKER
-                end
-                size = line.bytesize + 1
-                if bytes + size > MAX_BYTES
-                  byte_cap_hit = true
-                  has_more = true
-                  throw :done
-                end
-                collected << line
-                bytes += size
-              end
-            end
-          end
-        end
-        return '(PDF has no extractable text; likely scanned image content)' if total_lines.zero?
-        if start_index >= total_lines
-          return "Error: offset #{offset} is beyond end of file (#{total_lines} lines total)"
+      def self.empty_message(page)
+        if page.kind == :pdf
+          '(PDF has no extractable text; likely scanned image content)'
+        else
+          '(Empty file)'
         end
-        last_line = offset + collected.length - 1
-        body = collected.each_with_index.map { |line, i| format("%6d\t%s", i + offset, line) }.join("\n")
-        trailer =
-          if byte_cap_hit
-            "(Output capped at #{MAX_BYTES_LABEL}. Showing PDF lines #{offset}-#{last_line}. " \
-              "Use offset=#{last_line + 1} to continue.)"
-          elsif has_more
-            "(Showing PDF lines #{offset}-#{last_line}. " \
-              "Use offset=#{last_line + 1} to continue.)"
-          else
-            "(End of PDF - total #{total_lines} lines)"
-          end
-        "#{body}\n\n#{trailer}"
-      rescue ::PDF::Reader::MalformedPDFError,
-             ::PDF::Reader::InvalidPageError,
-             ::PDF::Reader::UnsupportedFeatureError => e
-        "Error: cannot extract PDF text from #{path}: #{e.class.name.split('::').last}: #{e.message}"
       end
-      private_class_method :format_pdf
+      private_class_method :empty_message
       # Build a multimodal observation: a short metadata note ("Read
       # image: …") plus the file itself attached as a path. The model

data/lib/pikuri-workspace.rb CHANGED Viewed

@@ -23,8 +23,9 @@ module Pikuri
     LOADER.eager_load
     # Reap +~/.cache/pikuri/workspace-*+ leftovers from sessions
-    # killed before their +at_exit+ could fire. Best-effort; runs once
-    # at gem load. See {Filesystem.sweep_stale_internal_temps!}.
+    # killed before the {Pikuri::Finalizers} removal could fire.
+    # Best-effort; runs once at gem load. See
+    # {Filesystem.sweep_stale_internal_temps!}.
     Filesystem.sweep_stale_internal_temps!
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pikuri-workspace
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.6
 platform: ruby
 authors:
 - Martin Vysny
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2026-05-29 00:00:00.000000000 Z
+date: 2026-06-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pikuri-core
@@ -16,14 +16,14 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 0.0.4
+        version: 0.0.6
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 0.0.4
+        version: 0.0.6
 description: |
   pikuri-workspace adds "operate on a directory tree" to pikuri-core
   agents: the +Pikuri::Workspace::Filesystem+ class that scopes