RubyGems - pikuri-workspace - Versions diffs - 0.0.4 → 0.0.5 - Mend

pikuri-workspace 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

checksums.yaml +4 -4
data/lib/pikuri/workspace/filesystem.rb +17 -14
data/lib/pikuri/workspace/read.rb +71 -152
data/lib/pikuri-workspace.rb +3 -2
metadata +4 -4

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4be40e803f487eb130b8ee758b86efb7a744c92691a1dd103aae68d91e59a2f9
-  data.tar.gz: 981478770e9e7f8acef1fb1ca2b1896cf6a25c7eb73c01c1d57592f81583cf0f
+  metadata.gz: afd83aa622997eea8b09c700282cf401753d1925ace04a0c4ade3856c07ee8b4
+  data.tar.gz: 4cd46c7de8d42112e1119e68bab0ce7937ed9e5c270a29ceb34e5089178abd4c
 SHA512:
-  metadata.gz: a211df218e4270acbe737f624cc8a77cc145d8a051c3500131036692168996c800a0110a27bd714a7cad3fb5cb9dd3e25fdfdeefb917fb73ccead021cd807c94
-  data.tar.gz: fe2ff7510012d619caa166252ccef80148750e35fa58a68cd1722cec372758dc17e5605cbe380c4f106538b301134402da32e0fb13c6f23d6acec7c4aceb301e
+  metadata.gz: c818b87a2aca2f0f615d084cc7c6e0457d0a84e5cd5be8b061c36aba50a071ad2565c27170d33ae25ca810dee8a25b16b1a13b25477b21e1c00eb79d067f2bb0
+  data.tar.gz: 7277b1ba878546589d4107136f4d49e3dc68b0ee5aff80d3eeff8e06f9d97b4fa569eb8caf186337e39c72e9f33812e3963fd433eeb35f926f82bb6b681bb579

data/lib/pikuri/workspace/filesystem.rb CHANGED

@@ -37,9 +37,9 @@ module Pikuri
     # +~/.cache/pikuri/workspace-XXX/+ ({#internal_temp}). It is minted
     # lazily on first access — workspaces that never touch it (most
     # specs, hosts that don't want a playground and don't use the
-    # bubblewrap overlay) pay nothing — and removed by a single
-    # +at_exit+ handler when the process exits. Everything ephemeral
-    # this workspace produces lives inside the umbrella, so one
+    # bubblewrap overlay) pay nothing — and removed at process exit via
+    # a {Pikuri::Finalizers} registration. Everything ephemeral this
+    # workspace produces lives inside the umbrella, so one
     # +remove_entry+ at process exit cleans the lot:
     #
     # * {#temp} — the LLM-visible playground subdir, present only when
@@ -60,9 +60,9 @@ module Pikuri
     #
     # At gem load, {.sweep_stale_internal_temps!} prunes umbrella dirs
     # older than seven days — a safety net for sessions that died
-    # before +at_exit+ could run (SIGKILL, OOM). Recent umbrellas are
-    # left alone so a concurrent pikuri-code in another shell isn't
-    # disturbed.
+    # before the {Pikuri::Finalizers} sweep could run (SIGKILL, OOM).
+    # Recent umbrellas are left alone so a concurrent pikuri-code in
+    # another shell isn't disturbed.
     #
     # == Optional temp playground
     #
@@ -279,8 +279,8 @@ module Pikuri
       end
       # Per-workspace ephemeral umbrella. Minted lazily on first call
-      # under {CACHE_BASE}. Registered for +at_exit+ removal the
-      # moment it's minted, so anything subsequently placed inside
+      # under {CACHE_BASE}. Registered with {Pikuri::Finalizers} for
+      # removal the moment it's minted, so anything subsequently placed inside
       # (the playground, {Pikuri::Code::Bash::Sandbox::Bubblewrap}'s
       # overlay state) gets wiped together. Callers that want
       # ephemeral state owned by the workspace should put it under
@@ -291,13 +291,16 @@ module Pikuri
         @internal_temp ||= Filesystem.mint_internal_temp
       end
-      # @api private — minting helper shared with {AllowAll}. The
-      # +FileUtils.remove_entry+ +at_exit+ guards against the dir
-      # being already gone (test cleanup, manual rm).
+      # @api private — minting helper shared with {AllowAll}.
+      # Registers umbrella removal with {Pikuri::Finalizers} (block
+      # form — the umbrella is a dir to wipe, not a closeable object)
+      # rather than its own +at_exit+, so process teardown stays on the
+      # one registry. The +path.exist?+ guard makes the removal a no-op
+      # when the dir is already gone (test cleanup, manual rm).
       def self.mint_internal_temp
         FileUtils.mkdir_p(CACHE_BASE)
         path = Pathname.new(Dir.mktmpdir('workspace-', CACHE_BASE)).realpath
-        at_exit { FileUtils.remove_entry(path.to_s) if path.exist? }
+        Pikuri::Finalizers.register { FileUtils.remove_entry(path.to_s) if path.exist? }
         path
       end
@@ -305,8 +308,8 @@ module Pikuri
       # {INTERNAL_TEMP_STALE_SECONDS}. Called once at gem load via
       # {Pikuri::Workspace} so each process boot inherits a tidy
       # {CACHE_BASE}. Failures (permission denied, racing concurrent
-      # sweeper) are swallowed — best-effort cleanup, the real
-      # +at_exit+ path is the load-bearing one.
+      # sweeper) are swallowed — best-effort cleanup; the
+      # {Pikuri::Finalizers} removal is the load-bearing path.
       #
       # @return [void]
       def self.sweep_stale_internal_temps!

data/lib/pikuri/workspace/read.rb CHANGED

@@ -1,6 +1,5 @@
 # frozen_string_literal: true
-require 'pdf-reader'
 require 'ruby_llm'
 module Pikuri
@@ -26,7 +25,11 @@ module Pikuri
     #
     # == Truncation rules
     #
-    # Two independent limits, whichever fires first wins:
+    # The line/byte windowing is delegated to
+    # {Pikuri::FileType.read_as_text_paged}, which returns a
+    # {Pikuri::FileType::Page} this tool renders; the same windower
+    # backs +VectorDb::Tools::Read+. Two independent limits, whichever fires
+    # first wins:
     #
     # * *Line limit* — {DEFAULT_LIMIT} lines (overridable via +limit+).
     # * *Byte cap* — {MAX_BYTES} bytes of input content; not exposed as a
@@ -34,19 +37,22 @@ module Pikuri
     #
     # Additionally, individual lines longer than {MAX_LINE_LENGTH} chars
     # are truncated with {LINE_TRUNCATION_MARKER} appended; the model is
-    # told to reach for +grep+ to find content inside such files.
+    # told to reach for +grep+ to find content inside such files. (These
+    # constants alias the +PAGE_*+ ones on {Pikuri::FileType} — one
+    # source of truth, shared with +VectorDb::Tools::Read+.)
     #
     # == PDF extraction
     #
     # PDFs are detected by their +%PDF-+ magic prefix in the sample bytes
-    # and routed to {.format_pdf} instead of the binary-refusal path. The
-    # extractor walks pages lazily via +pdf-reader+, emitting one synthetic
-    # +"--- Page N ---"+ header line per page followed by that page's text.
-    # The offset / limit / MAX_BYTES contract is identical to the text
-    # path — extraction stops as soon as the line or byte cap is hit, so
-    # reading the first window of a 500-page PDF only parses the few pages
-    # needed. Line numbers in PDF output are for citation back to the user
-    # only; PDFs are not editable through {Edit}.
+    # and routed through {Pikuri::FileType.read_as_text_paged} instead of
+    # the binary-refusal path. The extractor walks pages lazily via
+    # +pdf-reader+, emitting one synthetic +"--- Page N ---"+ header line
+    # per page followed by that page's text. The offset / limit /
+    # MAX_BYTES contract is identical to the text path — extraction stops
+    # as soon as the line or byte cap is hit, so reading the first window
+    # of a 500-page PDF only parses the few pages needed. Line numbers in
+    # PDF output are for citation back to the user only; PDFs are not
+    # editable through {Edit}.
     #
     # PDFs with no extractable text (scanned images, empty documents) come
     # back with an LLM-actionable hint string rather than an empty
@@ -99,23 +105,27 @@ module Pikuri
     #   above.
     # * Offset past EOF → +"Error: offset N is beyond end of file (M lines total)"+.
     class Read < Pikuri::Tool
+      # The windowing constants live on {Pikuri::FileType} now (shared
+      # with +VectorDb::Tools::Read+); these aliases keep the names this tool's
+      # description and specs reference pointing at the single source.
       # @return [Integer] default value of the +limit+ parameter (number
       #   of lines to read per call).
-      DEFAULT_LIMIT = 2000
+      DEFAULT_LIMIT = Pikuri::FileType::PAGE_DEFAULT_LIMIT
       # @return [Integer] per-line character cap; longer lines are
       #   truncated with {LINE_TRUNCATION_MARKER}.
-      MAX_LINE_LENGTH = 2000
+      MAX_LINE_LENGTH = Pikuri::FileType::PAGE_MAX_LINE_LENGTH
       # @return [String] suffix appended to lines truncated by
       #   {MAX_LINE_LENGTH}.
-      LINE_TRUNCATION_MARKER = "... (line truncated to #{MAX_LINE_LENGTH} chars)"
+      LINE_TRUNCATION_MARKER = Pikuri::FileType::PAGE_LINE_TRUNCATION_MARKER
       # @return [Integer] hard byte cap on input content collected per
       #   call. Counted on the line bytes (plus one for the joining
       #   newline); the rendered output is slightly larger due to the
       #   per-line +"%6d\t"+ prefix.
-      MAX_BYTES = 50 * 1024
+      MAX_BYTES = Pikuri::FileType::PAGE_MAX_BYTES
       # @return [String] human-readable form of {MAX_BYTES} for the
       #   continuation marker.
@@ -210,168 +220,77 @@ module Pikuri
         return "Error: #{path} is a directory; use the glob tool to list files." if resolved.directory?
         mime = Pikuri::FileType.detect_mime(resolved)
-        return format_pdf(path: path, resolved: resolved, offset: offset, limit: limit) if mime == 'application/pdf'
         return format_image(path: path, resolved: resolved, mime: mime) if mime&.start_with?('image/')
-        return "Error: cannot read binary file: #{path}" if Pikuri::FileType.binary?(resolved)
+        # PDFs are binary by the heuristic, so the PDF route (handled
+        # inside read_as_text_paged) must win over the binary refusal.
+        if mime != 'application/pdf' && Pikuri::FileType.binary?(resolved)
+          return "Error: cannot read binary file: #{path}"
+        end
-        format_slice(path: path, resolved: resolved, offset: offset, limit: limit)
+        page = Pikuri::FileType.read_as_text_paged(
+          resolved, offset: offset, limit: limit,
+          max_bytes: MAX_BYTES, max_line_length: MAX_LINE_LENGTH
+        )
+        render_page(page)
       rescue Filesystem::Error => e
         "Error: #{e.message}"
       rescue Errno::EACCES => e
         "Error: cannot read #{path}: #{e.message}"
+      rescue RuntimeError => e
+        # Malformed / unsupported PDF surfaced by read_as_text_paged.
+        "Error: #{e.message}"
       end
-      # Stream the file line-by-line, collect at most +limit+ lines
-      # starting at +offset+, and stop early if {MAX_BYTES} is reached.
-      # We keep counting lines past the collection window so the trailer
-      # can report total line count when the line limit (not the byte
-      # cap) was the stopping criterion — same trick opencode uses.
+      # Render a {Pikuri::FileType::Page} as the cat-n observation: a
+      # six-column line number, a tab, then the (already-truncated)
+      # content, followed by a trailer that tells the model whether to
+      # page on. PDF pages carry +"--- Page N ---"+ marker lines from
+      # the extractor; the +kind+ only changes trailer wording here.
       #
+      # @param page [Pikuri::FileType::Page]
       # @return [String]
-      def self.format_slice(path:, resolved:, offset:, limit:)
-        start_index   = offset - 1
-        collected     = []
-        total_lines   = 0
-        bytes         = 0
-        byte_cap_hit  = false
-        has_more      = false
+      def self.render_page(page)
+        if page.lines.empty?
+          return empty_message(page) if page.total_lines.zero?
-        resolved.each_line do |raw|
-          total_lines += 1
-          next if total_lines <= start_index
-          if collected.length >= limit
-            has_more = true
-            next
-          end
-          line = raw.chomp
-          if line.length > MAX_LINE_LENGTH
-            line = line[0, MAX_LINE_LENGTH] + LINE_TRUNCATION_MARKER
-          end
-          size = line.bytesize + 1 # +1 for the joining newline
-          if bytes + size > MAX_BYTES
-            byte_cap_hit = true
-            has_more = true
-            break
-          end
-          collected << line
-          bytes += size
-        end
-        return '(Empty file)' if total_lines.zero?
-        if start_index >= total_lines
-          return "Error: offset #{offset} is beyond end of file (#{total_lines} lines total)"
+          return "Error: offset #{page.start_line} is beyond end of file " \
+                 "(#{page.total_lines} lines total)"
         end
-        last_line = offset + collected.length - 1
-        body = collected.each_with_index.map { |line, i| format("%6d\t%s", i + offset, line) }.join("\n")
+        noun = page.kind == :pdf ? 'PDF ' : ''
+        last = page.start_line + page.lines.length - 1
+        body = page.lines.each_with_index.map { |line, i| format("%6d\t%s", i + page.start_line, line) }.join("\n")
         trailer =
-          if byte_cap_hit
-            "(Output capped at #{MAX_BYTES_LABEL}. Showing lines #{offset}-#{last_line}. " \
-              "Use offset=#{last_line + 1} to continue.)"
-          elsif has_more
-            "(Showing lines #{offset}-#{last_line} of #{total_lines}. " \
-              "Use offset=#{last_line + 1} to continue.)"
+          if page.byte_capped
+            "(Output capped at #{MAX_BYTES_LABEL}. Showing #{noun}lines #{page.start_line}-#{last}. " \
+              "Use offset=#{last + 1} to continue.)"
+          elsif page.more
+            total = page.total_lines ? " of #{page.total_lines}" : ''
+            "(Showing #{noun}lines #{page.start_line}-#{last}#{total}. " \
+              "Use offset=#{last + 1} to continue.)"
           else
-            "(End of file - total #{total_lines} lines)"
+            "(End of #{page.kind == :pdf ? 'PDF' : 'file'} - total #{page.total_lines} lines)"
           end
         "#{body}\n\n#{trailer}"
       end
-      private_class_method :format_slice
+      private_class_method :render_page
-      # PDF counterpart to {.format_slice}: walk +pdf-reader+'s lazy page
-      # iterator, emit a +"--- Page N ---"+ header followed by each line
-      # of the page's extracted text, and apply the same offset / limit /
-      # MAX_BYTES contract. Stops parsing as soon as the cap is hit so a
-      # 500-page PDF only touches the few pages needed for the requested
-      # window — the file handle stays open inside the +throw :done+
-      # block, which short-circuits both the inner line loop and the
-      # outer page loop in one move.
-      #
-      # The +has_more+ trailer here cannot quote a total line count the
-      # way text files do — we'd have to parse every page just to count.
-      # Instead we drop the "of N lines" claim and stick to the
-      # next-offset hint, which is all the model needs to page.
+      # The empty-document message, worded by content kind: a scanned /
+      # text-free PDF gets an LLM-actionable hint rather than the
+      # plain-file "(Empty file)".
       #
+      # @param page [Pikuri::FileType::Page]
       # @return [String]
-      def self.format_pdf(path:, resolved:, offset:, limit:)
-        start_index   = offset - 1
-        collected     = []
-        total_lines   = 0
-        bytes         = 0
-        byte_cap_hit  = false
-        has_more      = false
-        catch(:done) do
-          resolved.open('rb') do |io|
-            reader = ::PDF::Reader.new(io)
-            reader.pages.each_with_index do |page, idx|
-              text = page.text.strip
-              next if text.empty?
-              page_lines = ["--- Page #{idx + 1} ---", *text.split("\n")]
-              page_lines.each do |raw|
-                total_lines += 1
-                next if total_lines <= start_index
-                if collected.length >= limit
-                  has_more = true
-                  throw :done
-                end
-                line = raw
-                if line.length > MAX_LINE_LENGTH
-                  line = line[0, MAX_LINE_LENGTH] + LINE_TRUNCATION_MARKER
-                end
-                size = line.bytesize + 1
-                if bytes + size > MAX_BYTES
-                  byte_cap_hit = true
-                  has_more = true
-                  throw :done
-                end
-                collected << line
-                bytes += size
-              end
-            end
-          end
-        end
-        return '(PDF has no extractable text; likely scanned image content)' if total_lines.zero?
-        if start_index >= total_lines
-          return "Error: offset #{offset} is beyond end of file (#{total_lines} lines total)"
+      def self.empty_message(page)
+        if page.kind == :pdf
+          '(PDF has no extractable text; likely scanned image content)'
+        else
+          '(Empty file)'
         end
-        last_line = offset + collected.length - 1
-        body = collected.each_with_index.map { |line, i| format("%6d\t%s", i + offset, line) }.join("\n")
-        trailer =
-          if byte_cap_hit
-            "(Output capped at #{MAX_BYTES_LABEL}. Showing PDF lines #{offset}-#{last_line}. " \
-              "Use offset=#{last_line + 1} to continue.)"
-          elsif has_more
-            "(Showing PDF lines #{offset}-#{last_line}. " \
-              "Use offset=#{last_line + 1} to continue.)"
-          else
-            "(End of PDF - total #{total_lines} lines)"
-          end
-        "#{body}\n\n#{trailer}"
-      rescue ::PDF::Reader::MalformedPDFError,
-             ::PDF::Reader::InvalidPageError,
-             ::PDF::Reader::UnsupportedFeatureError => e
-        "Error: cannot extract PDF text from #{path}: #{e.class.name.split('::').last}: #{e.message}"
       end
-      private_class_method :format_pdf
+      private_class_method :empty_message
       # Build a multimodal observation: a short metadata note ("Read
       # image: …") plus the file itself attached as a path. The model

data/lib/pikuri-workspace.rb CHANGED

@@ -23,8 +23,9 @@ module Pikuri
     LOADER.eager_load
     # Reap +~/.cache/pikuri/workspace-*+ leftovers from sessions
-    # killed before their +at_exit+ could fire. Best-effort; runs once
-    # at gem load. See {Filesystem.sweep_stale_internal_temps!}.
+    # killed before the {Pikuri::Finalizers} removal could fire.
+    # Best-effort; runs once at gem load. See
+    # {Filesystem.sweep_stale_internal_temps!}.
     Filesystem.sweep_stale_internal_temps!
   end
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pikuri-workspace
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - Martin Vysny
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2026-05-29 00:00:00.000000000 Z
+date: 2026-06-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: pikuri-core
@@ -16,14 +16,14 @@ dependencies:
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 0.0.4
+        version: 0.0.5
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - '='
       - !ruby/object:Gem::Version
-        version: 0.0.4
+        version: 0.0.5
 description: |
   pikuri-workspace adds "operate on a directory tree" to pikuri-core
   agents: the +Pikuri::Workspace::Filesystem+ class that scopes