pikuri-workspace 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4be40e803f487eb130b8ee758b86efb7a744c92691a1dd103aae68d91e59a2f9
4
- data.tar.gz: 981478770e9e7f8acef1fb1ca2b1896cf6a25c7eb73c01c1d57592f81583cf0f
3
+ metadata.gz: 52db0d4ce078507aa4a72cbc84d54a9433788f2a2d9a90ab883a9e469e6dea99
4
+ data.tar.gz: 107b34e1c3b387b4b68d542ff8ab2345885383a461c227a6b7aeb29973a4292e
5
5
  SHA512:
6
- metadata.gz: a211df218e4270acbe737f624cc8a77cc145d8a051c3500131036692168996c800a0110a27bd714a7cad3fb5cb9dd3e25fdfdeefb917fb73ccead021cd807c94
7
- data.tar.gz: fe2ff7510012d619caa166252ccef80148750e35fa58a68cd1722cec372758dc17e5605cbe380c4f106538b301134402da32e0fb13c6f23d6acec7c4aceb301e
6
+ metadata.gz: 432af6cfc0a0f3555666e9c88accb0e9b6162af2c5f041c9ff71b10443f1681b8e70b9e46aa6c75ed12344357a286df087869b889f0a40aeb9635aa6c9a1e651
7
+ data.tar.gz: 362eb9437127e8734f15e09919ae7fb928e0d1b71fc9bb90a78f419cb7b4f52f29aebd323dd32e7bd491b0a73a3de60cff54a91e1756f24bcdc4e91d52de5fc2
@@ -37,9 +37,9 @@ module Pikuri
37
37
  # +~/.cache/pikuri/workspace-XXX/+ ({#internal_temp}). It is minted
38
38
  # lazily on first access — workspaces that never touch it (most
39
39
  # specs, hosts that don't want a playground and don't use the
40
- # bubblewrap overlay) pay nothing — and removed by a single
41
- # +at_exit+ handler when the process exits. Everything ephemeral
42
- # this workspace produces lives inside the umbrella, so one
40
+ # bubblewrap overlay) pay nothing — and removed at process exit via
41
+ # a {Pikuri::Finalizers} registration. Everything ephemeral this
42
+ # workspace produces lives inside the umbrella, so one
43
43
  # +remove_entry+ at process exit cleans the lot:
44
44
  #
45
45
  # * {#temp} — the LLM-visible playground subdir, present only when
@@ -60,9 +60,9 @@ module Pikuri
60
60
  #
61
61
  # At gem load, {.sweep_stale_internal_temps!} prunes umbrella dirs
62
62
  # older than seven days — a safety net for sessions that died
63
- # before +at_exit+ could run (SIGKILL, OOM). Recent umbrellas are
64
- # left alone so a concurrent pikuri-code in another shell isn't
65
- # disturbed.
63
+ # before the {Pikuri::Finalizers} sweep could run (SIGKILL, OOM).
64
+ # Recent umbrellas are left alone so a concurrent pikuri-code in
65
+ # another shell isn't disturbed.
66
66
  #
67
67
  # == Optional temp playground
68
68
  #
@@ -279,8 +279,8 @@ module Pikuri
279
279
  end
280
280
 
281
281
  # Per-workspace ephemeral umbrella. Minted lazily on first call
282
- # under {CACHE_BASE}. Registered for +at_exit+ removal the
283
- # moment it's minted, so anything subsequently placed inside
282
+ # under {CACHE_BASE}. Registered with {Pikuri::Finalizers} for
283
+ # removal the moment it's minted, so anything subsequently placed inside
284
284
  # (the playground, {Pikuri::Code::Bash::Sandbox::Bubblewrap}'s
285
285
  # overlay state) gets wiped together. Callers that want
286
286
  # ephemeral state owned by the workspace should put it under
@@ -291,13 +291,16 @@ module Pikuri
291
291
  @internal_temp ||= Filesystem.mint_internal_temp
292
292
  end
293
293
 
294
- # @api private — minting helper shared with {AllowAll}. The
295
- # +FileUtils.remove_entry+ +at_exit+ guards against the dir
296
- # being already gone (test cleanup, manual rm).
294
+ # @api private — minting helper shared with {AllowAll}.
295
+ # Registers umbrella removal with {Pikuri::Finalizers} (block
296
+ # form — the umbrella is a dir to wipe, not a closeable object)
297
+ # rather than its own +at_exit+, so process teardown stays on the
298
+ # one registry. The +path.exist?+ guard makes the removal a no-op
299
+ # when the dir is already gone (test cleanup, manual rm).
297
300
  def self.mint_internal_temp
298
301
  FileUtils.mkdir_p(CACHE_BASE)
299
302
  path = Pathname.new(Dir.mktmpdir('workspace-', CACHE_BASE)).realpath
300
- at_exit { FileUtils.remove_entry(path.to_s) if path.exist? }
303
+ Pikuri::Finalizers.register { FileUtils.remove_entry(path.to_s) if path.exist? }
301
304
  path
302
305
  end
303
306
 
@@ -305,8 +308,8 @@ module Pikuri
305
308
  # {INTERNAL_TEMP_STALE_SECONDS}. Called once at gem load via
306
309
  # {Pikuri::Workspace} so each process boot inherits a tidy
307
310
  # {CACHE_BASE}. Failures (permission denied, racing concurrent
308
- # sweeper) are swallowed — best-effort cleanup, the real
309
- # +at_exit+ path is the load-bearing one.
311
+ # sweeper) are swallowed — best-effort cleanup; the
312
+ # {Pikuri::Finalizers} removal is the load-bearing path.
310
313
  #
311
314
  # @return [void]
312
315
  def self.sweep_stale_internal_temps!
@@ -1,6 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'pdf-reader'
4
3
  require 'ruby_llm'
5
4
 
6
5
  module Pikuri
@@ -26,7 +25,11 @@ module Pikuri
26
25
  #
27
26
  # == Truncation rules
28
27
  #
29
- # Two independent limits, whichever fires first wins:
28
+ # The line/byte windowing is delegated to
29
+ # {Pikuri::FileType.read_as_text_paged}, which returns a
30
+ # {Pikuri::Extractor::Page} this tool renders; the same windower
31
+ # backs +VectorDb::Tools::Read+. Two independent limits, whichever fires
32
+ # first wins:
30
33
  #
31
34
  # * *Line limit* — {DEFAULT_LIMIT} lines (overridable via +limit+).
32
35
  # * *Byte cap* — {MAX_BYTES} bytes of input content; not exposed as a
@@ -34,25 +37,30 @@ module Pikuri
34
37
  #
35
38
  # Additionally, individual lines longer than {MAX_LINE_LENGTH} chars
36
39
  # are truncated with {LINE_TRUNCATION_MARKER} appended; the model is
37
- # told to reach for +grep+ to find content inside such files.
40
+ # told to reach for +grep+ to find content inside such files. (These
41
+ # constants alias the +PAGE_*+ ones on {Pikuri::Extractor} — one
42
+ # source of truth, shared with +VectorDb::Tools::Read+.)
38
43
  #
39
- # == PDF extraction
44
+ # == PDF (and other extracted formats)
40
45
  #
41
- # PDFs are detected by their +%PDF-+ magic prefix in the sample bytes
42
- # and routed to {.format_pdf} instead of the binary-refusal path. The
43
- # extractor walks pages lazily via +pdf-reader+, emitting one synthetic
44
- # +"--- Page N ---"+ header line per page followed by that page's text.
45
- # The offset / limit / MAX_BYTES contract is identical to the text
46
- # path — extraction stops as soon as the line or byte cap is hit, so
47
- # reading the first window of a 500-page PDF only parses the few pages
48
- # needed. Line numbers in PDF output are for citation back to the user
49
- # only; PDFs are not editable through {Edit}.
46
+ # Which formats read as text is the {Pikuri::Extractor} registry's
47
+ # business, not this tool's: with pikuri-pdf's extractor
48
+ # registered, PDFs are claimed by their +%PDF-+ magic prefix ahead
49
+ # of the binary refusal and extracted with one synthetic
50
+ # +"--- Page N ---"+ header line per page (see
51
+ # +Pikuri::Extractors::PDF+); a gem plugging another extractor
52
+ # into the registry extends this tool for free. Extraction is lazy
53
+ # where the format allows (+extract_lines+): reading the first
54
+ # window of a 500-page PDF parses only the pages the window needs.
55
+ # Formats without a lazy line shape (HTML) are extracted in full
56
+ # and then windowed. Line numbers in PDF output are for citation
57
+ # back to the user only; PDFs are not editable through {Edit}.
50
58
  #
51
59
  # PDFs with no extractable text (scanned images, empty documents) come
52
60
  # back with an LLM-actionable hint string rather than an empty
53
61
  # observation. Encrypted / malformed / XFA-form PDFs surface as
54
- # +"Error: cannot extract PDF text: ..."+ — same convention as other
55
- # tool errors the model can react to. No OCR.
62
+ # +"Error: ..."+ — same convention as other tool errors the model
63
+ # can react to. No OCR.
56
64
  #
57
65
  # == Image attachments
58
66
  #
@@ -90,32 +98,38 @@ module Pikuri
90
98
  # * Image larger than {MAX_IMAGE_BYTES} → +"Error: image too large…"+,
91
99
  # leaving the model to pick a different file or ask the user to
92
100
  # resize.
93
- # * Binary content → {Pikuri::FileType.binary?} on the sample; any
94
- # +NUL+ byte or a sample dense in control characters triggers
95
- # refusal. Catches archives and compiled artifacts without an
96
- # extension list to maintain. PDFs and supported images are
97
- # intercepted by their respective magic-byte checks via
98
- # {Pikuri::FileType.detect_mime} before the binary sniff — see
99
- # above.
101
+ # * Binary content → nothing in the {Pikuri::Extractor} registry
102
+ # claims it ({Pikuri::Extractor::Passthrough} declines on the
103
+ # {Pikuri::FileType.binary?} heuristic: any +NUL+ byte or a
104
+ # sample dense in control characters). Catches archives and
105
+ # compiled artifacts without an extension list to maintain.
106
+ # Registered extractors (pikuri-pdf's PDF, pikuri-extractors'
107
+ # office formats) claim their bytes ahead of that refusal;
108
+ # images are intercepted here via {Pikuri::FileType.detect_mime}
109
+ # before extraction is attempted — see above.
100
110
  # * Offset past EOF → +"Error: offset N is beyond end of file (M lines total)"+.
101
111
  class Read < Pikuri::Tool
112
+ # The windowing constants live on {Pikuri::Extractor} (shared
113
+ # with +VectorDb::Tools::Read+); these aliases keep the names this tool's
114
+ # description and specs reference pointing at the single source.
115
+
102
116
  # @return [Integer] default value of the +limit+ parameter (number
103
117
  # of lines to read per call).
104
- DEFAULT_LIMIT = 2000
118
+ DEFAULT_LIMIT = Pikuri::Extractor::PAGE_DEFAULT_LIMIT
105
119
 
106
120
  # @return [Integer] per-line character cap; longer lines are
107
121
  # truncated with {LINE_TRUNCATION_MARKER}.
108
- MAX_LINE_LENGTH = 2000
122
+ MAX_LINE_LENGTH = Pikuri::Extractor::PAGE_MAX_LINE_LENGTH
109
123
 
110
124
  # @return [String] suffix appended to lines truncated by
111
125
  # {MAX_LINE_LENGTH}.
112
- LINE_TRUNCATION_MARKER = "... (line truncated to #{MAX_LINE_LENGTH} chars)"
126
+ LINE_TRUNCATION_MARKER = Pikuri::Extractor::PAGE_LINE_TRUNCATION_MARKER
113
127
 
114
128
  # @return [Integer] hard byte cap on input content collected per
115
129
  # call. Counted on the line bytes (plus one for the joining
116
130
  # newline); the rendered output is slightly larger due to the
117
131
  # per-line +"%6d\t"+ prefix.
118
- MAX_BYTES = 50 * 1024
132
+ MAX_BYTES = Pikuri::Extractor::PAGE_MAX_BYTES
119
133
 
120
134
  # @return [String] human-readable form of {MAX_BYTES} for the
121
135
  # continuation marker.
@@ -210,168 +224,78 @@ module Pikuri
210
224
  return "Error: #{path} is a directory; use the glob tool to list files." if resolved.directory?
211
225
 
212
226
  mime = Pikuri::FileType.detect_mime(resolved)
213
-
214
- return format_pdf(path: path, resolved: resolved, offset: offset, limit: limit) if mime == 'application/pdf'
215
227
  return format_image(path: path, resolved: resolved, mime: mime) if mime&.start_with?('image/')
216
- return "Error: cannot read binary file: #{path}" if Pikuri::FileType.binary?(resolved)
217
228
 
218
- format_slice(path: path, resolved: resolved, offset: offset, limit: limit)
229
+ page = Pikuri::FileType.read_as_text_paged(
230
+ resolved, offset: offset, limit: limit,
231
+ max_bytes: MAX_BYTES, max_line_length: MAX_LINE_LENGTH
232
+ )
233
+ render_page(page)
219
234
  rescue Filesystem::Error => e
220
235
  "Error: #{e.message}"
221
236
  rescue Errno::EACCES => e
222
237
  "Error: cannot read #{path}: #{e.message}"
238
+ rescue ArgumentError
239
+ # Nothing in the Extractor registry claimed the content —
240
+ # read_as_text_paged's binary refusal (directories and images
241
+ # were already handled above).
242
+ "Error: cannot read binary file: #{path}"
243
+ rescue RuntimeError => e
244
+ # Extraction failure (malformed / unsupported PDF, ...)
245
+ # surfaced by read_as_text_paged.
246
+ "Error: #{e.message}"
223
247
  end
224
248
 
225
- # Stream the file line-by-line, collect at most +limit+ lines
226
- # starting at +offset+, and stop early if {MAX_BYTES} is reached.
227
- # We keep counting lines past the collection window so the trailer
228
- # can report total line count when the line limit (not the byte
229
- # cap) was the stopping criterion — same trick opencode uses.
249
+ # Render a {Pikuri::Extractor::Page} as the cat-n observation: a
250
+ # six-column line number, a tab, then the (already-truncated)
251
+ # content, followed by a trailer that tells the model whether to
252
+ # page on. PDF pages carry +"--- Page N ---"+ marker lines from
253
+ # the extractor; the +kind+ only changes trailer wording here.
230
254
  #
255
+ # @param page [Pikuri::Extractor::Page]
231
256
  # @return [String]
232
- def self.format_slice(path:, resolved:, offset:, limit:)
233
- start_index = offset - 1
234
- collected = []
235
- total_lines = 0
236
- bytes = 0
237
- byte_cap_hit = false
238
- has_more = false
239
-
240
- resolved.each_line do |raw|
241
- total_lines += 1
242
- next if total_lines <= start_index
243
-
244
- if collected.length >= limit
245
- has_more = true
246
- next
247
- end
248
-
249
- line = raw.chomp
250
- if line.length > MAX_LINE_LENGTH
251
- line = line[0, MAX_LINE_LENGTH] + LINE_TRUNCATION_MARKER
252
- end
253
-
254
- size = line.bytesize + 1 # +1 for the joining newline
255
- if bytes + size > MAX_BYTES
256
- byte_cap_hit = true
257
- has_more = true
258
- break
259
- end
260
-
261
- collected << line
262
- bytes += size
263
- end
264
-
265
- return '(Empty file)' if total_lines.zero?
257
+ def self.render_page(page)
258
+ if page.lines.empty?
259
+ return empty_message(page) if page.total_lines.zero?
266
260
 
267
- if start_index >= total_lines
268
- return "Error: offset #{offset} is beyond end of file (#{total_lines} lines total)"
261
+ return "Error: offset #{page.start_line} is beyond end of file " \
262
+ "(#{page.total_lines} lines total)"
269
263
  end
270
264
 
271
- last_line = offset + collected.length - 1
272
- body = collected.each_with_index.map { |line, i| format("%6d\t%s", i + offset, line) }.join("\n")
265
+ noun = page.kind == :pdf ? 'PDF ' : ''
266
+ last = page.start_line + page.lines.length - 1
267
+ body = page.lines.each_with_index.map { |line, i| format("%6d\t%s", i + page.start_line, line) }.join("\n")
273
268
 
274
269
  trailer =
275
- if byte_cap_hit
276
- "(Output capped at #{MAX_BYTES_LABEL}. Showing lines #{offset}-#{last_line}. " \
277
- "Use offset=#{last_line + 1} to continue.)"
278
- elsif has_more
279
- "(Showing lines #{offset}-#{last_line} of #{total_lines}. " \
280
- "Use offset=#{last_line + 1} to continue.)"
270
+ if page.byte_capped
271
+ "(Output capped at #{MAX_BYTES_LABEL}. Showing #{noun}lines #{page.start_line}-#{last}. " \
272
+ "Use offset=#{last + 1} to continue.)"
273
+ elsif page.more
274
+ total = page.total_lines ? " of #{page.total_lines}" : ''
275
+ "(Showing #{noun}lines #{page.start_line}-#{last}#{total}. " \
276
+ "Use offset=#{last + 1} to continue.)"
281
277
  else
282
- "(End of file - total #{total_lines} lines)"
278
+ "(End of #{page.kind == :pdf ? 'PDF' : 'file'} - total #{page.total_lines} lines)"
283
279
  end
284
280
 
285
281
  "#{body}\n\n#{trailer}"
286
282
  end
287
- private_class_method :format_slice
283
+ private_class_method :render_page
288
284
 
289
- # PDF counterpart to {.format_slice}: walk +pdf-reader+'s lazy page
290
- # iterator, emit a +"--- Page N ---"+ header followed by each line
291
- # of the page's extracted text, and apply the same offset / limit /
292
- # MAX_BYTES contract. Stops parsing as soon as the cap is hit so a
293
- # 500-page PDF only touches the few pages needed for the requested
294
- # window — the file handle stays open inside the +throw :done+
295
- # block, which short-circuits both the inner line loop and the
296
- # outer page loop in one move.
297
- #
298
- # The +has_more+ trailer here cannot quote a total line count the
299
- # way text files do — we'd have to parse every page just to count.
300
- # Instead we drop the "of N lines" claim and stick to the
301
- # next-offset hint, which is all the model needs to page.
285
+ # The empty-document message, worded by content kind: a scanned /
286
+ # text-free PDF gets an LLM-actionable hint rather than the
287
+ # plain-file "(Empty file)".
302
288
  #
289
+ # @param page [Pikuri::Extractor::Page]
303
290
  # @return [String]
304
- def self.format_pdf(path:, resolved:, offset:, limit:)
305
- start_index = offset - 1
306
- collected = []
307
- total_lines = 0
308
- bytes = 0
309
- byte_cap_hit = false
310
- has_more = false
311
-
312
- catch(:done) do
313
- resolved.open('rb') do |io|
314
- reader = ::PDF::Reader.new(io)
315
- reader.pages.each_with_index do |page, idx|
316
- text = page.text.strip
317
- next if text.empty?
318
-
319
- page_lines = ["--- Page #{idx + 1} ---", *text.split("\n")]
320
- page_lines.each do |raw|
321
- total_lines += 1
322
- next if total_lines <= start_index
323
-
324
- if collected.length >= limit
325
- has_more = true
326
- throw :done
327
- end
328
-
329
- line = raw
330
- if line.length > MAX_LINE_LENGTH
331
- line = line[0, MAX_LINE_LENGTH] + LINE_TRUNCATION_MARKER
332
- end
333
-
334
- size = line.bytesize + 1
335
- if bytes + size > MAX_BYTES
336
- byte_cap_hit = true
337
- has_more = true
338
- throw :done
339
- end
340
-
341
- collected << line
342
- bytes += size
343
- end
344
- end
345
- end
346
- end
347
-
348
- return '(PDF has no extractable text; likely scanned image content)' if total_lines.zero?
349
-
350
- if start_index >= total_lines
351
- return "Error: offset #{offset} is beyond end of file (#{total_lines} lines total)"
291
+ def self.empty_message(page)
292
+ if page.kind == :pdf
293
+ '(PDF has no extractable text; likely scanned image content)'
294
+ else
295
+ '(Empty file)'
352
296
  end
353
-
354
- last_line = offset + collected.length - 1
355
- body = collected.each_with_index.map { |line, i| format("%6d\t%s", i + offset, line) }.join("\n")
356
-
357
- trailer =
358
- if byte_cap_hit
359
- "(Output capped at #{MAX_BYTES_LABEL}. Showing PDF lines #{offset}-#{last_line}. " \
360
- "Use offset=#{last_line + 1} to continue.)"
361
- elsif has_more
362
- "(Showing PDF lines #{offset}-#{last_line}. " \
363
- "Use offset=#{last_line + 1} to continue.)"
364
- else
365
- "(End of PDF - total #{total_lines} lines)"
366
- end
367
-
368
- "#{body}\n\n#{trailer}"
369
- rescue ::PDF::Reader::MalformedPDFError,
370
- ::PDF::Reader::InvalidPageError,
371
- ::PDF::Reader::UnsupportedFeatureError => e
372
- "Error: cannot extract PDF text from #{path}: #{e.class.name.split('::').last}: #{e.message}"
373
297
  end
374
- private_class_method :format_pdf
298
+ private_class_method :empty_message
375
299
 
376
300
  # Build a multimodal observation: a short metadata note ("Read
377
301
  # image: …") plus the file itself attached as a path. The model
@@ -23,8 +23,9 @@ module Pikuri
23
23
  LOADER.eager_load
24
24
 
25
25
  # Reap +~/.cache/pikuri/workspace-*+ leftovers from sessions
26
- # killed before their +at_exit+ could fire. Best-effort; runs once
27
- # at gem load. See {Filesystem.sweep_stale_internal_temps!}.
26
+ # killed before the {Pikuri::Finalizers} removal could fire.
27
+ # Best-effort; runs once at gem load. See
28
+ # {Filesystem.sweep_stale_internal_temps!}.
28
29
  Filesystem.sweep_stale_internal_temps!
29
30
  end
30
31
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pikuri-workspace
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Vysny
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-05-29 00:00:00.000000000 Z
11
+ date: 2026-06-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: pikuri-core
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - '='
18
18
  - !ruby/object:Gem::Version
19
- version: 0.0.4
19
+ version: 0.0.6
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - '='
25
25
  - !ruby/object:Gem::Version
26
- version: 0.0.4
26
+ version: 0.0.6
27
27
  description: |
28
28
  pikuri-workspace adds "operate on a directory tree" to pikuri-core
29
29
  agents: the +Pikuri::Workspace::Filesystem+ class that scopes