pikuri-workspace 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/pikuri/workspace/read.rb +41 -36
  3. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: afd83aa622997eea8b09c700282cf401753d1925ace04a0c4ade3856c07ee8b4
4
- data.tar.gz: 4cd46c7de8d42112e1119e68bab0ce7937ed9e5c270a29ceb34e5089178abd4c
3
+ metadata.gz: 52db0d4ce078507aa4a72cbc84d54a9433788f2a2d9a90ab883a9e469e6dea99
4
+ data.tar.gz: 107b34e1c3b387b4b68d542ff8ab2345885383a461c227a6b7aeb29973a4292e
5
5
  SHA512:
6
- metadata.gz: c818b87a2aca2f0f615d084cc7c6e0457d0a84e5cd5be8b061c36aba50a071ad2565c27170d33ae25ca810dee8a25b16b1a13b25477b21e1c00eb79d067f2bb0
7
- data.tar.gz: 7277b1ba878546589d4107136f4d49e3dc68b0ee5aff80d3eeff8e06f9d97b4fa569eb8caf186337e39c72e9f33812e3963fd433eeb35f926f82bb6b681bb579
6
+ metadata.gz: 432af6cfc0a0f3555666e9c88accb0e9b6162af2c5f041c9ff71b10443f1681b8e70b9e46aa6c75ed12344357a286df087869b889f0a40aeb9635aa6c9a1e651
7
+ data.tar.gz: 362eb9437127e8734f15e09919ae7fb928e0d1b71fc9bb90a78f419cb7b4f52f29aebd323dd32e7bd491b0a73a3de60cff54a91e1756f24bcdc4e91d52de5fc2
@@ -27,7 +27,7 @@ module Pikuri
27
27
  #
28
28
  # The line/byte windowing is delegated to
29
29
  # {Pikuri::FileType.read_as_text_paged}, which returns a
30
- # {Pikuri::FileType::Page} this tool renders; the same windower
30
+ # {Pikuri::Extractor::Page} this tool renders; the same windower
31
31
  # backs +VectorDb::Tools::Read+. Two independent limits, whichever fires
32
32
  # first wins:
33
33
  #
@@ -38,27 +38,29 @@ module Pikuri
38
38
  # Additionally, individual lines longer than {MAX_LINE_LENGTH} chars
39
39
  # are truncated with {LINE_TRUNCATION_MARKER} appended; the model is
40
40
  # told to reach for +grep+ to find content inside such files. (These
41
- # constants alias the +PAGE_*+ ones on {Pikuri::FileType} — one
41
+ # constants alias the +PAGE_*+ ones on {Pikuri::Extractor} — one
42
42
  # source of truth, shared with +VectorDb::Tools::Read+.)
43
43
  #
44
- # == PDF extraction
44
+ # == PDF (and other extracted formats)
45
45
  #
46
- # PDFs are detected by their +%PDF-+ magic prefix in the sample bytes
47
- # and routed through {Pikuri::FileType.read_as_text_paged} instead of
48
- # the binary-refusal path. The extractor walks pages lazily via
49
- # +pdf-reader+, emitting one synthetic +"--- Page N ---"+ header line
50
- # per page followed by that page's text. The offset / limit /
51
- # MAX_BYTES contract is identical to the text path — extraction stops
52
- # as soon as the line or byte cap is hit, so reading the first window
53
- # of a 500-page PDF only parses the few pages needed. Line numbers in
54
- # PDF output are for citation back to the user only; PDFs are not
55
- # editable through {Edit}.
46
+ # Which formats read as text is the {Pikuri::Extractor} registry's
47
+ # business, not this tool's: with pikuri-pdf's extractor
48
+ # registered, PDFs are claimed by their +%PDF-+ magic prefix ahead
49
+ # of the binary refusal and extracted with one synthetic
50
+ # +"--- Page N ---"+ header line per page (see
51
+ # +Pikuri::Extractors::PDF+); a gem plugging another extractor
52
+ # into the registry extends this tool for free. Extraction is lazy
53
+ # where the format allows (+extract_lines+): reading the first
54
+ # window of a 500-page PDF parses only the pages the window needs.
55
+ # Formats without a lazy line shape (HTML) are extracted in full
56
+ # and then windowed. Line numbers in PDF output are for citation
57
+ # back to the user only; PDFs are not editable through {Edit}.
56
58
  #
57
59
  # PDFs with no extractable text (scanned images, empty documents) come
58
60
  # back with an LLM-actionable hint string rather than an empty
59
61
  # observation. Encrypted / malformed / XFA-form PDFs surface as
60
- # +"Error: cannot extract PDF text: ..."+ — same convention as other
61
- # tool errors the model can react to. No OCR.
62
+ # +"Error: ..."+ — same convention as other tool errors the model
63
+ # can react to. No OCR.
62
64
  #
63
65
  # == Image attachments
64
66
  #
@@ -96,36 +98,38 @@ module Pikuri
96
98
  # * Image larger than {MAX_IMAGE_BYTES} → +"Error: image too large…"+,
97
99
  # leaving the model to pick a different file or ask the user to
98
100
  # resize.
99
- # * Binary content → {Pikuri::FileType.binary?} on the sample; any
100
- # +NUL+ byte or a sample dense in control characters triggers
101
- # refusal. Catches archives and compiled artifacts without an
102
- # extension list to maintain. PDFs and supported images are
103
- # intercepted by their respective magic-byte checks via
104
- # {Pikuri::FileType.detect_mime} before the binary sniff — see
105
- # above.
101
+ # * Binary content → nothing in the {Pikuri::Extractor} registry
102
+ # claims it ({Pikuri::Extractor::Passthrough} declines on the
103
+ # {Pikuri::FileType.binary?} heuristic: any +NUL+ byte or a
104
+ # sample dense in control characters). Catches archives and
105
+ # compiled artifacts without an extension list to maintain.
106
+ # Registered extractors (pikuri-pdf's PDF, pikuri-extractors'
107
+ # office formats) claim their bytes ahead of that refusal;
108
+ # images are intercepted here via {Pikuri::FileType.detect_mime}
109
+ # before extraction is attempted — see above.
106
110
  # * Offset past EOF → +"Error: offset N is beyond end of file (M lines total)"+.
107
111
  class Read < Pikuri::Tool
108
- # The windowing constants live on {Pikuri::FileType} now (shared
112
+ # The windowing constants live on {Pikuri::Extractor} (shared
109
113
  # with +VectorDb::Tools::Read+); these aliases keep the names this tool's
110
114
  # description and specs reference pointing at the single source.
111
115
 
112
116
  # @return [Integer] default value of the +limit+ parameter (number
113
117
  # of lines to read per call).
114
- DEFAULT_LIMIT = Pikuri::FileType::PAGE_DEFAULT_LIMIT
118
+ DEFAULT_LIMIT = Pikuri::Extractor::PAGE_DEFAULT_LIMIT
115
119
 
116
120
  # @return [Integer] per-line character cap; longer lines are
117
121
  # truncated with {LINE_TRUNCATION_MARKER}.
118
- MAX_LINE_LENGTH = Pikuri::FileType::PAGE_MAX_LINE_LENGTH
122
+ MAX_LINE_LENGTH = Pikuri::Extractor::PAGE_MAX_LINE_LENGTH
119
123
 
120
124
  # @return [String] suffix appended to lines truncated by
121
125
  # {MAX_LINE_LENGTH}.
122
- LINE_TRUNCATION_MARKER = Pikuri::FileType::PAGE_LINE_TRUNCATION_MARKER
126
+ LINE_TRUNCATION_MARKER = Pikuri::Extractor::PAGE_LINE_TRUNCATION_MARKER
123
127
 
124
128
  # @return [Integer] hard byte cap on input content collected per
125
129
  # call. Counted on the line bytes (plus one for the joining
126
130
  # newline); the rendered output is slightly larger due to the
127
131
  # per-line +"%6d\t"+ prefix.
128
- MAX_BYTES = Pikuri::FileType::PAGE_MAX_BYTES
132
+ MAX_BYTES = Pikuri::Extractor::PAGE_MAX_BYTES
129
133
 
130
134
  # @return [String] human-readable form of {MAX_BYTES} for the
131
135
  # continuation marker.
@@ -221,11 +225,6 @@ module Pikuri
221
225
 
222
226
  mime = Pikuri::FileType.detect_mime(resolved)
223
227
  return format_image(path: path, resolved: resolved, mime: mime) if mime&.start_with?('image/')
224
- # PDFs are binary by the heuristic, so the PDF route (handled
225
- # inside read_as_text_paged) must win over the binary refusal.
226
- if mime != 'application/pdf' && Pikuri::FileType.binary?(resolved)
227
- return "Error: cannot read binary file: #{path}"
228
- end
229
228
 
230
229
  page = Pikuri::FileType.read_as_text_paged(
231
230
  resolved, offset: offset, limit: limit,
@@ -236,18 +235,24 @@ module Pikuri
236
235
  "Error: #{e.message}"
237
236
  rescue Errno::EACCES => e
238
237
  "Error: cannot read #{path}: #{e.message}"
238
+ rescue ArgumentError
239
+ # Nothing in the Extractor registry claimed the content —
240
+ # read_as_text_paged's binary refusal (directories and images
241
+ # were already handled above).
242
+ "Error: cannot read binary file: #{path}"
239
243
  rescue RuntimeError => e
240
- # Malformed / unsupported PDF surfaced by read_as_text_paged.
244
+ # Extraction failure (malformed / unsupported PDF, ...)
245
+ # surfaced by read_as_text_paged.
241
246
  "Error: #{e.message}"
242
247
  end
243
248
 
244
- # Render a {Pikuri::FileType::Page} as the cat-n observation: a
249
+ # Render a {Pikuri::Extractor::Page} as the cat-n observation: a
245
250
  # six-column line number, a tab, then the (already-truncated)
246
251
  # content, followed by a trailer that tells the model whether to
247
252
  # page on. PDF pages carry +"--- Page N ---"+ marker lines from
248
253
  # the extractor; the +kind+ only changes trailer wording here.
249
254
  #
250
- # @param page [Pikuri::FileType::Page]
255
+ # @param page [Pikuri::Extractor::Page]
251
256
  # @return [String]
252
257
  def self.render_page(page)
253
258
  if page.lines.empty?
@@ -281,7 +286,7 @@ module Pikuri
281
286
  # text-free PDF gets an LLM-actionable hint rather than the
282
287
  # plain-file "(Empty file)".
283
288
  #
284
- # @param page [Pikuri::FileType::Page]
289
+ # @param page [Pikuri::Extractor::Page]
285
290
  # @return [String]
286
291
  def self.empty_message(page)
287
292
  if page.kind == :pdf
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pikuri-workspace
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Vysny
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - '='
18
18
  - !ruby/object:Gem::Version
19
- version: 0.0.5
19
+ version: 0.0.6
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - '='
25
25
  - !ruby/object:Gem::Version
26
- version: 0.0.5
26
+ version: 0.0.6
27
27
  description: |
28
28
  pikuri-workspace adds "operate on a directory tree" to pikuri-core
29
29
  agents: the +Pikuri::Workspace::Filesystem+ class that scopes