pikuri-workspace 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pikuri/workspace/read.rb +41 -36
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 52db0d4ce078507aa4a72cbc84d54a9433788f2a2d9a90ab883a9e469e6dea99
|
|
4
|
+
data.tar.gz: 107b34e1c3b387b4b68d542ff8ab2345885383a461c227a6b7aeb29973a4292e
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 432af6cfc0a0f3555666e9c88accb0e9b6162af2c5f041c9ff71b10443f1681b8e70b9e46aa6c75ed12344357a286df087869b889f0a40aeb9635aa6c9a1e651
|
|
7
|
+
data.tar.gz: 362eb9437127e8734f15e09919ae7fb928e0d1b71fc9bb90a78f419cb7b4f52f29aebd323dd32e7bd491b0a73a3de60cff54a91e1756f24bcdc4e91d52de5fc2
|
|
@@ -27,7 +27,7 @@ module Pikuri
|
|
|
27
27
|
#
|
|
28
28
|
# The line/byte windowing is delegated to
|
|
29
29
|
# {Pikuri::FileType.read_as_text_paged}, which returns a
|
|
30
|
-
# {Pikuri::
|
|
30
|
+
# {Pikuri::Extractor::Page} this tool renders; the same windower
|
|
31
31
|
# backs +VectorDb::Tools::Read+. Two independent limits, whichever fires
|
|
32
32
|
# first wins:
|
|
33
33
|
#
|
|
@@ -38,27 +38,29 @@ module Pikuri
|
|
|
38
38
|
# Additionally, individual lines longer than {MAX_LINE_LENGTH} chars
|
|
39
39
|
# are truncated with {LINE_TRUNCATION_MARKER} appended; the model is
|
|
40
40
|
# told to reach for +grep+ to find content inside such files. (These
|
|
41
|
-
# constants alias the +PAGE_*+ ones on {Pikuri::
|
|
41
|
+
# constants alias the +PAGE_*+ ones on {Pikuri::Extractor} — one
|
|
42
42
|
# source of truth, shared with +VectorDb::Tools::Read+.)
|
|
43
43
|
#
|
|
44
|
-
# == PDF
|
|
44
|
+
# == PDF (and other extracted formats)
|
|
45
45
|
#
|
|
46
|
-
#
|
|
47
|
-
#
|
|
48
|
-
#
|
|
49
|
-
#
|
|
50
|
-
#
|
|
51
|
-
#
|
|
52
|
-
#
|
|
53
|
-
#
|
|
54
|
-
#
|
|
55
|
-
#
|
|
46
|
+
# Which formats read as text is the {Pikuri::Extractor} registry's
|
|
47
|
+
# business, not this tool's: with pikuri-pdf's extractor
|
|
48
|
+
# registered, PDFs are claimed by their +%PDF-+ magic prefix ahead
|
|
49
|
+
# of the binary refusal and extracted with one synthetic
|
|
50
|
+
# +"--- Page N ---"+ header line per page (see
|
|
51
|
+
# +Pikuri::Extractors::PDF+); a gem plugging another extractor
|
|
52
|
+
# into the registry extends this tool for free. Extraction is lazy
|
|
53
|
+
# where the format allows (+extract_lines+): reading the first
|
|
54
|
+
# window of a 500-page PDF parses only the pages the window needs.
|
|
55
|
+
# Formats without a lazy line shape (HTML) are extracted in full
|
|
56
|
+
# and then windowed. Line numbers in PDF output are for citation
|
|
57
|
+
# back to the user only; PDFs are not editable through {Edit}.
|
|
56
58
|
#
|
|
57
59
|
# PDFs with no extractable text (scanned images, empty documents) come
|
|
58
60
|
# back with an LLM-actionable hint string rather than an empty
|
|
59
61
|
# observation. Encrypted / malformed / XFA-form PDFs surface as
|
|
60
|
-
# +"Error:
|
|
61
|
-
#
|
|
62
|
+
# +"Error: ..."+ — same convention as other tool errors the model
|
|
63
|
+
# can react to. No OCR.
|
|
62
64
|
#
|
|
63
65
|
# == Image attachments
|
|
64
66
|
#
|
|
@@ -96,36 +98,38 @@ module Pikuri
|
|
|
96
98
|
# * Image larger than {MAX_IMAGE_BYTES} → +"Error: image too large…"+,
|
|
97
99
|
# leaving the model to pick a different file or ask the user to
|
|
98
100
|
# resize.
|
|
99
|
-
# * Binary content → {Pikuri::
|
|
100
|
-
#
|
|
101
|
-
#
|
|
102
|
-
#
|
|
103
|
-
#
|
|
104
|
-
#
|
|
105
|
-
#
|
|
101
|
+
# * Binary content → nothing in the {Pikuri::Extractor} registry
|
|
102
|
+
# claims it ({Pikuri::Extractor::Passthrough} declines on the
|
|
103
|
+
# {Pikuri::FileType.binary?} heuristic: any +NUL+ byte or a
|
|
104
|
+
# sample dense in control characters). Catches archives and
|
|
105
|
+
# compiled artifacts without an extension list to maintain.
|
|
106
|
+
# Registered extractors (pikuri-pdf's PDF, pikuri-extractors'
|
|
107
|
+
# office formats) claim their bytes ahead of that refusal;
|
|
108
|
+
# images are intercepted here via {Pikuri::FileType.detect_mime}
|
|
109
|
+
# before extraction is attempted — see above.
|
|
106
110
|
# * Offset past EOF → +"Error: offset N is beyond end of file (M lines total)"+.
|
|
107
111
|
class Read < Pikuri::Tool
|
|
108
|
-
# The windowing constants live on {Pikuri::
|
|
112
|
+
# The windowing constants live on {Pikuri::Extractor} (shared
|
|
109
113
|
# with +VectorDb::Tools::Read+); these aliases keep the names this tool's
|
|
110
114
|
# description and specs reference pointing at the single source.
|
|
111
115
|
|
|
112
116
|
# @return [Integer] default value of the +limit+ parameter (number
|
|
113
117
|
# of lines to read per call).
|
|
114
|
-
DEFAULT_LIMIT = Pikuri::
|
|
118
|
+
DEFAULT_LIMIT = Pikuri::Extractor::PAGE_DEFAULT_LIMIT
|
|
115
119
|
|
|
116
120
|
# @return [Integer] per-line character cap; longer lines are
|
|
117
121
|
# truncated with {LINE_TRUNCATION_MARKER}.
|
|
118
|
-
MAX_LINE_LENGTH = Pikuri::
|
|
122
|
+
MAX_LINE_LENGTH = Pikuri::Extractor::PAGE_MAX_LINE_LENGTH
|
|
119
123
|
|
|
120
124
|
# @return [String] suffix appended to lines truncated by
|
|
121
125
|
# {MAX_LINE_LENGTH}.
|
|
122
|
-
LINE_TRUNCATION_MARKER = Pikuri::
|
|
126
|
+
LINE_TRUNCATION_MARKER = Pikuri::Extractor::PAGE_LINE_TRUNCATION_MARKER
|
|
123
127
|
|
|
124
128
|
# @return [Integer] hard byte cap on input content collected per
|
|
125
129
|
# call. Counted on the line bytes (plus one for the joining
|
|
126
130
|
# newline); the rendered output is slightly larger due to the
|
|
127
131
|
# per-line +"%6d\t"+ prefix.
|
|
128
|
-
MAX_BYTES = Pikuri::
|
|
132
|
+
MAX_BYTES = Pikuri::Extractor::PAGE_MAX_BYTES
|
|
129
133
|
|
|
130
134
|
# @return [String] human-readable form of {MAX_BYTES} for the
|
|
131
135
|
# continuation marker.
|
|
@@ -221,11 +225,6 @@ module Pikuri
|
|
|
221
225
|
|
|
222
226
|
mime = Pikuri::FileType.detect_mime(resolved)
|
|
223
227
|
return format_image(path: path, resolved: resolved, mime: mime) if mime&.start_with?('image/')
|
|
224
|
-
# PDFs are binary by the heuristic, so the PDF route (handled
|
|
225
|
-
# inside read_as_text_paged) must win over the binary refusal.
|
|
226
|
-
if mime != 'application/pdf' && Pikuri::FileType.binary?(resolved)
|
|
227
|
-
return "Error: cannot read binary file: #{path}"
|
|
228
|
-
end
|
|
229
228
|
|
|
230
229
|
page = Pikuri::FileType.read_as_text_paged(
|
|
231
230
|
resolved, offset: offset, limit: limit,
|
|
@@ -236,18 +235,24 @@ module Pikuri
|
|
|
236
235
|
"Error: #{e.message}"
|
|
237
236
|
rescue Errno::EACCES => e
|
|
238
237
|
"Error: cannot read #{path}: #{e.message}"
|
|
238
|
+
rescue ArgumentError
|
|
239
|
+
# Nothing in the Extractor registry claimed the content —
|
|
240
|
+
# read_as_text_paged's binary refusal (directories and images
|
|
241
|
+
# were already handled above).
|
|
242
|
+
"Error: cannot read binary file: #{path}"
|
|
239
243
|
rescue RuntimeError => e
|
|
240
|
-
#
|
|
244
|
+
# Extraction failure (malformed / unsupported PDF, ...)
|
|
245
|
+
# surfaced by read_as_text_paged.
|
|
241
246
|
"Error: #{e.message}"
|
|
242
247
|
end
|
|
243
248
|
|
|
244
|
-
# Render a {Pikuri::
|
|
249
|
+
# Render a {Pikuri::Extractor::Page} as the cat-n observation: a
|
|
245
250
|
# six-column line number, a tab, then the (already-truncated)
|
|
246
251
|
# content, followed by a trailer that tells the model whether to
|
|
247
252
|
# page on. PDF pages carry +"--- Page N ---"+ marker lines from
|
|
248
253
|
# the extractor; the +kind+ only changes trailer wording here.
|
|
249
254
|
#
|
|
250
|
-
# @param page [Pikuri::
|
|
255
|
+
# @param page [Pikuri::Extractor::Page]
|
|
251
256
|
# @return [String]
|
|
252
257
|
def self.render_page(page)
|
|
253
258
|
if page.lines.empty?
|
|
@@ -281,7 +286,7 @@ module Pikuri
|
|
|
281
286
|
# text-free PDF gets an LLM-actionable hint rather than the
|
|
282
287
|
# plain-file "(Empty file)".
|
|
283
288
|
#
|
|
284
|
-
# @param page [Pikuri::
|
|
289
|
+
# @param page [Pikuri::Extractor::Page]
|
|
285
290
|
# @return [String]
|
|
286
291
|
def self.empty_message(page)
|
|
287
292
|
if page.kind == :pdf
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: pikuri-workspace
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.5
|
|
4
|
+
version: 0.0.6
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Martin Vysny
|
|
@@ -16,14 +16,14 @@ dependencies:
|
|
|
16
16
|
requirements:
|
|
17
17
|
- - '='
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: 0.0.
|
|
19
|
+
version: 0.0.6
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
24
|
- - '='
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: 0.0.
|
|
26
|
+
version: 0.0.6
|
|
27
27
|
description: |
|
|
28
28
|
pikuri-workspace adds "operate on a directory tree" to pikuri-core
|
|
29
29
|
agents: the +Pikuri::Workspace::Filesystem+ class that scopes
|