pikuri-core 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -3
- data/lib/pikuri/agent/chat_transport.rb +135 -11
- data/lib/pikuri/agent/configurator.rb +4 -4
- data/lib/pikuri/agent/context_window_detector.rb +103 -52
- data/lib/pikuri/agent/control/step_limit.rb +39 -7
- data/lib/pikuri/agent/event.rb +43 -16
- data/lib/pikuri/agent/extension.rb +31 -17
- data/lib/pikuri/agent/extension_context.rb +147 -0
- data/lib/pikuri/agent/listener/terminal.rb +30 -37
- data/lib/pikuri/agent/listener/token_log.rb +60 -13
- data/lib/pikuri/agent/listener.rb +12 -5
- data/lib/pikuri/agent/listener_list.rb +7 -17
- data/lib/pikuri/agent/synthesizer.rb +93 -67
- data/lib/pikuri/agent.rb +358 -403
- data/lib/pikuri/extractor/html.rb +303 -0
- data/lib/pikuri/extractor/passthrough.rb +64 -0
- data/lib/pikuri/extractor.rb +314 -0
- data/lib/pikuri/file_type.rb +74 -266
- data/lib/pikuri/sanitizer.rb +179 -0
- data/lib/pikuri/subprocess.rb +73 -2
- data/lib/pikuri/tool/calculator.rb +213 -41
- data/lib/pikuri/tool/fetch.rb +10 -9
- data/lib/pikuri/tool/parameters.rb +65 -2
- data/lib/pikuri/tool/scraper.rb +186 -0
- data/lib/pikuri/tool/search/brave.rb +32 -18
- data/lib/pikuri/tool/search/duckduckgo.rb +18 -7
- data/lib/pikuri/tool/search/engines.rb +72 -49
- data/lib/pikuri/tool/search/exa.rb +34 -22
- data/lib/pikuri/tool/web_scrape.rb +5 -5
- data/lib/pikuri/tool/web_search.rb +45 -26
- data/lib/pikuri/version.rb +1 -1
- data/lib/pikuri-core.rb +11 -10
- metadata +9 -66
- data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
- data/lib/pikuri/tool/scraper/html.rb +0 -285
- data/lib/pikuri/tool/scraper/pdf.rb +0 -54
- data/lib/pikuri/tool/scraper/simple.rb +0 -183
data/lib/pikuri/file_type.rb
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require 'pdf-reader'
|
|
4
|
-
|
|
5
3
|
module Pikuri
|
|
6
|
-
# Magic-byte content sniffing
|
|
7
|
-
# responsibilities:
|
|
4
|
+
# Magic-byte content sniffing, plus the path-aware front over the
|
|
5
|
+
# {Extractor} registry. Two responsibilities:
|
|
8
6
|
#
|
|
9
7
|
# * {.detect_mime} — recognise a file from its leading bytes. Returns
|
|
10
8
|
# a MIME String for formats pikuri knows how to handle specially
|
|
@@ -15,21 +13,16 @@ module Pikuri
|
|
|
15
13
|
# {.detect_mime}: a file can be both recognised (e.g. PDF) *and*
|
|
16
14
|
# binary. {.detect_mime} tells you what the bytes are;
|
|
17
15
|
# {.binary?} tells you whether they're safe to render as text.
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
24
|
-
#
|
|
25
|
-
#
|
|
26
|
-
#
|
|
27
|
-
#
|
|
28
|
-
# +VectorDb::Tools::Read+ so the offset/limit/byte-cap windowing lives
|
|
29
|
-
# in one tested place; each tool keeps its own presentation
|
|
30
|
-
# (cat-n numbering, trailer wording, citation vs. path). Same
|
|
31
|
-
# refusal contract as {.read_as_text} (raises on image / binary
|
|
32
|
-
# / missing / malformed-PDF).
|
|
16
|
+
#
|
|
17
|
+
# On top of those sit the two +Pathname+ conveniences,
|
|
18
|
+
# {.read_as_text} (whole document, the {Pikuri::VectorDb} indexer's
|
|
19
|
+
# shape) and {.read_as_text_paged} (line-windowed, the Read tools'
|
|
20
|
+
# shape). Both are thin wrappers: they own the *path-level* refusals
|
|
21
|
+
# (missing file, directory, image) and the exception mapping, then
|
|
22
|
+
# hand the opened IO to {Extractor.extract} /
|
|
23
|
+
# {Extractor.extract_paged} — which format the bytes are and how
|
|
24
|
+
# they become text is entirely the registry's business, so a gem
|
|
25
|
+
# plugging a new extractor in extends these wrappers for free.
|
|
33
26
|
#
|
|
34
27
|
# {.detect_mime} and {.binary?} accept either a +String+ of bytes
|
|
35
28
|
# (sample taken by the caller) or a +Pathname+ — when given a path,
|
|
@@ -37,8 +30,7 @@ module Pikuri
|
|
|
37
30
|
# for the sniff itself. The Pathname form is the convenience path;
|
|
38
31
|
# the bytes form is for callers that already have the sample or are
|
|
39
32
|
# calling both methods on the same file and want to avoid a second
|
|
40
|
-
# open.
|
|
41
|
-
# bytes-in shortcut because the PDF case needs to seek the file.
|
|
33
|
+
# open.
|
|
42
34
|
#
|
|
43
35
|
# == Why a separate module
|
|
44
36
|
#
|
|
@@ -48,8 +40,7 @@ module Pikuri
|
|
|
48
40
|
# {.binary?} reached for by {Workspace::Edit}. Collecting the
|
|
49
41
|
# detection logic here lets {Read} focus on routing
|
|
50
42
|
# (mime-to-formatter), {Edit} drop its cross-tool reach, and new
|
|
51
|
-
# tools
|
|
52
|
-
# ...) share one set of magic-byte truths.
|
|
43
|
+
# tools share one set of magic-byte truths.
|
|
53
44
|
#
|
|
54
45
|
# == Deliberate non-goals
|
|
55
46
|
#
|
|
@@ -94,58 +85,6 @@ module Pikuri
|
|
|
94
85
|
# with this five-byte ASCII sequence per ISO 32000-1 §7.5.2.
|
|
95
86
|
PDF_MAGIC = '%PDF-'
|
|
96
87
|
|
|
97
|
-
# @return [Integer] default line-window size for
|
|
98
|
-
# {.read_as_text_paged} when the caller omits +limit+.
|
|
99
|
-
PAGE_DEFAULT_LIMIT = 2000
|
|
100
|
-
|
|
101
|
-
# @return [Integer] default hard byte cap on the content collected
|
|
102
|
-
# by a single {.read_as_text_paged} call. Bypassable by paging
|
|
103
|
-
# via +offset+. The rendered output is slightly larger (line
|
|
104
|
-
# numbering, trailer) — that's the caller's concern.
|
|
105
|
-
PAGE_MAX_BYTES = 50 * 1024
|
|
106
|
-
|
|
107
|
-
# @return [Integer] default per-line character cap;
|
|
108
|
-
# {.read_as_text_paged} truncates longer lines and appends
|
|
109
|
-
# {PAGE_LINE_TRUNCATION_MARKER}.
|
|
110
|
-
PAGE_MAX_LINE_LENGTH = 2000
|
|
111
|
-
|
|
112
|
-
# @return [String] suffix appended to a line truncated at
|
|
113
|
-
# {PAGE_MAX_LINE_LENGTH}.
|
|
114
|
-
PAGE_LINE_TRUNCATION_MARKER = "... (line truncated to #{PAGE_MAX_LINE_LENGTH} chars)"
|
|
115
|
-
|
|
116
|
-
# One windowed slice of a document, returned by
|
|
117
|
-
# {.read_as_text_paged}. The caller turns this into an
|
|
118
|
-
# observation; this struct carries everything a trailer needs
|
|
119
|
-
# without the caller re-reading the file.
|
|
120
|
-
#
|
|
121
|
-
# == Fields
|
|
122
|
-
#
|
|
123
|
-
# * +lines+ — +Array<String>+, the collected window. Already
|
|
124
|
-
# per-line truncated (with {PAGE_LINE_TRUNCATION_MARKER}); *not*
|
|
125
|
-
# line-numbered — numbering is presentation the caller adds. For
|
|
126
|
-
# a PDF the array includes +"--- Page N ---"+ marker lines (one
|
|
127
|
-
# per page that contributed text), which count toward +limit+ /
|
|
128
|
-
# the byte cap like any other line.
|
|
129
|
-
# * +start_line+ — the 1-indexed line number of +lines.first+
|
|
130
|
-
# (i.e. the +offset+ the caller asked for). +lines.last+ is at
|
|
131
|
-
# +start_line + lines.length - 1+.
|
|
132
|
-
# * +total_lines+ — total line count of the document when known,
|
|
133
|
-
# else +nil+. Known when extraction reached EOF (so the caller
|
|
134
|
-
# can print "of N"); +nil+ when the read stopped early — the
|
|
135
|
-
# byte cap fired, or a PDF filled the window before its last
|
|
136
|
-
# page (counting the rest would defeat the laziness).
|
|
137
|
-
# * +more+ — +true+ if content remains past this window (the
|
|
138
|
-
# caller should offer +offset = start_line + lines.length+).
|
|
139
|
-
# * +byte_capped+ — +true+ if {PAGE_MAX_BYTES} (not the line
|
|
140
|
-
# limit) was the stopping criterion.
|
|
141
|
-
# * +kind+ — +:text+ or +:pdf+; lets the caller word PDF-specific
|
|
142
|
-
# trailers and the empty-document message.
|
|
143
|
-
#
|
|
144
|
-
# An empty document yields +lines: []+, +total_lines: 0+; an
|
|
145
|
-
# +offset+ past EOF yields +lines: []+ with +total_lines+ set to
|
|
146
|
-
# the real (non-zero) count — the caller distinguishes the two.
|
|
147
|
-
Page = Data.define(:lines, :start_line, :total_lines, :more, :byte_capped, :kind)
|
|
148
|
-
|
|
149
88
|
# Recognise a file from its leading bytes. Returns the MIME type
|
|
150
89
|
# as a String for formats pikuri handles specially, or +nil+ for
|
|
151
90
|
# "unrecognised" — callers interpret +nil+ themselves (text,
|
|
@@ -197,19 +136,13 @@ module Pikuri
|
|
|
197
136
|
non_printable.to_f / bytes.bytesize > BINARY_NONPRINTABLE_THRESHOLD
|
|
198
137
|
end
|
|
199
138
|
|
|
200
|
-
# Read +path+ and return its content as plain UTF-8 text
|
|
201
|
-
#
|
|
202
|
-
#
|
|
203
|
-
#
|
|
204
|
-
#
|
|
205
|
-
#
|
|
206
|
-
#
|
|
207
|
-
# length if they care.
|
|
208
|
-
# * **Plain text** — anything that {.detect_mime} doesn't
|
|
209
|
-
# recognise and that {.binary?} accepts. Read with UTF-8
|
|
210
|
-
# encoding; behaviour on non-UTF-8 bytes is whatever +File.read+
|
|
211
|
-
# does with +encoding: Encoding::UTF_8+ (which is "leave invalid
|
|
212
|
-
# bytes in, let downstream decide").
|
|
139
|
+
# Read +path+ and return its content as plain UTF-8 text, routed
|
|
140
|
+
# through the {Extractor} registry: anything
|
|
141
|
+
# unrecognised-but-textual passes through verbatim
|
|
142
|
+
# ({Extractor::Passthrough}); with pikuri-pdf registered, PDFs
|
|
143
|
+
# are extracted with +"--- Page N ---"+ markers (a scanned-image
|
|
144
|
+
# PDF with no extractable text comes back as the empty String, a
|
|
145
|
+
# deliberate silent skip callers detect by length if they care).
|
|
213
146
|
#
|
|
214
147
|
# Refusal cases — all raise rather than returning a sentinel
|
|
215
148
|
# because the callers are internal pikuri code, not an LLM
|
|
@@ -220,13 +153,11 @@ module Pikuri
|
|
|
220
153
|
# * Path is a directory → +ArgumentError+.
|
|
221
154
|
# * Image (PNG / JPEG / GIF / WebP per {.detect_mime}) →
|
|
222
155
|
# +ArgumentError+; images aren't text.
|
|
223
|
-
# *
|
|
224
|
-
# +ArgumentError
|
|
225
|
-
# *
|
|
226
|
-
#
|
|
227
|
-
#
|
|
228
|
-
# path included so callers don't need to know pdf-reader's
|
|
229
|
-
# exception hierarchy.
|
|
156
|
+
# * Content no extractor claims (opaque binary) →
|
|
157
|
+
# +ArgumentError+, mapped from {Extractor::Unsupported}.
|
|
158
|
+
# * Extraction failure (malformed PDF, ...) → +RuntimeError+ with
|
|
159
|
+
# the path included, mapped from {Extractor::Error} so callers
|
|
160
|
+
# don't need to know any extractor's exception hierarchy.
|
|
230
161
|
#
|
|
231
162
|
# @param path [Pathname] file to read.
|
|
232
163
|
# @return [String] UTF-8 text. May be empty (empty text file, or
|
|
@@ -234,56 +165,26 @@ module Pikuri
|
|
|
234
165
|
# @raise [ArgumentError] if +path+ isn't a +Pathname+, points at
|
|
235
166
|
# a directory, is an image, or is binary.
|
|
236
167
|
# @raise [Errno::ENOENT] if +path+ doesn't exist.
|
|
237
|
-
# @raise [RuntimeError] on
|
|
168
|
+
# @raise [RuntimeError] on an extraction failure (malformed /
|
|
169
|
+
# unsupported PDF, ...).
|
|
238
170
|
def read_as_text(path)
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
raise ArgumentError, "#{path} is an image (#{mime}); cannot extract as text" if mime&.start_with?('image/')
|
|
246
|
-
raise ArgumentError, "#{path} appears to be binary; cannot extract as text" if binary?(path)
|
|
247
|
-
|
|
248
|
-
path.read(encoding: Encoding::UTF_8)
|
|
171
|
+
mime = guard_extractable(path)
|
|
172
|
+
path.open('rb') { |io| Extractor.extract(io, content_type: mime) }
|
|
173
|
+
rescue Extractor::Unsupported
|
|
174
|
+
raise ArgumentError, "#{path} appears to be binary; cannot extract as text"
|
|
175
|
+
rescue Extractor::Error => e
|
|
176
|
+
raise "Cannot extract text from #{path}: #{e.message}"
|
|
249
177
|
end
|
|
250
178
|
|
|
251
|
-
#
|
|
252
|
-
#
|
|
253
|
-
#
|
|
254
|
-
#
|
|
255
|
-
#
|
|
256
|
-
#
|
|
257
|
-
#
|
|
258
|
-
#
|
|
259
|
-
|
|
260
|
-
pages = path.open('rb') do |io|
|
|
261
|
-
::PDF::Reader.new(io).pages.map { |p| p.text.strip }
|
|
262
|
-
end
|
|
263
|
-
pages.reject(&:empty?).join("\n\n")
|
|
264
|
-
rescue ::PDF::Reader::MalformedPDFError,
|
|
265
|
-
::PDF::Reader::UnsupportedFeatureError,
|
|
266
|
-
::PDF::Reader::InvalidPageError => e
|
|
267
|
-
raise "Cannot extract PDF text from #{path}: " \
|
|
268
|
-
"#{e.class.name.split('::').last}: #{e.message}"
|
|
269
|
-
end
|
|
270
|
-
private_class_method :read_pdf_text
|
|
271
|
-
|
|
272
|
-
# Extract +path+ as text and return a windowed {Page}: the lines
|
|
273
|
-
# from +offset+ (1-indexed) up to +limit+ of them, stopping early
|
|
274
|
-
# if +max_bytes+ is reached, with over-long lines truncated at
|
|
275
|
-
# +max_line_length+. Lazy by design — a text file is streamed
|
|
276
|
-
# line-by-line and a PDF is parsed page-by-page only until the
|
|
277
|
-
# window fills, so reading the first page of a 500-page PDF parses
|
|
278
|
-
# a handful of pages, not all of them.
|
|
279
|
-
#
|
|
280
|
-
# Same routing and refusal contract as {.read_as_text}: PDFs are
|
|
281
|
-
# extracted (with +"--- Page N ---"+ marker lines, unlike
|
|
282
|
-
# {.read_as_text}'s marker-free join — paging is a display path,
|
|
283
|
-
# the marker-free form stays the indexing path); images, binaries,
|
|
284
|
-
# directories, missing files, and malformed PDFs all raise rather
|
|
285
|
-
# than returning a sentinel. The LLM-facing callers map those into
|
|
286
|
-
# +"Error: ..."+ observations themselves.
|
|
179
|
+
# Extract +path+ and return a windowed {Extractor::Page}: the
|
|
180
|
+
# lines from +offset+ (1-indexed) up to +limit+ of them, stopping
|
|
181
|
+
# early if +max_bytes+ is reached, with over-long lines truncated
|
|
182
|
+
# at +max_line_length+. Same routing and refusal contract as
|
|
183
|
+
# {.read_as_text}; the windowing semantics (including the lazy
|
|
184
|
+
# +extract_lines+ consumption that stops parsing once the window
|
|
185
|
+
# fills) are {Extractor.extract_paged}'s.
|
|
186
|
+
# The LLM-facing callers map the exceptions into +"Error: ..."+
|
|
187
|
+
# observations themselves.
|
|
287
188
|
#
|
|
288
189
|
# @param path [Pathname] file to read.
|
|
289
190
|
# @param offset [Integer] 1-indexed first line to include. The
|
|
@@ -292,141 +193,48 @@ module Pikuri
|
|
|
292
193
|
# validates +limit >= 1+.
|
|
293
194
|
# @param max_bytes [Integer] hard byte cap on collected content.
|
|
294
195
|
# @param max_line_length [Integer] per-line truncation threshold.
|
|
295
|
-
# @return [Page] the windowed slice.
|
|
196
|
+
# @return [Extractor::Page] the windowed slice.
|
|
296
197
|
# @raise [ArgumentError] if +path+ isn't a +Pathname+, is a
|
|
297
198
|
# directory, an image, or binary.
|
|
298
199
|
# @raise [Errno::ENOENT] if +path+ doesn't exist.
|
|
299
|
-
# @raise [RuntimeError] on
|
|
300
|
-
|
|
301
|
-
|
|
200
|
+
# @raise [RuntimeError] on an extraction failure (malformed /
|
|
201
|
+
# unsupported PDF, ...).
|
|
202
|
+
def read_as_text_paged(path, offset: 1, limit: Extractor::PAGE_DEFAULT_LIMIT,
                       max_bytes: Extractor::PAGE_MAX_BYTES,
                       max_line_length: Extractor::PAGE_MAX_LINE_LENGTH)
  # Path-level refusals first; the sniffed MIME doubles as the
  # content-type hint handed to the extractor registry.
  content_type = guard_extractable(path)
  window = { offset: offset, limit: limit, max_bytes: max_bytes, max_line_length: max_line_length }

  path.open('rb') { |stream| Extractor.extract_paged(stream, content_type: content_type, **window) }
rescue Extractor::Unsupported
  # No extractor claimed the bytes: surface it as the "binary" refusal.
  raise ArgumentError, "#{path} appears to be binary; cannot extract as text"
rescue Extractor::Error => failure
  # Re-raise as RuntimeError with the path so callers need not know any
  # extractor's own exception hierarchy.
  raise "Cannot extract text from #{path}: #{failure.message}"
end
|
|
215
|
+
|
|
216
|
+
# The shared path-level refusals for {.read_as_text} /
|
|
217
|
+
# {.read_as_text_paged}: must be an existing non-directory
|
|
218
|
+
# +Pathname+, and not an image (images are data for a vision
|
|
219
|
+
# model, never text). Returns the {.detect_mime} result so the
|
|
220
|
+
# caller can pass it to the {Extractor} as the content-type hint.
|
|
221
|
+
#
|
|
222
|
+
# @param path [Pathname]
|
|
223
|
+
# @return [String, nil] the sniffed MIME type.
|
|
224
|
+
# @raise [ArgumentError] on a non-Pathname, a directory, or an
|
|
225
|
+
# image.
|
|
226
|
+
# @raise [Errno::ENOENT] if +path+ doesn't exist.
|
|
227
|
+
def guard_extractable(path)
|
|
302
228
|
raise ArgumentError, "expected Pathname, got #{path.class}" unless path.is_a?(Pathname)
|
|
303
229
|
raise Errno::ENOENT, path.to_s unless path.exist?
|
|
304
230
|
raise ArgumentError, "#{path} is a directory" if path.directory?
|
|
305
231
|
|
|
306
232
|
mime = detect_mime(path)
|
|
307
|
-
if mime == 'application/pdf'
|
|
308
|
-
return paged_pdf(path, offset: offset, limit: limit,
|
|
309
|
-
max_bytes: max_bytes, max_line_length: max_line_length)
|
|
310
|
-
end
|
|
311
233
|
raise ArgumentError, "#{path} is an image (#{mime}); cannot extract as text" if mime&.start_with?('image/')
|
|
312
|
-
raise ArgumentError, "#{path} appears to be binary; cannot extract as text" if binary?(path)
|
|
313
|
-
|
|
314
|
-
paged_text(path, offset: offset, limit: limit,
|
|
315
|
-
max_bytes: max_bytes, max_line_length: max_line_length)
|
|
316
|
-
end
|
|
317
|
-
|
|
318
|
-
# Stream a text file line-by-line into a {Page}. Keeps counting
|
|
319
|
-
# lines past the collection window so +total_lines+ can report the
|
|
320
|
-
# real total when the line limit (not the byte cap) stopped
|
|
321
|
-
# collection; on the byte cap it breaks and leaves +total_lines+
|
|
322
|
-
# +nil+ (the rest of the file is never read).
|
|
323
|
-
#
|
|
324
|
-
# @return [Page] +kind: :text+.
|
|
325
|
-
def paged_text(path, offset:, limit:, max_bytes:, max_line_length:)
|
|
326
|
-
start_index = offset - 1
|
|
327
|
-
collected = []
|
|
328
|
-
total_lines = 0
|
|
329
|
-
bytes = 0
|
|
330
|
-
byte_capped = false
|
|
331
|
-
more = false
|
|
332
|
-
|
|
333
|
-
path.each_line do |raw|
|
|
334
|
-
total_lines += 1
|
|
335
|
-
next if total_lines <= start_index
|
|
336
|
-
|
|
337
|
-
if collected.length >= limit
|
|
338
|
-
more = true
|
|
339
|
-
next
|
|
340
|
-
end
|
|
341
|
-
|
|
342
|
-
line = truncate_line(raw.chomp, max_line_length)
|
|
343
|
-
size = line.bytesize + 1 # +1 for the joining newline
|
|
344
|
-
if bytes + size > max_bytes
|
|
345
|
-
byte_capped = true
|
|
346
|
-
more = true
|
|
347
|
-
break
|
|
348
|
-
end
|
|
349
|
-
collected << line
|
|
350
|
-
bytes += size
|
|
351
|
-
end
|
|
352
|
-
|
|
353
|
-
Page.new(lines: collected, start_line: offset,
|
|
354
|
-
total_lines: byte_capped ? nil : total_lines,
|
|
355
|
-
more: more, byte_capped: byte_capped, kind: :text)
|
|
356
|
-
end
|
|
357
|
-
private_class_method :paged_text
|
|
358
|
-
|
|
359
|
-
# PDF counterpart to {paged_text}: walk +pdf-reader+'s lazy page
|
|
360
|
-
# iterator, emitting a +"--- Page N ---"+ header line then each
|
|
361
|
-
# line of the page's text, applying the same offset / limit /
|
|
362
|
-
# byte-cap contract. The +throw :done+ short-circuits both loops
|
|
363
|
-
# the moment the window fills, so parsing stops — which is why a
|
|
364
|
-
# PDF that stops early can't report +total_lines+ (it would have
|
|
365
|
-
# to parse every page to count).
|
|
366
|
-
#
|
|
367
|
-
# @return [Page] +kind: :pdf+.
|
|
368
|
-
# @raise [RuntimeError] on a malformed / unsupported PDF.
|
|
369
|
-
def paged_pdf(path, offset:, limit:, max_bytes:, max_line_length:)
|
|
370
|
-
start_index = offset - 1
|
|
371
|
-
collected = []
|
|
372
|
-
total_lines = 0
|
|
373
|
-
bytes = 0
|
|
374
|
-
byte_capped = false
|
|
375
|
-
more = false
|
|
376
|
-
|
|
377
|
-
catch(:done) do
|
|
378
|
-
path.open('rb') do |io|
|
|
379
|
-
reader = ::PDF::Reader.new(io)
|
|
380
|
-
reader.pages.each_with_index do |page, idx|
|
|
381
|
-
text = page.text.strip
|
|
382
|
-
next if text.empty?
|
|
383
|
-
|
|
384
|
-
["--- Page #{idx + 1} ---", *text.split("\n")].each do |raw|
|
|
385
|
-
total_lines += 1
|
|
386
|
-
next if total_lines <= start_index
|
|
387
|
-
|
|
388
|
-
if collected.length >= limit
|
|
389
|
-
more = true
|
|
390
|
-
throw :done
|
|
391
|
-
end
|
|
392
|
-
|
|
393
|
-
line = truncate_line(raw, max_line_length)
|
|
394
|
-
size = line.bytesize + 1
|
|
395
|
-
if bytes + size > max_bytes
|
|
396
|
-
byte_capped = true
|
|
397
|
-
more = true
|
|
398
|
-
throw :done
|
|
399
|
-
end
|
|
400
|
-
collected << line
|
|
401
|
-
bytes += size
|
|
402
|
-
end
|
|
403
|
-
end
|
|
404
|
-
end
|
|
405
|
-
end
|
|
406
|
-
|
|
407
|
-
Page.new(lines: collected, start_line: offset,
|
|
408
|
-
total_lines: more ? nil : total_lines,
|
|
409
|
-
more: more, byte_capped: byte_capped, kind: :pdf)
|
|
410
|
-
rescue ::PDF::Reader::MalformedPDFError,
|
|
411
|
-
::PDF::Reader::InvalidPageError,
|
|
412
|
-
::PDF::Reader::UnsupportedFeatureError => e
|
|
413
|
-
raise "Cannot extract PDF text from #{path}: " \
|
|
414
|
-
"#{e.class.name.split('::').last}: #{e.message}"
|
|
415
|
-
end
|
|
416
|
-
private_class_method :paged_pdf
|
|
417
|
-
|
|
418
|
-
# Truncate +line+ to +max_line_length+ chars, appending
|
|
419
|
-
# {PAGE_LINE_TRUNCATION_MARKER} when it overflows.
|
|
420
|
-
#
|
|
421
|
-
# @param line [String]
|
|
422
|
-
# @param max_line_length [Integer]
|
|
423
|
-
# @return [String]
|
|
424
|
-
def truncate_line(line, max_line_length)
|
|
425
|
-
return line if line.length <= max_line_length
|
|
426
234
|
|
|
427
|
-
|
|
235
|
+
mime
|
|
428
236
|
end
|
|
429
|
-
private_class_method :
|
|
237
|
+
private_class_method :guard_extractable
|
|
430
238
|
|
|
431
239
|
# Coerce an +input+ argument into a bytes String for the sniffs.
|
|
432
240
|
# +String+ inputs are returned as-is (caller already sampled);
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
  # Renders attacker-controlled text safe to display, and reports *why*
  # it was unsafe.
  #
  # Every string an LLM composes is untrusted: a bash command, a tool
  # observation echoed back to the user, a description it wrote for a
  # confirmation prompt. A model that is broken — or, far more likely,
  # being driven by a prompt injection — can embed bytes that a terminal
  # acts on rather than prints: a carriage return that overwrites the
  # line the user just read, an ESC that recolors or repositions, a
  # backspace that erases, a bidirectional override that reorders text so
  # it reads differently than it runs, a zero-width character that hides
  # in plain sight, or a Cyrillic +а+ masquerading as a Latin +a+. The
  # whole point of a confirmation prompt collapses if the bytes the user
  # approves are not the bytes that execute.
  #
  # {.sanitize} is the one chrome-independent primitive every renderer
  # (terminal, TUI, web) routes through. It does two things and returns
  # both as a {Result}:
  #
  # 1. *Neutralize* — make the dangerous bytes visible without changing
  #    structure. Control bytes become +\xNN+, bidi/zero-width codepoints
  #    become +\u{NNNN}+, tab becomes +\t+. Newlines are preserved
  #    (multi-line commands are normal). This is *faithful, not
  #    beautifying*: it never collapses runs of whitespace or rewrites a
  #    tab to a space, because the user must see exactly what they are
  #    approving — a Makefile's leading tab stays visibly a tab. A web
  #    chrome composes +html_escape(sanitize(s).text)+; the HTML layer is
  #    the caller's, not ours.
  # 2. *Warn* — return a {Warning} per category detected, each a semantic
  #    record (kind + offending tokens + a plain-English explanation).
  #    Presentation is the chrome's: a terminal renders these bold yellow,
  #    a web client a banner. The {Warning} carries no color or markup,
  #    and — like +text+ — never carries a raw control, bidi or zero-width
  #    character: offenders are escaped before they are stored.
  #
  # == Malformed input
  #
  # Untrusted text is not guaranteed to be well-formed. Bytes that are
  # not valid in the string's encoding are rendered as +\xNN+ (one escape
  # per byte) and reported under +:control_bytes+ instead of raising; a
  # +BINARY+-tagged string is reinterpreted as UTF-8 first.
  #
  # == Scope (deliberately closed)
  #
  # Detection covers the *invisibility / cursor-control / reordering*
  # attack classes completely, because each is a finite, enumerable set
  # of codepoints: C0 controls, C1 controls (a second ANSI introducer on
  # some emulators), DEL, the bidi overrides, and the zero-width
  # characters. On top of that, {.sanitize} flags *mixed-script tokens* —
  # a single word combining letters from Latin + Cyrillic + Greek, which
  # is the signature of a homoglyph spoof and has near-zero false
  # positives on real text (humans do not weld two alphabets inside one
  # word; +café+ is all-Latin, +Москва+ all-Cyrillic, only +Pаypal+ mixes).
  #
  # Two confusable classes are explicitly *out of scope*, because
  # detecting them needs Unicode confusables tables and produces heavy
  # false positives on legitimate multilingual text:
  #
  # * *Whole-script* homoglyphs — an entirely-Cyrillic string that merely
  #   looks Latin (no mixing to detect).
  # * *Single-symbol* confusables — the Greek question mark (U+037E)
  #   that looks like a semicolon, full-width forms, the division slash.
  #
  # "Solid" here means complete on the classes above, not exhaustive over
  # all of Unicode.
  module Sanitizer
    # One reason a piece of text was flagged, ready for a chrome to
    # render however it surfaces warnings (bold yellow line, web banner).
    #
    # * +kind+ — a {Symbol} category: +:backspace+, +:control_bytes+,
    #   +:bidi+, +:zero_width+, or +:mixed_script+.
    # * +offenders+ — the distinct offending tokens, in first-seen order:
    #   the escaped forms (+"\\x1b"+, +"\\u{202e}"+) for byte categories,
    #   the tokens themselves (+"Pаypal"+) for +:mixed_script+, with any
    #   suspect character inside them escaped the same way as in +text+.
    # * +explanation+ — a one-line, chrome-agnostic English summary of
    #   what the bytes can do.
    Warning = Data.define(:kind, :offenders, :explanation)

    # The output of {Sanitizer.sanitize}.
    #
    # * +text+ — the neutralized string, safe to print literally.
    # * +warnings+ — {Array}<{Warning}>, empty when nothing was flagged.
    Result = Data.define(:text, :warnings)

    # Bidirectional-override codepoints: the explicit LRO/RLO/PDF/LRE/RLE
    # set plus the isolate set (LRI/RLI/FSI/PDI). Reordering attacks.
    BIDI_OVERRIDES = [*0x202a..0x202e, *0x2066..0x2069].freeze

    # Zero-width and invisible codepoints: ZWSP, ZWNJ, ZWJ, and the BOM /
    # zero-width no-break space.
    ZERO_WIDTH = [0x200b, 0x200c, 0x200d, 0xfeff].freeze

    # Codepoints {.sanitize} rewrites: C0 controls including tab (U+0009)
    # but *excluding* newline (U+000A, which passes through untouched),
    # C1 controls + DEL (U+007F–009F), the zero-width set, and the bidi
    # overrides. Newline is the one control character a faithful render
    # must keep, so the C0 range is split around it.
    SUSPECT = /[\u0000-\u0009\u000b-\u001f\u007f-\u009f\u200b-\u200d\u202a-\u202e\u2066-\u2069\ufeff]/

    # The three Latin-confusable scripts whose mixing inside one token
    # signals a homoglyph spoof. Punctuation, digits and spaces are the
    # +Common+ script and match none of these, so they never count toward
    # the "two distinct scripts" threshold.
    CONFUSABLE_SCRIPTS = { 'Latin' => /\p{Latin}/, 'Cyrillic' => /\p{Cyrillic}/, 'Greek' => /\p{Greek}/ }.freeze

    # Neutralize +text+ for literal display and report what was flagged.
    #
    # Neither +Result#text+ nor any {Warning} field contains a character
    # matched by {SUSPECT}, so a chrome may print all of them verbatim.
    #
    # @param text [String] attacker-controlled text (an LLM-composed
    #   command, description, or tool observation), e.g.
    #   +"echo hi\rrm -rf /"+. May contain invalid byte sequences.
    # @return [Result] the neutralized +text+ plus an {Array}<{Warning}>
    #   (empty when clean)
    def self.sanitize(text)
      backspace = false
      control = []
      bidi = []
      zero_width = []

      # Invalid byte sequences would make the regexp pass below raise, so
      # they are escaped byte-by-byte first. They are recorded ahead of
      # any control character found by the second pass.
      printable = utf8_view(text).scrub do |bad|
        bad.bytes.map { |byte| format('\\x%02x', byte).tap { |t| control << t } }.join
      end

      clean = printable.gsub(SUSPECT) do |ch|
        cp = ch.ord
        token = escape(cp)
        if cp == 0x08
          backspace = true
        elsif BIDI_OVERRIDES.include?(cp)
          bidi << token
        elsif ZERO_WIDTH.include?(cp)
          zero_width << token
        elsif cp != 0x09 # tab is made visible but is not worth a warning
          control << token
        end
        token
      end

      # Detection runs on the raw tokens (escapes would add Latin letters
      # and skew the script count); the offenders are escaped afterwards
      # so the warning itself cannot smuggle a live control character.
      mixed = mixed_script_tokens(text).map { |token| neutralize(token) }.uniq

      Result.new(text: clean, warnings: warnings_for(backspace, control, bidi, zero_width, mixed))
    end

    # Tokens (whitespace-delimited runs) that combine letters from two or
    # more of {CONFUSABLE_SCRIPTS} — the homoglyph-spoof signature.
    #
    # The tokens are returned as found, *not* neutralized; invalid bytes
    # are replaced by the encoding's replacement character so the split
    # cannot raise.
    #
    # @param text [String]
    # @return [Array<String>] distinct offending tokens, first-seen order
    def self.mixed_script_tokens(text)
      utf8_view(text).scrub.split(/\s+/).reject(&:empty?).select do |token|
        CONFUSABLE_SCRIPTS.count { |_name, re| token.match?(re) } >= 2
      end.uniq
    end

    # Assemble one {Warning} per non-empty category, in a stable order
    # (most-deceptive first).
    #
    # @param backspace [Boolean] whether a backspace was seen
    # @param control [Array<String>] escaped control bytes, may repeat
    # @param bidi [Array<String>] escaped bidi overrides, may repeat
    # @param zero_width [Array<String>] escaped zero-width chars, may repeat
    # @param mixed [Array<String>] distinct mixed-script tokens
    # @return [Array<Warning>]
    def self.warnings_for(backspace, control, bidi, zero_width, mixed)
      out = []
      if backspace
        out << Warning.new(kind: :backspace, offenders: ['\\x08'],
                           explanation: 'Backspace characters present — the model may be trying to visually erase ' \
                                        'part of the text after you have read it.')
      end
      unless bidi.empty?
        out << Warning.new(kind: :bidi, offenders: bidi.uniq,
                           explanation: "Bidirectional-override characters present (#{bidi.uniq.join(' ')}) — these " \
                                        'can reorder how text is displayed so it reads differently than it runs.')
      end
      unless zero_width.empty?
        out << Warning.new(kind: :zero_width, offenders: zero_width.uniq,
                           explanation: "Zero-width / invisible characters present (#{zero_width.uniq.join(' ')}) — " \
                                        'the text may contain characters you cannot see.')
      end
      unless control.empty?
        out << Warning.new(kind: :control_bytes, offenders: control.uniq,
                           explanation: "Non-printable control bytes present (#{control.uniq.join(' ')}) — in a " \
                                        'terminal these can move the cursor, change colors, or hide output.')
      end
      unless mixed.empty?
        out << Warning.new(kind: :mixed_script, offenders: mixed,
                           explanation: "Mixed-script tokens present (#{mixed.join(', ')}) — letters from different " \
                                        "alphabets are combined within one word, a classic homoglyph spoof (e.g. " \
                                        "Cyrillic 'а' standing in for Latin 'a').")
      end
      out
    end
    private_class_method :warnings_for

    # The printable escape for one {SUSPECT} codepoint: +\t+ for tab,
    # +\u{NNNN}+ for bidi / zero-width codepoints, +\xNN+ otherwise.
    #
    # @param codepoint [Integer]
    # @return [String]
    def self.escape(codepoint)
      return '\\t' if codepoint == 0x09
      return format('\\u{%04x}', codepoint) if BIDI_OVERRIDES.include?(codepoint) || ZERO_WIDTH.include?(codepoint)

      format('\\x%02x', codepoint)
    end
    private_class_method :escape

    # Escape every {SUSPECT} character in +token+ without recording any
    # warning; used for text that ends up inside a {Warning}.
    #
    # @param token [String] validly-encoded text
    # @return [String]
    def self.neutralize(token)
      token.gsub(SUSPECT) { |ch| escape(ch.ord) }
    end
    private_class_method :neutralize

    # +text+ itself, or — for a +BINARY+-tagged string, which a UTF-8
    # regexp cannot match once it holds high bytes — a copy re-tagged as
    # UTF-8. The receiver is never mutated.
    #
    # @param text [String]
    # @return [String]
    def self.utf8_view(text)
      return text unless text.encoding == Encoding::BINARY

      text.dup.force_encoding(Encoding::UTF_8)
    end
    private_class_method :utf8_view
  end
end
|