pikuri-core 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +5 -3
  3. data/lib/pikuri/agent/chat_transport.rb +135 -11
  4. data/lib/pikuri/agent/configurator.rb +4 -4
  5. data/lib/pikuri/agent/context_window_detector.rb +103 -52
  6. data/lib/pikuri/agent/control/step_limit.rb +39 -7
  7. data/lib/pikuri/agent/event.rb +43 -16
  8. data/lib/pikuri/agent/extension.rb +31 -17
  9. data/lib/pikuri/agent/extension_context.rb +147 -0
  10. data/lib/pikuri/agent/listener/terminal.rb +30 -37
  11. data/lib/pikuri/agent/listener/token_log.rb +60 -13
  12. data/lib/pikuri/agent/listener.rb +12 -5
  13. data/lib/pikuri/agent/listener_list.rb +7 -17
  14. data/lib/pikuri/agent/synthesizer.rb +93 -67
  15. data/lib/pikuri/agent.rb +358 -403
  16. data/lib/pikuri/extractor/html.rb +303 -0
  17. data/lib/pikuri/extractor/passthrough.rb +64 -0
  18. data/lib/pikuri/extractor.rb +314 -0
  19. data/lib/pikuri/file_type.rb +74 -266
  20. data/lib/pikuri/sanitizer.rb +179 -0
  21. data/lib/pikuri/subprocess.rb +73 -2
  22. data/lib/pikuri/tool/calculator.rb +213 -41
  23. data/lib/pikuri/tool/fetch.rb +10 -9
  24. data/lib/pikuri/tool/parameters.rb +65 -2
  25. data/lib/pikuri/tool/scraper.rb +186 -0
  26. data/lib/pikuri/tool/search/brave.rb +32 -18
  27. data/lib/pikuri/tool/search/duckduckgo.rb +18 -7
  28. data/lib/pikuri/tool/search/engines.rb +72 -49
  29. data/lib/pikuri/tool/search/exa.rb +34 -22
  30. data/lib/pikuri/tool/web_scrape.rb +5 -5
  31. data/lib/pikuri/tool/web_search.rb +45 -26
  32. data/lib/pikuri/version.rb +1 -1
  33. data/lib/pikuri-core.rb +11 -10
  34. metadata +9 -66
  35. data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
  36. data/lib/pikuri/tool/scraper/html.rb +0 -285
  37. data/lib/pikuri/tool/scraper/pdf.rb +0 -54
  38. data/lib/pikuri/tool/scraper/simple.rb +0 -183
@@ -1,10 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'pdf-reader'
4
-
5
3
  module Pikuri
6
- # Magic-byte content sniffing + text extraction, centralised. Three
7
- # responsibilities:
4
+ # Magic-byte content sniffing, plus the path-aware front over the
5
+ # {Extractor} registry. Two responsibilities:
8
6
  #
9
7
  # * {.detect_mime} — recognise a file from its leading bytes. Returns
10
8
  # a MIME String for formats pikuri knows how to handle specially
@@ -15,21 +13,16 @@ module Pikuri
15
13
  # {.detect_mime}: a file can be both recognised (e.g. PDF) *and*
16
14
  # binary. {.detect_mime} tells you what the bytes are;
17
15
  # {.binary?} tells you whether they're safe to render as text.
18
- # * {.read_as_text} — read a file and return its content as plain
19
- # UTF-8 text. PDFs go through +pdf-reader+ page-by-page; plain
20
- # text passes through; images / binaries / missing files raise.
21
- # The pure-extraction shape consumers like +Pikuri::VectorDb+'s
22
- # indexer want (no LLM-tool concerns — no paging, no line
23
- # numbering, no byte caps; just bytes-in-text-out).
24
- # * {.read_as_text_paged} — the LLM-tool shape: the same
25
- # extraction as {.read_as_text}, but lazily windowed to a
26
- # line range with a byte cap, returning a {Page} value the
27
- # caller renders. Shared by +Workspace::Read+ and
28
- # +VectorDb::Tools::Read+ so the offset/limit/byte-cap windowing lives
29
- # in one tested place; each tool keeps its own presentation
30
- # (cat-n numbering, trailer wording, citation vs. path). Same
31
- # refusal contract as {.read_as_text} (raises on image / binary
32
- # / missing / malformed-PDF).
16
+ #
17
+ # On top of those sit the two +Pathname+ conveniences,
18
+ # {.read_as_text} (whole document, the {Pikuri::VectorDb} indexer's
19
+ # shape) and {.read_as_text_paged} (line-windowed, the Read tools'
20
+ # shape). Both are thin wrappers: they own the *path-level* refusals
21
+ # (missing file, directory, image) and the exception mapping, then
22
+ # hand the opened IO to {Extractor.extract} /
23
+ # {Extractor.extract_paged} — which format the bytes are and how
24
+ # they become text is entirely the registry's business, so a gem
25
+ # plugging a new extractor in extends these wrappers for free.
33
26
  #
34
27
  # {.detect_mime} and {.binary?} accept either a +String+ of bytes
35
28
  # (sample taken by the caller) or a +Pathname+ — when given a path,
@@ -37,8 +30,7 @@ module Pikuri
37
30
  # for the sniff itself. The Pathname form is the convenience path;
38
31
  # the bytes form is for callers that already have the sample or are
39
32
  # calling both methods on the same file and want to avoid a second
40
- # open. {.read_as_text} takes a +Pathname+ only — there's no
41
- # bytes-in shortcut because the PDF case needs to seek the file.
33
+ # open.
42
34
  #
43
35
  # == Why a separate module
44
36
  #
@@ -48,8 +40,7 @@ module Pikuri
48
40
  # {.binary?} reached for by {Workspace::Edit}. Collecting the
49
41
  # detection logic here lets {Read} focus on routing
50
42
  # (mime-to-formatter), {Edit} drop its cross-tool reach, and new
51
- # tools (a future +Workspace::Diff+, an attachment-aware web fetcher,
52
- # ...) share one set of magic-byte truths.
43
+ # tools share one set of magic-byte truths.
53
44
  #
54
45
  # == Deliberate non-goals
55
46
  #
@@ -94,58 +85,6 @@ module Pikuri
94
85
  # with this five-byte ASCII sequence per ISO 32000-1 §7.5.2.
95
86
  PDF_MAGIC = '%PDF-'
96
87
 
97
- # @return [Integer] default line-window size for
98
- # {.read_as_text_paged} when the caller omits +limit+.
99
- PAGE_DEFAULT_LIMIT = 2000
100
-
101
- # @return [Integer] default hard byte cap on the content collected
102
- # by a single {.read_as_text_paged} call. Bypassable by paging
103
- # via +offset+. The rendered output is slightly larger (line
104
- # numbering, trailer) — that's the caller's concern.
105
- PAGE_MAX_BYTES = 50 * 1024
106
-
107
- # @return [Integer] default per-line character cap;
108
- # {.read_as_text_paged} truncates longer lines and appends
109
- # {PAGE_LINE_TRUNCATION_MARKER}.
110
- PAGE_MAX_LINE_LENGTH = 2000
111
-
112
- # @return [String] suffix appended to a line truncated at
113
- # {PAGE_MAX_LINE_LENGTH}.
114
- PAGE_LINE_TRUNCATION_MARKER = "... (line truncated to #{PAGE_MAX_LINE_LENGTH} chars)"
115
-
116
- # One windowed slice of a document, returned by
117
- # {.read_as_text_paged}. The caller turns this into an
118
- # observation; this struct carries everything a trailer needs
119
- # without the caller re-reading the file.
120
- #
121
- # == Fields
122
- #
123
- # * +lines+ — +Array<String>+, the collected window. Already
124
- # per-line truncated (with {PAGE_LINE_TRUNCATION_MARKER}); *not*
125
- # line-numbered — numbering is presentation the caller adds. For
126
- # a PDF the array includes +"--- Page N ---"+ marker lines (one
127
- # per page that contributed text), which count toward +limit+ /
128
- # the byte cap like any other line.
129
- # * +start_line+ — the 1-indexed line number of +lines.first+
130
- # (i.e. the +offset+ the caller asked for). +lines.last+ is at
131
- # +start_line + lines.length - 1+.
132
- # * +total_lines+ — total line count of the document when known,
133
- # else +nil+. Known when extraction reached EOF (so the caller
134
- # can print "of N"); +nil+ when the read stopped early — the
135
- # byte cap fired, or a PDF filled the window before its last
136
- # page (counting the rest would defeat the laziness).
137
- # * +more+ — +true+ if content remains past this window (the
138
- # caller should offer +offset = start_line + lines.length+).
139
- # * +byte_capped+ — +true+ if {PAGE_MAX_BYTES} (not the line
140
- # limit) was the stopping criterion.
141
- # * +kind+ — +:text+ or +:pdf+; lets the caller word PDF-specific
142
- # trailers and the empty-document message.
143
- #
144
- # An empty document yields +lines: []+, +total_lines: 0+; an
145
- # +offset+ past EOF yields +lines: []+ with +total_lines+ set to
146
- # the real (non-zero) count — the caller distinguishes the two.
147
- Page = Data.define(:lines, :start_line, :total_lines, :more, :byte_capped, :kind)
148
-
149
88
  # Recognise a file from its leading bytes. Returns the MIME type
150
89
  # as a String for formats pikuri handles specially, or +nil+ for
151
90
  # "unrecognised" — callers interpret +nil+ themselves (text,
@@ -197,19 +136,13 @@ module Pikuri
197
136
  non_printable.to_f / bytes.bytesize > BINARY_NONPRINTABLE_THRESHOLD
198
137
  end
199
138
 
200
- # Read +path+ and return its content as plain UTF-8 text. Two
201
- # extraction paths, picked by {.detect_mime}:
202
- #
203
- # * **PDF** — walked page-by-page via +pdf-reader+; each page's
204
- # extracted text is stripped and pages are joined with a blank
205
- # line. A scanned-image PDF (no extractable text) comes back as
206
- # the empty String — a deliberate silent skip, callers detect by
207
- # length if they care.
208
- # * **Plain text** — anything that {.detect_mime} doesn't
209
- # recognise and that {.binary?} accepts. Read with UTF-8
210
- # encoding; behaviour on non-UTF-8 bytes is whatever +File.read+
211
- # does with +encoding: Encoding::UTF_8+ (which is "leave invalid
212
- # bytes in, let downstream decide").
139
+ # Read +path+ and return its content as plain UTF-8 text, routed
140
+ # through the {Extractor} registry: anything
141
+ # unrecognised-but-textual passes through verbatim
142
+ # ({Extractor::Passthrough}); with pikuri-pdf registered, PDFs
143
+ # are extracted with +"--- Page N ---"+ markers (a scanned-image
144
+ # PDF with no extractable text comes back as the empty String, a
145
+ # deliberate silent skip — callers detect by length if they care).
213
146
  #
214
147
  # Refusal cases — all raise rather than returning a sentinel
215
148
  # because the callers are internal pikuri code, not an LLM
@@ -220,13 +153,11 @@ module Pikuri
220
153
  # * Path is a directory → +ArgumentError+.
221
154
  # * Image (PNG / JPEG / GIF / WebP per {.detect_mime}) →
222
155
  # +ArgumentError+; images aren't text.
223
- # * Binary content (per {.binary?}) and not a recognised MIME →
224
- # +ArgumentError+.
225
- # * Malformed PDF — +pdf-reader+'s
226
- # +MalformedPDFError+ / +UnsupportedFeatureError+ /
227
- # +InvalidPageError+ are re-raised as a +RuntimeError+ with the
228
- # path included so callers don't need to know pdf-reader's
229
- # exception hierarchy.
156
+ # * Content no extractor claims (opaque binary) →
157
+ # +ArgumentError+, mapped from {Extractor::Unsupported}.
158
+ # * Extraction failure (malformed PDF, ...) → +RuntimeError+ with
159
+ # the path included, mapped from {Extractor::Error} so callers
160
+ # don't need to know any extractor's exception hierarchy.
230
161
  #
231
162
  # @param path [Pathname] file to read.
232
163
  # @return [String] UTF-8 text. May be empty (empty text file, or
@@ -234,56 +165,26 @@ module Pikuri
234
165
  # @raise [ArgumentError] if +path+ isn't a +Pathname+, points at
235
166
  # a directory, is an image, or is binary.
236
167
  # @raise [Errno::ENOENT] if +path+ doesn't exist.
237
- # @raise [RuntimeError] on a malformed / unsupported PDF.
168
+ # @raise [RuntimeError] on an extraction failure (malformed /
169
+ # unsupported PDF, ...).
238
170
  def read_as_text(path)
239
- raise ArgumentError, "expected Pathname, got #{path.class}" unless path.is_a?(Pathname)
240
- raise Errno::ENOENT, path.to_s unless path.exist?
241
- raise ArgumentError, "#{path} is a directory" if path.directory?
242
-
243
- mime = detect_mime(path)
244
- return read_pdf_text(path) if mime == 'application/pdf'
245
- raise ArgumentError, "#{path} is an image (#{mime}); cannot extract as text" if mime&.start_with?('image/')
246
- raise ArgumentError, "#{path} appears to be binary; cannot extract as text" if binary?(path)
247
-
248
- path.read(encoding: Encoding::UTF_8)
171
+ mime = guard_extractable(path)
172
+ path.open('rb') { |io| Extractor.extract(io, content_type: mime) }
173
+ rescue Extractor::Unsupported
174
+ raise ArgumentError, "#{path} appears to be binary; cannot extract as text"
175
+ rescue Extractor::Error => e
176
+ raise "Cannot extract text from #{path}: #{e.message}"
249
177
  end
250
178
 
251
- # Walk a PDF page-by-page via +pdf-reader+, returning a single
252
- # String with non-empty page texts joined by blank lines. Catches
253
- # the three +PDF::Reader+ exceptions Workspace::Read also handles
254
- # and re-raises them as +RuntimeError+ with the path included.
255
- #
256
- # @param path [Pathname]
257
- # @return [String]
258
- # @raise [RuntimeError] on malformed / unsupported PDF.
259
- def read_pdf_text(path)
260
- pages = path.open('rb') do |io|
261
- ::PDF::Reader.new(io).pages.map { |p| p.text.strip }
262
- end
263
- pages.reject(&:empty?).join("\n\n")
264
- rescue ::PDF::Reader::MalformedPDFError,
265
- ::PDF::Reader::UnsupportedFeatureError,
266
- ::PDF::Reader::InvalidPageError => e
267
- raise "Cannot extract PDF text from #{path}: " \
268
- "#{e.class.name.split('::').last}: #{e.message}"
269
- end
270
- private_class_method :read_pdf_text
271
-
272
- # Extract +path+ as text and return a windowed {Page}: the lines
273
- # from +offset+ (1-indexed) up to +limit+ of them, stopping early
274
- # if +max_bytes+ is reached, with over-long lines truncated at
275
- # +max_line_length+. Lazy by design — a text file is streamed
276
- # line-by-line and a PDF is parsed page-by-page only until the
277
- # window fills, so reading the first page of a 500-page PDF parses
278
- # a handful of pages, not all of them.
279
- #
280
- # Same routing and refusal contract as {.read_as_text}: PDFs are
281
- # extracted (with +"--- Page N ---"+ marker lines, unlike
282
- # {.read_as_text}'s marker-free join — paging is a display path,
283
- # the marker-free form stays the indexing path); images, binaries,
284
- # directories, missing files, and malformed PDFs all raise rather
285
- # than returning a sentinel. The LLM-facing callers map those into
286
- # +"Error: ..."+ observations themselves.
179
+ # Extract +path+ and return a windowed {Extractor::Page}: the
180
+ # lines from +offset+ (1-indexed) up to +limit+ of them, stopping
181
+ # early if +max_bytes+ is reached, with over-long lines truncated
182
+ # at +max_line_length+. Same routing and refusal contract as
183
+ # {.read_as_text}; the windowing semantics (including the lazy
184
+ # +extract_lines+ consumption that stops parsing once the window
185
+ # fills) are {Extractor.extract_paged}'s.
186
+ # The LLM-facing callers map the exceptions into +"Error: ..."+
187
+ # observations themselves.
287
188
  #
288
189
  # @param path [Pathname] file to read.
289
190
  # @param offset [Integer] 1-indexed first line to include. The
@@ -292,141 +193,48 @@ module Pikuri
292
193
  # validates +limit >= 1+.
293
194
  # @param max_bytes [Integer] hard byte cap on collected content.
294
195
  # @param max_line_length [Integer] per-line truncation threshold.
295
- # @return [Page] the windowed slice.
196
+ # @return [Extractor::Page] the windowed slice.
296
197
  # @raise [ArgumentError] if +path+ isn't a +Pathname+, is a
297
198
  # directory, an image, or binary.
298
199
  # @raise [Errno::ENOENT] if +path+ doesn't exist.
299
- # @raise [RuntimeError] on a malformed / unsupported PDF.
300
- def read_as_text_paged(path, offset: 1, limit: PAGE_DEFAULT_LIMIT,
301
- max_bytes: PAGE_MAX_BYTES, max_line_length: PAGE_MAX_LINE_LENGTH)
200
+ # @raise [RuntimeError] on an extraction failure (malformed /
201
+ # unsupported PDF, ...).
202
+ def read_as_text_paged(path, offset: 1, limit: Extractor::PAGE_DEFAULT_LIMIT,
203
+ max_bytes: Extractor::PAGE_MAX_BYTES,
204
+ max_line_length: Extractor::PAGE_MAX_LINE_LENGTH)
205
+ mime = guard_extractable(path)
206
+ path.open('rb') do |io|
207
+ Extractor.extract_paged(io, content_type: mime, offset: offset, limit: limit,
208
+ max_bytes: max_bytes, max_line_length: max_line_length)
209
+ end
210
+ rescue Extractor::Unsupported
211
+ raise ArgumentError, "#{path} appears to be binary; cannot extract as text"
212
+ rescue Extractor::Error => e
213
+ raise "Cannot extract text from #{path}: #{e.message}"
214
+ end
215
+
216
+ # The shared path-level refusals for {.read_as_text} /
217
+ # {.read_as_text_paged}: must be an existing non-directory
218
+ # +Pathname+, and not an image (images are data for a vision
219
+ # model, never text). Returns the {.detect_mime} result so the
220
+ # caller can pass it to the {Extractor} as the content-type hint.
221
+ #
222
+ # @param path [Pathname]
223
+ # @return [String, nil] the sniffed MIME type.
224
+ # @raise [ArgumentError] on a non-Pathname, a directory, or an
225
+ # image.
226
+ # @raise [Errno::ENOENT] if +path+ doesn't exist.
227
+ def guard_extractable(path)
302
228
  raise ArgumentError, "expected Pathname, got #{path.class}" unless path.is_a?(Pathname)
303
229
  raise Errno::ENOENT, path.to_s unless path.exist?
304
230
  raise ArgumentError, "#{path} is a directory" if path.directory?
305
231
 
306
232
  mime = detect_mime(path)
307
- if mime == 'application/pdf'
308
- return paged_pdf(path, offset: offset, limit: limit,
309
- max_bytes: max_bytes, max_line_length: max_line_length)
310
- end
311
233
  raise ArgumentError, "#{path} is an image (#{mime}); cannot extract as text" if mime&.start_with?('image/')
312
- raise ArgumentError, "#{path} appears to be binary; cannot extract as text" if binary?(path)
313
-
314
- paged_text(path, offset: offset, limit: limit,
315
- max_bytes: max_bytes, max_line_length: max_line_length)
316
- end
317
-
318
- # Stream a text file line-by-line into a {Page}. Keeps counting
319
- # lines past the collection window so +total_lines+ can report the
320
- # real total when the line limit (not the byte cap) stopped
321
- # collection; on the byte cap it breaks and leaves +total_lines+
322
- # +nil+ (the rest of the file is never read).
323
- #
324
- # @return [Page] +kind: :text+.
325
- def paged_text(path, offset:, limit:, max_bytes:, max_line_length:)
326
- start_index = offset - 1
327
- collected = []
328
- total_lines = 0
329
- bytes = 0
330
- byte_capped = false
331
- more = false
332
-
333
- path.each_line do |raw|
334
- total_lines += 1
335
- next if total_lines <= start_index
336
-
337
- if collected.length >= limit
338
- more = true
339
- next
340
- end
341
-
342
- line = truncate_line(raw.chomp, max_line_length)
343
- size = line.bytesize + 1 # +1 for the joining newline
344
- if bytes + size > max_bytes
345
- byte_capped = true
346
- more = true
347
- break
348
- end
349
- collected << line
350
- bytes += size
351
- end
352
-
353
- Page.new(lines: collected, start_line: offset,
354
- total_lines: byte_capped ? nil : total_lines,
355
- more: more, byte_capped: byte_capped, kind: :text)
356
- end
357
- private_class_method :paged_text
358
-
359
- # PDF counterpart to {paged_text}: walk +pdf-reader+'s lazy page
360
- # iterator, emitting a +"--- Page N ---"+ header line then each
361
- # line of the page's text, applying the same offset / limit /
362
- # byte-cap contract. The +throw :done+ short-circuits both loops
363
- # the moment the window fills, so parsing stops — which is why a
364
- # PDF that stops early can't report +total_lines+ (it would have
365
- # to parse every page to count).
366
- #
367
- # @return [Page] +kind: :pdf+.
368
- # @raise [RuntimeError] on a malformed / unsupported PDF.
369
- def paged_pdf(path, offset:, limit:, max_bytes:, max_line_length:)
370
- start_index = offset - 1
371
- collected = []
372
- total_lines = 0
373
- bytes = 0
374
- byte_capped = false
375
- more = false
376
-
377
- catch(:done) do
378
- path.open('rb') do |io|
379
- reader = ::PDF::Reader.new(io)
380
- reader.pages.each_with_index do |page, idx|
381
- text = page.text.strip
382
- next if text.empty?
383
-
384
- ["--- Page #{idx + 1} ---", *text.split("\n")].each do |raw|
385
- total_lines += 1
386
- next if total_lines <= start_index
387
-
388
- if collected.length >= limit
389
- more = true
390
- throw :done
391
- end
392
-
393
- line = truncate_line(raw, max_line_length)
394
- size = line.bytesize + 1
395
- if bytes + size > max_bytes
396
- byte_capped = true
397
- more = true
398
- throw :done
399
- end
400
- collected << line
401
- bytes += size
402
- end
403
- end
404
- end
405
- end
406
-
407
- Page.new(lines: collected, start_line: offset,
408
- total_lines: more ? nil : total_lines,
409
- more: more, byte_capped: byte_capped, kind: :pdf)
410
- rescue ::PDF::Reader::MalformedPDFError,
411
- ::PDF::Reader::InvalidPageError,
412
- ::PDF::Reader::UnsupportedFeatureError => e
413
- raise "Cannot extract PDF text from #{path}: " \
414
- "#{e.class.name.split('::').last}: #{e.message}"
415
- end
416
- private_class_method :paged_pdf
417
-
418
- # Truncate +line+ to +max_line_length+ chars, appending
419
- # {PAGE_LINE_TRUNCATION_MARKER} when it overflows.
420
- #
421
- # @param line [String]
422
- # @param max_line_length [Integer]
423
- # @return [String]
424
- def truncate_line(line, max_line_length)
425
- return line if line.length <= max_line_length
426
234
 
427
- line[0, max_line_length] + PAGE_LINE_TRUNCATION_MARKER
235
+ mime
428
236
  end
429
- private_class_method :truncate_line
237
+ private_class_method :guard_extractable
430
238
 
431
239
  # Coerce an +input+ argument into a bytes String for the sniffs.
432
240
  # +String+ inputs are returned as-is (caller already sampled);
@@ -0,0 +1,179 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pikuri
4
+ # Renders attacker-controlled text safe to display, and reports *why*
5
+ # it was unsafe.
6
+ #
7
+ # Every string an LLM composes is untrusted: a bash command, a tool
8
+ # observation echoed back to the user, a description it wrote for a
9
+ # confirmation prompt. A model that is broken — or, far more likely,
10
+ # being driven by a prompt injection — can embed bytes that a terminal
11
+ # acts on rather than prints: a carriage return that overwrites the
12
+ # line the user just read, an ESC that recolors or repositions, a
13
+ # backspace that erases, a bidirectional override that reorders text so
14
+ # it reads differently than it runs, a zero-width character that hides
15
+ # in plain sight, or a Cyrillic +а+ masquerading as a Latin +a+. The
16
+ # whole point of a confirmation prompt collapses if the bytes the user
17
+ # approves are not the bytes that execute.
18
+ #
19
+ # {.sanitize} is the one chrome-independent primitive every renderer
20
+ # (terminal, TUI, web) routes through. It does two things and returns
21
+ # both as a {Result}:
22
+ #
23
+ # 1. *Neutralize* — make the dangerous bytes visible without changing
24
+ # structure. Control bytes become +\xNN+, bidi/zero-width codepoints
25
+ # become +\u{NNNN}+, tab becomes +\t+. Newlines are preserved
26
+ # (multi-line commands are normal). This is *faithful, not
27
+ # beautifying*: it never collapses runs of whitespace or rewrites a
28
+ # tab to a space, because the user must see exactly what they are
29
+ # approving — a Makefile's leading tab stays visibly a tab. A web
30
+ # chrome composes +html_escape(sanitize(s).text)+; the HTML layer is
31
+ # the caller's, not ours.
32
+ # 2. *Warn* — return a {Warning} per category detected, each a semantic
33
+ # record (kind + offending tokens + a plain-English explanation).
34
+ # Presentation is the chrome's: a terminal renders these bold yellow,
35
+ # a web client a banner. The {Warning} carries no color or markup.
36
+ #
37
+ # == Scope (deliberately closed)
38
+ #
39
+ # Detection covers the *invisibility / cursor-control / reordering*
40
+ # attack classes completely, because each is a finite, enumerable set
41
+ # of codepoints: C0 controls, C1 controls (a second ANSI introducer on
42
+ # some emulators), DEL, the bidi overrides, and the zero-width
43
+ # characters. On top of that, {.sanitize} flags *mixed-script tokens* —
44
+ # a single word combining letters from two or more of Latin, Cyrillic and Greek, which
45
+ # is the signature of a homoglyph spoof and has near-zero false
46
+ # positives on real text (humans do not weld two alphabets inside one
47
+ # word; +café+ is all-Latin, +Москва+ all-Cyrillic, only +Pаypal+ mixes).
48
+ #
49
+ # Two confusable classes are explicitly *out of scope*, because
50
+ # detecting them needs Unicode confusables tables and produces heavy
51
+ # false positives on legitimate multilingual text:
52
+ #
53
+ # * *Whole-script* homoglyphs — an entirely-Cyrillic string that merely
54
+ # looks Latin (no mixing to detect).
55
+ # * *Single-symbol* confusables — the Greek question mark +;+ (U+037E)
56
+ # that looks like a semicolon, full-width forms, the division slash.
57
+ #
58
+ # "Solid" here means complete on the classes above, not exhaustive over
59
+ # all of Unicode.
60
+ module Sanitizer
61
+ # One reason a piece of text was flagged, ready for a chrome to
62
+ # render however it surfaces warnings (bold yellow line, web banner).
63
+ #
64
+ # * +kind+ — a {Symbol} category: +:backspace+, +:control_bytes+,
65
+ # +:bidi+, +:zero_width+, or +:mixed_script+.
66
+ # * +offenders+ — the distinct offending tokens, in first-seen order:
67
+ # the escaped forms (+"\\x1b"+, +"\\u{202e}"+) for byte categories,
68
+ # the raw tokens (+"Pаypal"+) for +:mixed_script+.
69
+ # * +explanation+ — a one-line, chrome-agnostic English summary of
70
+ # what the bytes can do.
71
+ Warning = Data.define(:kind, :offenders, :explanation)
72
+
73
+ # The output of {Sanitizer.sanitize}.
74
+ #
75
+ # * +text+ — the neutralized string, safe to print literally.
76
+ # * +warnings+ — {Array}<{Warning}>, empty when nothing was flagged.
77
+ Result = Data.define(:text, :warnings)
78
+
79
+ # Bidirectional-override codepoints: the explicit LRO/RLO/PDF/LRE/RLE
80
+ # set plus the isolate set (LRI/RLI/FSI/PDI). Reordering attacks.
81
+ BIDI_OVERRIDES = [*0x202a..0x202e, *0x2066..0x2069].freeze
82
+
83
+ # Zero-width and invisible codepoints: ZWSP, ZWNJ, ZWJ, and the BOM /
84
+ # zero-width no-break space.
85
+ ZERO_WIDTH = [0x200b, 0x200c, 0x200d, 0xfeff].freeze
86
+
87
+ # Codepoints {.sanitize} rewrites: C0 controls including tab (U+0009)
88
+ # but *excluding* newline (U+000A, which passes through untouched),
89
+ # C1 controls + DEL (U+007F–009F), the zero-width set, and the bidi
90
+ # overrides. Newline is the one control character a faithful render
91
+ # must keep, so the C0 range is split around it.
92
+ SUSPECT = /[\u0000-\u0009\u000b-\u001f\u007f-\u009f\u200b-\u200d\u202a-\u202e\u2066-\u2069\ufeff]/
93
+
94
+ # The three Latin-confusable scripts whose mixing inside one token
95
+ # signals a homoglyph spoof. Punctuation, digits and spaces are the
96
+ # +Common+ script and match none of these, so they never count toward
97
+ # the "two distinct scripts" threshold.
98
+ CONFUSABLE_SCRIPTS = { 'Latin' => /\p{Latin}/, 'Cyrillic' => /\p{Cyrillic}/, 'Greek' => /\p{Greek}/ }.freeze
99
+
100
+ # Neutralize +text+ for literal display and report what was flagged.
101
+ #
102
+ # @param text [String] attacker-controlled text (an LLM-composed
103
+ # command, description, or tool observation), e.g.
104
+ # +"echo hi\rrm -rf /"+
105
+ # @return [Result] the neutralized +text+ plus an {Array}<{Warning}>
106
+ # (empty when clean)
107
+ def self.sanitize(text)
108
+ backspace = false
109
+ control = []
110
+ bidi = []
111
+ zero_width = []
112
+
113
+ clean = text.gsub(SUSPECT) do |ch|
114
+ cp = ch.ord
115
+ if cp == 0x09
116
+ '\\t'
117
+ elsif cp == 0x08
118
+ backspace = true
119
+ '\\x08'
120
+ elsif BIDI_OVERRIDES.include?(cp)
121
+ format('\\u{%04x}', cp).tap { |t| bidi << t }
122
+ elsif ZERO_WIDTH.include?(cp)
123
+ format('\\u{%04x}', cp).tap { |t| zero_width << t }
124
+ else
125
+ format('\\x%02x', cp).tap { |t| control << t }
126
+ end
127
+ end
128
+
129
+ Result.new(text: clean, warnings: warnings_for(backspace, control, bidi, zero_width, mixed_script_tokens(text)))
130
+ end
131
+
132
+ # Tokens (whitespace-delimited runs) that combine letters from two or
133
+ # more of {CONFUSABLE_SCRIPTS} — the homoglyph-spoof signature.
134
+ #
135
+ # @param text [String]
136
+ # @return [Array<String>] distinct offending tokens, first-seen order
137
+ def self.mixed_script_tokens(text)
138
+ text.split(/\s+/).reject(&:empty?).select do |token|
139
+ CONFUSABLE_SCRIPTS.count { |_name, re| token.match?(re) } >= 2
140
+ end.uniq
141
+ end
142
+
143
+ # Assemble one {Warning} per non-empty category, in a stable order
144
+ # (most-deceptive first).
145
+ #
146
+ # @return [Array<Warning>]
147
+ def self.warnings_for(backspace, control, bidi, zero_width, mixed)
148
+ out = []
149
+ if backspace
150
+ out << Warning.new(kind: :backspace, offenders: ['\\x08'],
151
+ explanation: 'Backspace characters present — the model may be trying to visually erase ' \
152
+ 'part of the text after you have read it.')
153
+ end
154
+ unless bidi.empty?
155
+ out << Warning.new(kind: :bidi, offenders: bidi.uniq,
156
+ explanation: "Bidirectional-override characters present (#{bidi.uniq.join(' ')}) — these " \
157
+ 'can reorder how text is displayed so it reads differently than it runs.')
158
+ end
159
+ unless zero_width.empty?
160
+ out << Warning.new(kind: :zero_width, offenders: zero_width.uniq,
161
+ explanation: "Zero-width / invisible characters present (#{zero_width.uniq.join(' ')}) — " \
162
+ 'the text may contain characters you cannot see.')
163
+ end
164
+ unless control.empty?
165
+ out << Warning.new(kind: :control_bytes, offenders: control.uniq,
166
+ explanation: "Non-printable control bytes present (#{control.uniq.join(' ')}) — in a " \
167
+ 'terminal these can move the cursor, change colors, or hide output.')
168
+ end
169
+ unless mixed.empty?
170
+ out << Warning.new(kind: :mixed_script, offenders: mixed,
171
+ explanation: "Mixed-script tokens present (#{mixed.join(', ')}) — letters from different " \
172
+ "alphabets are combined within one word, a classic homoglyph spoof (e.g. " \
173
+ "Cyrillic 'а' standing in for Latin 'a').")
174
+ end
175
+ out
176
+ end
177
+ private_class_method :warnings_for
178
+ end
179
+ end