pikuri-core 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pikuri/agent/configurator.rb +9 -2
- data/lib/pikuri/agent/context_window_detector.rb +70 -10
- data/lib/pikuri/agent/control/interloper.rb +10 -2
- data/lib/pikuri/agent/event.rb +15 -0
- data/lib/pikuri/agent/extension.rb +37 -9
- data/lib/pikuri/agent/listener/terminal.rb +22 -36
- data/lib/pikuri/agent.rb +174 -73
- data/lib/pikuri/extractor/html.rb +303 -0
- data/lib/pikuri/extractor/passthrough.rb +64 -0
- data/lib/pikuri/extractor.rb +314 -0
- data/lib/pikuri/file_type.rb +87 -59
- data/lib/pikuri/finalizers.rb +118 -0
- data/lib/pikuri/paths.rb +29 -0
- data/lib/pikuri/subprocess.rb +109 -12
- data/lib/pikuri/tool/calculator.rb +213 -41
- data/lib/pikuri/tool/fetch.rb +10 -9
- data/lib/pikuri/tool/scraper.rb +186 -0
- data/lib/pikuri/tool/web_scrape.rb +5 -5
- data/lib/pikuri/version.rb +1 -1
- data/lib/pikuri-core.rb +0 -1
- metadata +8 -62
- data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
- data/lib/pikuri/tool/scraper/html.rb +0 -285
- data/lib/pikuri/tool/scraper/pdf.rb +0 -54
- data/lib/pikuri/tool/scraper/simple.rb +0 -183
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
|
|
4
|
+
# The format→text extraction seam: one registry of extractors that
|
|
5
|
+
# turn an +IO+ of some recognised format (HTML and plain text out
|
|
6
|
+
# of the box; PDF / office formats via the pikuri-pdf /
|
|
7
|
+
# pikuri-extractors plug-in gems) into Markdown-flavoured UTF-8
|
|
8
|
+
# text, consumed through two front doors:
|
|
9
|
+
#
|
|
10
|
+
# * {.extract} — the whole document as one String. The shape the
|
|
11
|
+
# indexing / caching callers want ({Pikuri::VectorDb}'s indexer,
|
|
12
|
+
# {Tool::WebScrape}'s URL cache): no windowing, no presentation.
|
|
13
|
+
# * {.extract_paged} — the LLM-tool shape: the same extraction,
|
|
14
|
+
# windowed to a line range with a byte cap, returned as a {Page}
|
|
15
|
+
# the caller renders. Backs +Workspace::Read+ and
|
|
16
|
+
# +VectorDb::Tools::Read+ (via the {FileType} path wrappers) so
|
|
17
|
+
# the offset/limit/byte-cap logic lives in one tested place.
|
|
18
|
+
#
|
|
19
|
+
# Both callers — +Tool::Scraper+ dispatching on the HTTP
|
|
20
|
+
# +Content-Type+ header for the web tools, and {FileType} resolving
|
|
21
|
+
# local paths — route through this one registry, so both share one
|
|
22
|
+
# set of format truths and "support a new format" is a registry
|
|
23
|
+
# entry (pikuri-pdf and pikuri-extractors plug PDF and office
|
|
24
|
+
# formats in without pikuri-core knowing), not a new special case
|
|
25
|
+
# in two dispatchers.
|
|
26
|
+
#
|
|
27
|
+
# == The extractor duck type
|
|
28
|
+
#
|
|
29
|
+
# Each {.registry} entry implements three methods:
|
|
30
|
+
#
|
|
31
|
+
# * +matches?(sample:, content_type:)+ → +Boolean+ — claim the
|
|
32
|
+
# content. +sample+ is the leading {FileType::SAMPLE_BYTES} bytes
|
|
33
|
+
# (for magic-byte sniffs); +content_type+ is the normalized HTTP
|
|
34
|
+
# +Content-Type+ for web content, the {FileType.detect_mime}
|
|
35
|
+
# result for local files, and may be +nil+ ("no transport
|
|
36
|
+
# metadata — sniff if you can").
|
|
37
|
+
# * +extract(io)+ → +String+ — the whole document as
|
|
38
|
+
# Markdown-flavoured UTF-8 text. Raises {Error} on content the
|
|
39
|
+
# extractor claimed but cannot parse (malformed PDF, ...).
|
|
40
|
+
# * +kind+ → +Symbol+ — a short tag (+:text+ / +:pdf+ / +:html+)
|
|
41
|
+
# carried on {Page#kind} so rendering callers can word
|
|
42
|
+
# format-specific trailers ("End of PDF", the scanned-image
|
|
43
|
+
# hint) without re-sniffing.
|
|
44
|
+
#
|
|
45
|
+
# plus one *optional* method for formats whose lines can be
|
|
46
|
+
# produced incrementally:
|
|
47
|
+
#
|
|
48
|
+
# * +extract_lines(io)+ → +Enumerator<String>+ — the same content
|
|
49
|
+
# as +extract+, as a lazy stream of already-+chomp+ed lines.
|
|
50
|
+
# {.extract_paged} prefers this when present and stops consuming
|
|
51
|
+
# the moment the window fills, so the rest of the document is
|
|
52
|
+
# never parsed (pikuri-pdf's extractor: pdf-reader's page list
|
|
53
|
+
# parses on access; {Passthrough}: the IO is read line-by-line).
|
|
54
|
+
# The enumerator
|
|
55
|
+
# must be consumed while +io+ is still open, and may raise
|
|
56
|
+
# {Error} mid-iteration. Extractors that need the whole document
|
|
57
|
+
# to produce anything ({HTML}: Readability walks the full DOM —
|
|
58
|
+
# true of any subprocess-based extractor too) simply omit it;
|
|
59
|
+
# {.extract_paged} then extracts in full and windows the result.
|
|
60
|
+
#
|
|
61
|
+
# Windowing itself (offset / limit / byte cap / line truncation) is
|
|
62
|
+
# presentation and deliberately lives once in {.extract_paged}, not
|
|
63
|
+
# per extractor — +extract_lines+ is line *production*, the only
|
|
64
|
+
# genuinely format-specific half of paging.
|
|
65
|
+
#
|
|
66
|
+
# == Errors
|
|
67
|
+
#
|
|
68
|
+
# Both failure modes are failures the *caller's* LLM can react to,
|
|
69
|
+
# so they share one rescuable root:
|
|
70
|
+
#
|
|
71
|
+
# * {Unsupported} — nothing in {.registry} claimed the content
|
|
72
|
+
# (opaque binary, an unhandled content-type).
|
|
73
|
+
# * {Error} (the root) — an extractor claimed the content but the
|
|
74
|
+
# parse failed (malformed PDF, ...).
|
|
75
|
+
#
|
|
76
|
+
# Callers map them to their own conventions:
|
|
77
|
+
# +Tool::Scraper+ re-raises both as +FetchError+;
|
|
78
|
+
# {FileType.read_as_text} maps {Unsupported} to the +ArgumentError+
|
|
79
|
+
# binary refusal and {Error} to a +RuntimeError+ carrying the path.
|
|
80
|
+
module Extractor
|
|
81
|
+
module_function
|
|
82
|
+
|
|
83
|
+
# Raised when an extractor claims content but fails to parse it
|
|
84
|
+
# (e.g. a malformed PDF). Message is LLM-presentable.
|
|
85
|
+
Error = Class.new(StandardError)
|
|
86
|
+
|
|
87
|
+
# Raised by {.extract} / {.extract_paged} when no registry entry
|
|
88
|
+
# claims the content. Subclass of {Error} so callers that don't
|
|
89
|
+
# care about the distinction rescue one class.
|
|
90
|
+
Unsupported = Class.new(Error)
|
|
91
|
+
|
|
92
|
+
# @return [Integer] default line-window size for {.extract_paged}
|
|
93
|
+
# when the caller omits +limit+.
|
|
94
|
+
PAGE_DEFAULT_LIMIT = 2000
|
|
95
|
+
|
|
96
|
+
# @return [Integer] default hard byte cap on the content collected
|
|
97
|
+
# by a single {.extract_paged} call. Bypassable by paging via
|
|
98
|
+
# +offset+. The rendered output is slightly larger (line
|
|
99
|
+
# numbering, trailer) — that's the caller's concern.
|
|
100
|
+
PAGE_MAX_BYTES = 50 * 1024
|
|
101
|
+
|
|
102
|
+
# @return [Integer] default per-line character cap;
|
|
103
|
+
# {.extract_paged} truncates longer lines and appends
|
|
104
|
+
# {PAGE_LINE_TRUNCATION_MARKER}.
|
|
105
|
+
PAGE_MAX_LINE_LENGTH = 2000

# @return [String] suffix appended to a line truncated at
#   {PAGE_MAX_LINE_LENGTH}. Frozen explicitly: the frozen-string magic
#   comment does not freeze interpolated literals on Ruby 3.0+, and this
#   constant is shared by every caller.
PAGE_LINE_TRUNCATION_MARKER = "... (line truncated to #{PAGE_MAX_LINE_LENGTH} chars)".freeze
|
|
110
|
+
|
|
111
|
+
# One windowed slice of a document, returned by {.extract_paged}.
|
|
112
|
+
# The caller turns this into an observation; this struct carries
|
|
113
|
+
# everything a trailer needs without the caller re-reading the
|
|
114
|
+
# document.
|
|
115
|
+
#
|
|
116
|
+
# == Fields
|
|
117
|
+
#
|
|
118
|
+
# * +lines+ — +Array<String>+, the collected window. Already
|
|
119
|
+
# per-line truncated (with {PAGE_LINE_TRUNCATION_MARKER}); *not*
|
|
120
|
+
# line-numbered — numbering is presentation the caller adds. For
|
|
121
|
+
# a PDF the array includes the +"--- Page N ---"+ marker lines
|
|
122
|
+
# pikuri-pdf's extractor emits, which count toward +limit+ / the
|
|
123
|
+
# byte cap like any other line.
|
|
124
|
+
# * +start_line+ — the 1-indexed line number of +lines.first+
|
|
125
|
+
# (i.e. the +offset+ the caller asked for). +lines.last+ is at
|
|
126
|
+
# +start_line + lines.length - 1+.
|
|
127
|
+
# * +total_lines+ — total line count of the document when known,
|
|
128
|
+
# else +nil+. Known when the read reached EOF, when the format
|
|
129
|
+
# was extracted in full (no +extract_lines+ — e.g. HTML), or
|
|
130
|
+
# when the lazy stream is cheap enough to count to the end
|
|
131
|
+
# (plain text). +nil+ when a lazy stream stopped early — the
|
|
132
|
+
# byte cap fired, or a PDF filled the window before its last
|
|
133
|
+
# page (counting the rest would mean parsing every page,
|
|
134
|
+
# defeating the laziness).
|
|
135
|
+
# * +more+ — +true+ if content remains past this window (the
|
|
136
|
+
# caller should offer +offset = start_line + lines.length+).
|
|
137
|
+
# * +byte_capped+ — +true+ if the byte cap (not the line limit)
|
|
138
|
+
# was the stopping criterion.
|
|
139
|
+
# * +kind+ — the matched extractor's +kind+ tag (+:text+ /
|
|
140
|
+
# +:pdf+ / +:html+); lets the caller word format-specific
|
|
141
|
+
# trailers and the empty-document message.
|
|
142
|
+
#
|
|
143
|
+
# An empty document yields +lines: []+, +total_lines: 0+; an
|
|
144
|
+
# +offset+ past EOF yields +lines: []+ with +total_lines+ set to
|
|
145
|
+
# the real (non-zero) count — the caller distinguishes the two.
|
|
146
|
+
Page = Data.define(:lines, :start_line, :total_lines, :more, :byte_capped, :kind)
|
|
147
|
+
|
|
148
|
+
# The extractor registry, consulted in order — first match wins.
|
|
149
|
+
# Core ships two entries: {HTML} matches on content-type, and
|
|
150
|
+
# {Passthrough} is the terminal plain-text arm. A gem adding a
|
|
151
|
+
# format picks its insertion point by the strength of its claim:
|
|
152
|
+
# a magic-byte sniff that never misfires on text goes at the
|
|
153
|
+
# *front* so it beats {HTML}'s content-type match even under a
|
|
154
|
+
# lying header (+registry.unshift(X)+ — pikuri-pdf does this);
|
|
155
|
+
# a content-type / weaker-sniff claimer inserts before the
|
|
156
|
+
# terminal entry (+registry.insert(-2, X)+ — pikuri-extractors
|
|
157
|
+
# does this).
|
|
158
|
+
#
|
|
159
|
+
# @return [Array<#matches?>] mutable, deliberately — this is the
|
|
160
|
+
# plug-in seam.
|
|
161
|
+
def registry
  # Built on first access and memoised, so every later call (and every
  # plug-in mutating the list) sees the same Array instance.
  return @registry if @registry

  @registry = [HTML, Passthrough]
end
|
|
164
|
+
|
|
165
|
+
# Extract the whole document behind +io+ as one Markdown-flavoured
|
|
166
|
+
# UTF-8 String. May be empty (empty text file, scanned-image PDF
|
|
167
|
+
# with no extractable text).
|
|
168
|
+
#
|
|
169
|
+
# @param io [IO, StringIO] seekable IO positioned at the start of
|
|
170
|
+
# the content; this method reads a leading sample for the
|
|
171
|
+
# +matches?+ sniff and rewinds before extracting.
|
|
172
|
+
# @param content_type [String, nil] normalized content-type when
|
|
173
|
+
# the transport supplies one (HTTP header, {FileType.detect_mime}
|
|
174
|
+
# result); +nil+ when unknown — extractors then rely on their
|
|
175
|
+
# byte sniffs.
|
|
176
|
+
# @return [String]
|
|
177
|
+
# @raise [Unsupported] when no registry entry claims the content.
|
|
178
|
+
# @raise [Error] when the matched extractor cannot parse it.
|
|
179
|
+
def extract(io, content_type: nil)
  # Pick the extractor first (the lookup samples +io+ and rewinds it),
  # then let the chosen extractor consume the stream from the start.
  chosen = extractor_for(io, content_type)
  chosen.extract(io)
end
|
|
182
|
+
|
|
183
|
+
# Extract +io+ and return a windowed {Page}: the lines from
|
|
184
|
+
# +offset+ (1-indexed) up to +limit+ of them, stopping early if
|
|
185
|
+
# +max_bytes+ is reached, with over-long lines truncated at
|
|
186
|
+
# +max_line_length+.
|
|
187
|
+
#
|
|
188
|
+
# Lazy where the format allows: extractors that implement
|
|
189
|
+
# +extract_lines+ (plain text; pikuri-pdf's PDF) are consumed
|
|
190
|
+
# only until the window fills — reading the first window of a
|
|
191
|
+
# 500-page PDF parses a handful of pages, and the first page of
|
|
192
|
+
# a gigabyte log never loads it. Extractors without it (HTML) are extracted
|
|
193
|
+
# in full and then windowed, which is also what makes their
|
|
194
|
+
# +total_lines+ always exact.
|
|
195
|
+
#
|
|
196
|
+
# @param io [IO, StringIO] seekable IO positioned at the start.
|
|
197
|
+
# @param content_type [String, nil] as for {.extract}.
|
|
198
|
+
# @param offset [Integer] 1-indexed first line to include. The
|
|
199
|
+
# caller is responsible for validating +offset >= 1+.
|
|
200
|
+
# @param limit [Integer] maximum lines to collect. Caller
|
|
201
|
+
# validates +limit >= 1+.
|
|
202
|
+
# @param max_bytes [Integer] hard byte cap on collected content.
|
|
203
|
+
# @param max_line_length [Integer] per-line truncation threshold.
|
|
204
|
+
# @return [Page] the windowed slice.
|
|
205
|
+
# @raise [Unsupported] when no registry entry claims the content.
|
|
206
|
+
# @raise [Error] when the matched extractor cannot parse it.
|
|
207
|
+
def extract_paged(io, content_type: nil, offset: 1, limit: PAGE_DEFAULT_LIMIT,
                  max_bytes: PAGE_MAX_BYTES, max_line_length: PAGE_MAX_LINE_LENGTH)
  extractor = extractor_for(io, content_type)

  # No streaming support: extract the whole document, split it, and window
  # the finished Array. The full line count is known up front here.
  unless extractor.respond_to?(:extract_lines)
    all_lines = extractor.extract(io).split("\n")
    return window(all_lines,
                  offset: offset, limit: limit, max_bytes: max_bytes,
                  max_line_length: max_line_length, kind: extractor.kind,
                  known_total: all_lines.length)
  end

  # Streaming extractor: pass the lazy line source straight to the
  # windowing loop. Whether to keep counting after the window fills is a
  # per-format cost decision: for plain text the tail is a cheap
  # sequential read (so the total stays exact), while for a PDF it would
  # mean parsing every remaining page. Only Passthrough opts in; every
  # other streaming extractor stops early and reports the total as unknown.
  line_stream = extractor.extract_lines(io)
  window(line_stream,
         offset: offset, limit: limit, max_bytes: max_bytes,
         max_line_length: max_line_length, kind: extractor.kind,
         known_total: nil, count_tail: extractor.equal?(Passthrough))
end
|
|
229
|
+
|
|
230
|
+
# Find the first registry entry claiming +io+'s content: read the
|
|
231
|
+
# leading {FileType::SAMPLE_BYTES} for the sniff, rewind, and ask
|
|
232
|
+
# each extractor in order.
|
|
233
|
+
#
|
|
234
|
+
# @param io [IO, StringIO] seekable IO positioned at the start.
|
|
235
|
+
# @param content_type [String, nil]
|
|
236
|
+
# @return [#extract] the matched extractor.
|
|
237
|
+
# @raise [Unsupported] when nothing matches.
|
|
238
|
+
def extractor_for(io, content_type)
  # A length-limited read returns nil at EOF; sniff an empty sample then.
  sample = io.read(FileType::SAMPLE_BYTES) || +''
  # Put the IO back at the start so the chosen extractor sees everything.
  io.rewind

  # Registry order is priority order: the first claimant wins.
  registry.each do |candidate|
    return candidate if candidate.matches?(sample: sample, content_type: content_type)
  end

  # Nothing claimed the content; mention the content-type only when one
  # was actually supplied.
  hint = content_type && !content_type.empty? ? " (content-type #{content_type.inspect})" : ''
  raise Unsupported, "no extractor for this content#{hint}"
end
|
|
245
|
+
private_class_method :extractor_for
|
|
246
|
+
|
|
247
|
+
# Collect a {Page} window out of +lines+ (an Array or a lazy
|
|
248
|
+
# Enumerator of already-+chomp+ed lines). +known_total+ is the
|
|
249
|
+
# full line count when the caller extracted everything up front
|
|
250
|
+
# (Array case), +nil+ for a lazy stream — then +total_lines+ is
|
|
251
|
+
# exact only if the iteration reached EOF: +count_tail+ keeps
|
|
252
|
+
# the loop counting (without collecting) past the line limit
|
|
253
|
+
# when consuming the rest of the stream is cheap; without it the
|
|
254
|
+
# loop breaks and leaves the total unknown. The byte cap always
|
|
255
|
+
# aborts the count.
|
|
256
|
+
#
|
|
257
|
+
# @param lines [Enumerable<String>]
|
|
258
|
+
# @param known_total [Integer, nil]
|
|
259
|
+
# @param count_tail [Boolean]
|
|
260
|
+
# @return [Page]
|
|
261
|
+
def window(lines, offset:, limit:, max_bytes:, max_line_length:, kind:,
           known_total:, count_tail: false)
  skipped = offset - 1 # lines before the window are counted but not kept
  kept = []
  line_no = 0
  used_bytes = 0
  hit_byte_cap = false
  remaining = false
  cut_short = false

  lines.each do |raw|
    line_no += 1
    next unless line_no > skipped

    # The window is already full: this line proves more content exists.
    if kept.length >= limit
      remaining = true
      unless count_tail
        # Stop consuming; the total line count stays unknown.
        cut_short = true
        break
      end
      next # keep iterating only to count, so the total stays exact
    end

    shown = truncate_line(raw, max_line_length)
    cost = shown.bytesize + 1 # +1 for the joining newline
    if used_bytes + cost > max_bytes
      # The byte cap always ends the iteration, even when count_tail is set.
      hit_byte_cap = true
      remaining = true
      cut_short = true
      break
    end

    kept << shown
    used_bytes += cost
  end

  # An up-front total wins; otherwise the running count is only exact when
  # the iteration ran to the end of the stream.
  total = known_total || (line_no unless cut_short)
  Page.new(lines: kept, start_line: offset, total_lines: total,
           more: remaining, byte_capped: hit_byte_cap, kind: kind)
end
|
|
299
|
+
private_class_method :window
|
|
300
|
+
|
|
301
|
+
# Truncate +line+ to +max_line_length+ chars, appending a
# "... (line truncated to N chars)" marker when it overflows. N is the
# cap actually applied, so a caller passing a non-default
# +max_line_length+ gets an accurate marker; at the default cap the
# suffix has the same text as {PAGE_LINE_TRUNCATION_MARKER}.
#
# @param line [String]
# @param max_line_length [Integer] maximum number of characters kept
#   from +line+ (the marker is appended on top of that).
# @return [String] +line+ itself when it fits, otherwise a new String.
def truncate_line(line, max_line_length)
  return line if line.length <= max_line_length

  "#{line[0, max_line_length]}... (line truncated to #{max_line_length} chars)"
end
|
|
312
|
+
private_class_method :truncate_line
|
|
313
|
+
end
|
|
314
|
+
end
|
data/lib/pikuri/file_type.rb
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require 'pdf-reader'
|
|
4
|
-
|
|
5
3
|
module Pikuri
|
|
6
|
-
# Magic-byte content sniffing
|
|
7
|
-
# responsibilities:
|
|
4
|
+
# Magic-byte content sniffing, plus the path-aware front over the
|
|
5
|
+
# {Extractor} registry. Two responsibilities:
|
|
8
6
|
#
|
|
9
7
|
# * {.detect_mime} — recognise a file from its leading bytes. Returns
|
|
10
8
|
# a MIME String for formats pikuri knows how to handle specially
|
|
@@ -15,12 +13,16 @@ module Pikuri
|
|
|
15
13
|
# {.detect_mime}: a file can be both recognised (e.g. PDF) *and*
|
|
16
14
|
# binary. {.detect_mime} tells you what the bytes are;
|
|
17
15
|
# {.binary?} tells you whether they're safe to render as text.
|
|
18
|
-
#
|
|
19
|
-
#
|
|
20
|
-
#
|
|
21
|
-
#
|
|
22
|
-
#
|
|
23
|
-
#
|
|
16
|
+
#
|
|
17
|
+
# On top of those sit the two +Pathname+ conveniences,
|
|
18
|
+
# {.read_as_text} (whole document, the {Pikuri::VectorDb} indexer's
|
|
19
|
+
# shape) and {.read_as_text_paged} (line-windowed, the Read tools'
|
|
20
|
+
# shape). Both are thin wrappers: they own the *path-level* refusals
|
|
21
|
+
# (missing file, directory, image) and the exception mapping, then
|
|
22
|
+
# hand the opened IO to {Extractor.extract} /
|
|
23
|
+
# {Extractor.extract_paged} — which format the bytes are and how
|
|
24
|
+
# they become text is entirely the registry's business, so a gem
|
|
25
|
+
# plugging a new extractor in extends these wrappers for free.
|
|
24
26
|
#
|
|
25
27
|
# {.detect_mime} and {.binary?} accept either a +String+ of bytes
|
|
26
28
|
# (sample taken by the caller) or a +Pathname+ — when given a path,
|
|
@@ -28,8 +30,7 @@ module Pikuri
|
|
|
28
30
|
# for the sniff itself. The Pathname form is the convenience path;
|
|
29
31
|
# the bytes form is for callers that already have the sample or are
|
|
30
32
|
# calling both methods on the same file and want to avoid a second
|
|
31
|
-
# open.
|
|
32
|
-
# bytes-in shortcut because the PDF case needs to seek the file.
|
|
33
|
+
# open.
|
|
33
34
|
#
|
|
34
35
|
# == Why a separate module
|
|
35
36
|
#
|
|
@@ -39,8 +40,7 @@ module Pikuri
|
|
|
39
40
|
# {.binary?} reached for by {Workspace::Edit}. Collecting the
|
|
40
41
|
# detection logic here lets {Read} focus on routing
|
|
41
42
|
# (mime-to-formatter), {Edit} drop its cross-tool reach, and new
|
|
42
|
-
# tools
|
|
43
|
-
# ...) share one set of magic-byte truths.
|
|
43
|
+
# tools share one set of magic-byte truths.
|
|
44
44
|
#
|
|
45
45
|
# == Deliberate non-goals
|
|
46
46
|
#
|
|
@@ -136,19 +136,13 @@ module Pikuri
|
|
|
136
136
|
non_printable.to_f / bytes.bytesize > BINARY_NONPRINTABLE_THRESHOLD
|
|
137
137
|
end
|
|
138
138
|
|
|
139
|
-
# Read +path+ and return its content as plain UTF-8 text
|
|
140
|
-
#
|
|
141
|
-
#
|
|
142
|
-
#
|
|
143
|
-
#
|
|
144
|
-
#
|
|
145
|
-
#
|
|
146
|
-
# length if they care.
|
|
147
|
-
# * **Plain text** — anything that {.detect_mime} doesn't
|
|
148
|
-
# recognise and that {.binary?} accepts. Read with UTF-8
|
|
149
|
-
# encoding; behaviour on non-UTF-8 bytes is whatever +File.read+
|
|
150
|
-
# does with +encoding: Encoding::UTF_8+ (which is "leave invalid
|
|
151
|
-
# bytes in, let downstream decide").
|
|
139
|
+
# Read +path+ and return its content as plain UTF-8 text, routed
|
|
140
|
+
# through the {Extractor} registry: anything
|
|
141
|
+
# unrecognised-but-textual passes through verbatim
|
|
142
|
+
# ({Extractor::Passthrough}); with pikuri-pdf registered, PDFs
|
|
143
|
+
# are extracted with +"--- Page N ---"+ markers (a scanned-image
|
|
144
|
+
# PDF with no extractable text comes back as the empty String, a
|
|
145
|
+
# deliberate silent skip callers detect by length if they care).
|
|
152
146
|
#
|
|
153
147
|
# Refusal cases — all raise rather than returning a sentinel
|
|
154
148
|
# because the callers are internal pikuri code, not an LLM
|
|
@@ -159,13 +153,11 @@ module Pikuri
|
|
|
159
153
|
# * Path is a directory → +ArgumentError+.
|
|
160
154
|
# * Image (PNG / JPEG / GIF / WebP per {.detect_mime}) →
|
|
161
155
|
# +ArgumentError+; images aren't text.
|
|
162
|
-
# *
|
|
163
|
-
# +ArgumentError
|
|
164
|
-
# *
|
|
165
|
-
#
|
|
166
|
-
#
|
|
167
|
-
# path included so callers don't need to know pdf-reader's
|
|
168
|
-
# exception hierarchy.
|
|
156
|
+
# * Content no extractor claims (opaque binary) →
|
|
157
|
+
# +ArgumentError+, mapped from {Extractor::Unsupported}.
|
|
158
|
+
# * Extraction failure (malformed PDF, ...) → +RuntimeError+ with
|
|
159
|
+
# the path included, mapped from {Extractor::Error} so callers
|
|
160
|
+
# don't need to know any extractor's exception hierarchy.
|
|
169
161
|
#
|
|
170
162
|
# @param path [Pathname] file to read.
|
|
171
163
|
# @return [String] UTF-8 text. May be empty (empty text file, or
|
|
@@ -173,40 +165,76 @@ module Pikuri
|
|
|
173
165
|
# @raise [ArgumentError] if +path+ isn't a +Pathname+, points at
|
|
174
166
|
# a directory, is an image, or is binary.
|
|
175
167
|
# @raise [Errno::ENOENT] if +path+ doesn't exist.
|
|
176
|
-
# @raise [RuntimeError] on
|
|
168
|
+
# @raise [RuntimeError] on an extraction failure (malformed /
|
|
169
|
+
# unsupported PDF, ...).
|
|
177
170
|
def read_as_text(path)
  # Path-level refusals first; the sniffed MIME doubles as the
  # content-type hint for the extractor lookup.
  content_type = guard_extractable(path)
  path.open('rb') do |stream|
    Extractor.extract(stream, content_type: content_type)
  end
rescue Extractor::Unsupported
  # No extractor claimed the bytes: report it as the binary refusal.
  raise ArgumentError, "#{path} appears to be binary; cannot extract as text"
rescue Extractor::Error => failure
  # An extractor claimed the content but failed; surface the path with it.
  raise RuntimeError, "Cannot extract text from #{path}: #{failure.message}"
end
|
|
178
|
+
|
|
179
|
+
# Extract +path+ and return a windowed {Extractor::Page}: the
|
|
180
|
+
# lines from +offset+ (1-indexed) up to +limit+ of them, stopping
|
|
181
|
+
# early if +max_bytes+ is reached, with over-long lines truncated
|
|
182
|
+
# at +max_line_length+. Same routing and refusal contract as
|
|
183
|
+
# {.read_as_text}; the windowing semantics (including the lazy
|
|
184
|
+
# +extract_lines+ consumption that stops parsing once the window
|
|
185
|
+
# fills) are {Extractor.extract_paged}'s.
|
|
186
|
+
# The LLM-facing callers map the exceptions into +"Error: ..."+
|
|
187
|
+
# observations themselves.
|
|
188
|
+
#
|
|
189
|
+
# @param path [Pathname] file to read.
|
|
190
|
+
# @param offset [Integer] 1-indexed first line to include. The
|
|
191
|
+
# caller is responsible for validating +offset >= 1+.
|
|
192
|
+
# @param limit [Integer] maximum lines to collect. Caller
|
|
193
|
+
# validates +limit >= 1+.
|
|
194
|
+
# @param max_bytes [Integer] hard byte cap on collected content.
|
|
195
|
+
# @param max_line_length [Integer] per-line truncation threshold.
|
|
196
|
+
# @return [Extractor::Page] the windowed slice.
|
|
197
|
+
# @raise [ArgumentError] if +path+ isn't a +Pathname+, is a
|
|
198
|
+
# directory, an image, or binary.
|
|
199
|
+
# @raise [Errno::ENOENT] if +path+ doesn't exist.
|
|
200
|
+
# @raise [RuntimeError] on an extraction failure (malformed /
|
|
201
|
+
# unsupported PDF, ...).
|
|
202
|
+
def read_as_text_paged(path, offset: 1, limit: Extractor::PAGE_DEFAULT_LIMIT,
                       max_bytes: Extractor::PAGE_MAX_BYTES,
                       max_line_length: Extractor::PAGE_MAX_LINE_LENGTH)
  # Path-level refusals first; the sniffed MIME doubles as the
  # content-type hint for the extractor lookup.
  content_type = guard_extractable(path)
  path.open('rb') do |stream|
    Extractor.extract_paged(stream,
                            content_type: content_type,
                            offset: offset,
                            limit: limit,
                            max_bytes: max_bytes,
                            max_line_length: max_line_length)
  end
rescue Extractor::Unsupported
  # No extractor claimed the bytes: report it as the binary refusal.
  raise ArgumentError, "#{path} appears to be binary; cannot extract as text"
rescue Extractor::Error => failure
  # An extractor claimed the content but failed; surface the path with it.
  raise RuntimeError, "Cannot extract text from #{path}: #{failure.message}"
end
|
|
215
|
+
|
|
216
|
+
# The shared path-level refusals for {.read_as_text} /
|
|
217
|
+
# {.read_as_text_paged}: must be an existing non-directory
|
|
218
|
+
# +Pathname+, and not an image (images are data for a vision
|
|
219
|
+
# model, never text). Returns the {.detect_mime} result so the
|
|
220
|
+
# caller can pass it to the {Extractor} as the content-type hint.
|
|
221
|
+
#
|
|
222
|
+
# @param path [Pathname]
|
|
223
|
+
# @return [String, nil] the sniffed MIME type.
|
|
224
|
+
# @raise [ArgumentError] on a non-Pathname, a directory, or an
|
|
225
|
+
# image.
|
|
226
|
+
# @raise [Errno::ENOENT] if +path+ doesn't exist.
|
|
227
|
+
def guard_extractable(path)
|
|
178
228
|
raise ArgumentError, "expected Pathname, got #{path.class}" unless path.is_a?(Pathname)
|
|
179
229
|
raise Errno::ENOENT, path.to_s unless path.exist?
|
|
180
230
|
raise ArgumentError, "#{path} is a directory" if path.directory?
|
|
181
231
|
|
|
182
232
|
mime = detect_mime(path)
|
|
183
|
-
return read_pdf_text(path) if mime == 'application/pdf'
|
|
184
233
|
raise ArgumentError, "#{path} is an image (#{mime}); cannot extract as text" if mime&.start_with?('image/')
|
|
185
|
-
raise ArgumentError, "#{path} appears to be binary; cannot extract as text" if binary?(path)
|
|
186
234
|
|
|
187
|
-
|
|
188
|
-
end
|
|
189
|
-
|
|
190
|
-
# Walk a PDF page-by-page via +pdf-reader+, returning a single
|
|
191
|
-
# String with non-empty page texts joined by blank lines. Catches
|
|
192
|
-
# the three +PDF::Reader+ exceptions Workspace::Read also handles
|
|
193
|
-
# and re-raises them as +RuntimeError+ with the path included.
|
|
194
|
-
#
|
|
195
|
-
# @param path [Pathname]
|
|
196
|
-
# @return [String]
|
|
197
|
-
# @raise [RuntimeError] on malformed / unsupported PDF.
|
|
198
|
-
def read_pdf_text(path)
|
|
199
|
-
pages = path.open('rb') do |io|
|
|
200
|
-
::PDF::Reader.new(io).pages.map { |p| p.text.strip }
|
|
201
|
-
end
|
|
202
|
-
pages.reject(&:empty?).join("\n\n")
|
|
203
|
-
rescue ::PDF::Reader::MalformedPDFError,
|
|
204
|
-
::PDF::Reader::UnsupportedFeatureError,
|
|
205
|
-
::PDF::Reader::InvalidPageError => e
|
|
206
|
-
raise "Cannot extract PDF text from #{path}: " \
|
|
207
|
-
"#{e.class.name.split('::').last}: #{e.message}"
|
|
235
|
+
mime
|
|
208
236
|
end
|
|
209
|
-
private_class_method :
|
|
237
|
+
private_class_method :guard_extractable
|
|
210
238
|
|
|
211
239
|
# Coerce an +input+ argument into a bytes String for the sniffs.
|
|
212
240
|
# +String+ inputs are returned as-is (caller already sampled);
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
|
|
4
|
+
# Process-global teardown registry: one +at_exit+ for the whole
|
|
5
|
+
# process, with everything that owns a resource needing orderly
|
|
6
|
+
# shutdown (agents, {VectorDb::Server::Chroma}, future background
|
|
7
|
+
# workers) registering here instead of growing its own +at_exit+.
|
|
8
|
+
# It is {Agent#on_close} promoted from per-agent to per-process —
|
|
9
|
+
# the same LIFO + per-handler-rescue + idempotent shape, one level
|
|
10
|
+
# up.
|
|
11
|
+
#
|
|
12
|
+
# == Why one chokepoint
|
|
13
|
+
#
|
|
14
|
+
# Independent +at_exit+ hooks fire in an order decided by file load
|
|
15
|
+
# order, which is invisible and fragile. Routing every teardown
|
|
16
|
+
# through one registry makes the order explicit and controllable:
|
|
17
|
+
# the SIGTERM-the-strays backstop ({Subprocess.cleanup!}) registers
|
|
18
|
+
# at *load* time, so it sits at the bottom of the LIFO stack and runs
|
|
19
|
+
# *last* — after agents and servers (which register at *construction*
|
|
20
|
+
# time) have closed gracefully, while the subprocess machinery they
|
|
21
|
+
# shell out to during close (e.g. {VectorDb::Server::Chroma#close}'s
|
|
22
|
+
# +docker stop+) is still live.
|
|
23
|
+
#
|
|
24
|
+
# == Contract
|
|
25
|
+
#
|
|
26
|
+
# A registrant MUST respond to +#close+, and +#close+ MUST be
|
|
27
|
+
# idempotent and tolerant of running at process exit — the host may
|
|
28
|
+
# also have closed it explicitly earlier. Pass a block instead for
|
|
29
|
+
# teardown that has no natural +#close+ (e.g.
|
|
30
|
+
# +Finalizers.register { Pikuri::Subprocess.cleanup! }+).
|
|
31
|
+
#
|
|
32
|
+
# == Order: LIFO
|
|
33
|
+
#
|
|
34
|
+
# Last registered, first closed — Ruby +ensure+ semantics. A
|
|
35
|
+
# registrant that depends on an earlier one (a background indexer
|
|
36
|
+
# writing into {VectorDb::Server::Chroma}) is registered later and so
|
|
37
|
+
# tears down first. Registration order is therefore dependency order;
|
|
38
|
+
# register the dependency before its dependents.
|
|
39
|
+
#
|
|
40
|
+
# == Errors are contained
|
|
41
|
+
#
|
|
42
|
+
# Each +#close+ runs inside its own +rescue+: a raise is logged via
|
|
43
|
+
# {Pikuri.logger_for} and the sweep continues, so one botched
|
|
44
|
+
# teardown can't strand the rest. {.run!} drains the registry, so a
|
|
45
|
+
# second call (an explicit one, then the +at_exit+) closes nothing.
|
|
46
|
+
module Finalizers
|
|
47
|
+
# @return [Logger] subsystem logger for contained teardown failures.
|
|
48
|
+
LOGGER = Pikuri.logger_for('Finalizers')
|
|
49
|
+
|
|
50
|
+
# Adapts a teardown block to the +#close+ protocol, so a block and a
|
|
51
|
+
# closeable object can share one registry.
|
|
52
|
+
Closer = Struct.new(:block) do
  # Invoke the wrapped teardown block, so a block-based registrant answers
  # the same +#close+ call as any other handle.
  #
  # @return [void]
  def close
    self[:block].call
  end
end
|
|
58
|
+
|
|
59
|
+
@registered = []
|
|
60
|
+
@mutex = Mutex.new
|
|
61
|
+
|
|
62
|
+
class << self
|
|
63
|
+
# Register a closeable (or a block) to be torn down at process
|
|
64
|
+
# exit. Returns the registered handle so the caller can later
|
|
65
|
+
# {.unregister} it — a resource closed explicitly before exit
|
|
66
|
+
# should drop out so it can be garbage-collected rather than
|
|
67
|
+
# pinned alive until the process dies.
|
|
68
|
+
#
|
|
69
|
+
# @param closeable [#close, nil] resource to close at exit; omit
|
|
70
|
+
# when passing a block
|
|
71
|
+
# @yield teardown to run at exit, for resources with no +#close+
|
|
72
|
+
# @return [#close] the registered handle — the object itself, or
|
|
73
|
+
# the {Closer} wrapping the block; pass it to {.unregister}
|
|
74
|
+
# @raise [ArgumentError] if neither an object nor a block is given
|
|
75
|
+
def register(closeable = nil, &block)
  # Reject a call that supplies nothing to tear down.
  raise ArgumentError, 'Finalizers.register requires an object or a block' if !closeable && !block

  # An explicit object wins over a block; a bare block is wrapped so it
  # answers +#close+ like every other registrant.
  handle = closeable ? closeable : Closer.new(block)
  @mutex.synchronize do
    @registered.push(handle)
  end
  handle
end
|
|
84
|
+
|
|
85
|
+
# Drop a previously-registered handle. Idempotent — unregistering
# something already gone (or never registered) is a no-op.
#
# Matching is by object identity, not +==+: {Closer} is a Struct and
# compares by value, so an equality-based delete would also drop other
# registrants that merely compare equal to +handle+ (for example two
# registrations wrapping the same proc).
#
# @param handle [#close] the value returned by {.register}
# @return [void]
def unregister(handle)
  @mutex.synchronize do
    @registered.delete_if { |entry| entry.equal?(handle) }
  end
  nil
end
|
|
94
|
+
|
|
95
|
+
# Close every registrant in LIFO order, each guarded by its own
|
|
96
|
+
# +rescue+. Wired to +at_exit+ at the bottom of this file.
|
|
97
|
+
# Draining the registry under the lock makes a repeat call a
|
|
98
|
+
# no-op and keeps it safe against a concurrent caller.
|
|
99
|
+
#
|
|
100
|
+
# @return [void]
|
|
101
|
+
def run!
  # Snapshot and empty the registry in one critical section: a second call
  # finds nothing to close, and registrants are closed outside the lock.
  queue = @mutex.synchronize do
    snapshot = @registered.dup
    @registered.clear
    snapshot.reverse # last registered, first closed
  end

  queue.each do |handle|
    begin
      handle.close
    rescue StandardError => e
      # Contain the failure so the remaining registrants still get closed.
      LOGGER.warn("finalizer #{handle.class} raised #{e.class}: #{e.message}")
    end
  end
end
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
# Process-wide exit hook: closes whatever is still registered when the
# interpreter shuts down.
at_exit { Pikuri::Finalizers.run! }
|
data/lib/pikuri/paths.rb
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
|
|
5
|
+
module Pikuri
|
|
6
|
+
# Standardized on-disk locations for pikuri's local state. Centralizes
|
|
7
|
+
# the XDG resolution so every component that caches to disk roots under
|
|
8
|
+
# one place instead of each re-deriving it — currently
|
|
9
|
+
# {Pikuri::VectorDb::Server::Chroma} (its corpus volume) and
|
|
10
|
+
# {Pikuri::Memory::Mem0Server} (its mem0 checkout + data volume).
|
|
11
|
+
module Paths
|
|
12
|
+
# Pikuri's cache root: +$XDG_CACHE_HOME/pikuri+ when +XDG_CACHE_HOME+
|
|
13
|
+
# is set and non-empty, else +~/.cache/pikuri+.
|
|
14
|
+
#
|
|
15
|
+
# A method, not a frozen constant, on purpose: a constant would
|
|
16
|
+
# snapshot +XDG_CACHE_HOME+ at +require+ time, which breaks
|
|
17
|
+
# env-stubbing in tests and ignores a runtime change in a long-lived
|
|
18
|
+
# process. The directory is *not* created — callers +mkdir_p+ the
|
|
19
|
+
# subdirectory they need (e.g. +Paths.cache.join('chroma')+,
|
|
20
|
+
# +Paths.cache.join('mem0')+).
|
|
21
|
+
#
|
|
22
|
+
# @return [Pathname] the +<cache home>/pikuri+ directory
|
|
23
|
+
def self.cache
  # An unset and an empty XDG_CACHE_HOME both fall back to ~/.cache. The
  # environment is read on every call rather than cached.
  configured = ENV.fetch('XDG_CACHE_HOME', '')
  base = configured.empty? ? File.expand_path('~/.cache') : configured
  Pathname.new(base).join('pikuri')
end
|
|
28
|
+
end
|
|
29
|
+
end
|