pikuri-core 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pikuri/agent/listener/terminal.rb +18 -36
- data/lib/pikuri/extractor/html.rb +303 -0
- data/lib/pikuri/extractor/passthrough.rb +64 -0
- data/lib/pikuri/extractor.rb +314 -0
- data/lib/pikuri/file_type.rb +74 -266
- data/lib/pikuri/subprocess.rb +73 -2
- data/lib/pikuri/tool/calculator.rb +213 -41
- data/lib/pikuri/tool/fetch.rb +10 -9
- data/lib/pikuri/tool/scraper.rb +186 -0
- data/lib/pikuri/tool/web_scrape.rb +5 -5
- data/lib/pikuri/version.rb +1 -1
- data/lib/pikuri-core.rb +0 -1
- metadata +5 -61
- data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
- data/lib/pikuri/tool/scraper/html.rb +0 -285
- data/lib/pikuri/tool/scraper/pdf.rb +0 -54
- data/lib/pikuri/tool/scraper/simple.rb +0 -183
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
|
|
4
|
+
# The format→text extraction seam: one registry of extractors that
|
|
5
|
+
# turn an +IO+ of some recognised format (HTML and plain text out
|
|
6
|
+
# of the box; PDF / office formats via the pikuri-pdf /
|
|
7
|
+
# pikuri-extractors plug-in gems) into Markdown-flavoured UTF-8
|
|
8
|
+
# text, consumed through two front doors:
|
|
9
|
+
#
|
|
10
|
+
# * {.extract} — the whole document as one String. The shape the
|
|
11
|
+
# indexing / caching callers want ({Pikuri::VectorDb}'s indexer,
|
|
12
|
+
# {Tool::WebScrape}'s URL cache): no windowing, no presentation.
|
|
13
|
+
# * {.extract_paged} — the LLM-tool shape: the same extraction,
|
|
14
|
+
# windowed to a line range with a byte cap, returned as a {Page}
|
|
15
|
+
# the caller renders. Backs +Workspace::Read+ and
|
|
16
|
+
# +VectorDb::Tools::Read+ (via the {FileType} path wrappers) so
|
|
17
|
+
# the offset/limit/byte-cap logic lives in one tested place.
|
|
18
|
+
#
|
|
19
|
+
# Both callers — +Tool::Scraper+ dispatching on the HTTP
|
|
20
|
+
# +Content-Type+ header for the web tools, and {FileType} resolving
|
|
21
|
+
# local paths — route through this one registry, so both share one
|
|
22
|
+
# set of format truths and "support a new format" is a registry
|
|
23
|
+
# entry (pikuri-pdf and pikuri-extractors plug PDF and office
|
|
24
|
+
# formats in without pikuri-core knowing), not a new special case
|
|
25
|
+
# in two dispatchers.
|
|
26
|
+
#
|
|
27
|
+
# == The extractor duck type
|
|
28
|
+
#
|
|
29
|
+
# Each {.registry} entry implements three methods:
|
|
30
|
+
#
|
|
31
|
+
# * +matches?(sample:, content_type:)+ → +Boolean+ — claim the
|
|
32
|
+
# content. +sample+ is the leading {FileType::SAMPLE_BYTES} bytes
|
|
33
|
+
# (for magic-byte sniffs); +content_type+ is the normalized HTTP
|
|
34
|
+
# +Content-Type+ for web content, the {FileType.detect_mime}
|
|
35
|
+
# result for local files, and may be +nil+ ("no transport
|
|
36
|
+
# metadata — sniff if you can").
|
|
37
|
+
# * +extract(io)+ → +String+ — the whole document as
|
|
38
|
+
# Markdown-flavoured UTF-8 text. Raises {Error} on content the
|
|
39
|
+
# extractor claimed but cannot parse (malformed PDF, ...).
|
|
40
|
+
# * +kind+ → +Symbol+ — a short tag (+:text+ / +:pdf+ / +:html+)
|
|
41
|
+
# carried on {Page#kind} so rendering callers can word
|
|
42
|
+
# format-specific trailers ("End of PDF", the scanned-image
|
|
43
|
+
# hint) without re-sniffing.
|
|
44
|
+
#
|
|
45
|
+
# plus one *optional* method for formats whose lines can be
|
|
46
|
+
# produced incrementally:
|
|
47
|
+
#
|
|
48
|
+
# * +extract_lines(io)+ → +Enumerator<String>+ — the same content
|
|
49
|
+
# as +extract+, as a lazy stream of already-+chomp+ed lines.
|
|
50
|
+
# {.extract_paged} prefers this when present and stops consuming
|
|
51
|
+
# the moment the window fills, so the rest of the document is
|
|
52
|
+
# never parsed (pikuri-pdf's extractor: pdf-reader's page list
|
|
53
|
+
# parses on access; {Passthrough}: the IO is read line-by-line).
|
|
54
|
+
# The enumerator
|
|
55
|
+
# must be consumed while +io+ is still open, and may raise
|
|
56
|
+
# {Error} mid-iteration. Extractors that need the whole document
|
|
57
|
+
# to produce anything ({HTML}: Readability walks the full DOM —
|
|
58
|
+
# true of any subprocess-based extractor too) simply omit it;
|
|
59
|
+
# {.extract_paged} then extracts in full and windows the result.
|
|
60
|
+
#
|
|
61
|
+
# Windowing itself (offset / limit / byte cap / line truncation) is
|
|
62
|
+
# presentation and deliberately lives once in {.extract_paged}, not
|
|
63
|
+
# per extractor — +extract_lines+ is line *production*, the only
|
|
64
|
+
# genuinely format-specific half of paging.
|
|
65
|
+
#
|
|
66
|
+
# == Errors
|
|
67
|
+
#
|
|
68
|
+
# Both failure modes are failures the *caller's* LLM can react to,
|
|
69
|
+
# so they share one rescuable root:
|
|
70
|
+
#
|
|
71
|
+
# * {Unsupported} — nothing in {.registry} claimed the content
|
|
72
|
+
# (opaque binary, an unhandled content-type).
|
|
73
|
+
# * {Error} (the root) — an extractor claimed the content but the
|
|
74
|
+
# parse failed (malformed PDF, ...).
|
|
75
|
+
#
|
|
76
|
+
# Callers map them to their own conventions:
|
|
77
|
+
# +Tool::Scraper+ re-raises both as +FetchError+;
|
|
78
|
+
# {FileType.read_as_text} maps {Unsupported} to the +ArgumentError+
|
|
79
|
+
# binary refusal and {Error} to a +RuntimeError+ carrying the path.
|
|
80
|
+
module Extractor
|
|
81
|
+
module_function
|
|
82
|
+
|
|
83
|
+
# Root of the extraction error hierarchy. Raised when an extractor
# claims content but fails to parse it (e.g. a malformed PDF). The
# message is meant to be LLM-presentable.
Error = Class.new(StandardError)

# Raised by {.extract} / {.extract_paged} when no registry entry
# claims the content. Subclass of {Error}, so callers that do not
# care about the distinction can rescue a single class.
Unsupported = Class.new(Error)

# @return [Integer] default line-window size for {.extract_paged}
#   when the caller omits +limit+.
PAGE_DEFAULT_LIMIT = 2000

# @return [Integer] default hard byte cap (50 KiB) on the content
#   collected by a single {.extract_paged} call. Bypassable by paging
#   via +offset+. The rendered output is slightly larger (line
#   numbering, trailer) — that is the caller's concern.
PAGE_MAX_BYTES = 50 * 1024

# @return [Integer] default per-line character cap;
#   {.extract_paged} truncates longer lines and appends
#   {PAGE_LINE_TRUNCATION_MARKER}.
PAGE_MAX_LINE_LENGTH = 2000

# @return [String] suffix appended to a line truncated at
#   {PAGE_MAX_LINE_LENGTH}.
#
# Frozen explicitly: since Ruby 3.0 the +frozen_string_literal+ magic
# comment no longer freezes *interpolated* literals, so without
# +.freeze+ this shared constant would be a mutable String.
PAGE_LINE_TRUNCATION_MARKER = "... (line truncated to #{PAGE_MAX_LINE_LENGTH} chars)".freeze
|
|
110
|
+
|
|
111
|
+
# One windowed slice of a document, returned by {.extract_paged}.
|
|
112
|
+
# The caller turns this into an observation; this struct carries
|
|
113
|
+
# everything a trailer needs without the caller re-reading the
|
|
114
|
+
# document.
|
|
115
|
+
#
|
|
116
|
+
# == Fields
|
|
117
|
+
#
|
|
118
|
+
# * +lines+ — +Array<String>+, the collected window. Already
|
|
119
|
+
# per-line truncated (with {PAGE_LINE_TRUNCATION_MARKER}); *not*
|
|
120
|
+
# line-numbered — numbering is presentation the caller adds. For
|
|
121
|
+
# a PDF the array includes the +"--- Page N ---"+ marker lines
|
|
122
|
+
# pikuri-pdf's extractor emits, which count toward +limit+ / the
|
|
123
|
+
# byte cap like any other line.
|
|
124
|
+
# * +start_line+ — the 1-indexed line number of +lines.first+
|
|
125
|
+
# (i.e. the +offset+ the caller asked for). +lines.last+ is at
|
|
126
|
+
# +start_line + lines.length - 1+.
|
|
127
|
+
# * +total_lines+ — total line count of the document when known,
|
|
128
|
+
# else +nil+. Known when the read reached EOF, when the format
|
|
129
|
+
# was extracted in full (no +extract_lines+ — e.g. HTML), or
|
|
130
|
+
# when the lazy stream is cheap enough to count to the end
|
|
131
|
+
# (plain text). +nil+ when a lazy stream stopped early — the
|
|
132
|
+
# byte cap fired, or a PDF filled the window before its last
|
|
133
|
+
# page (counting the rest would mean parsing every page,
|
|
134
|
+
# defeating the laziness).
|
|
135
|
+
# * +more+ — +true+ if content remains past this window (the
|
|
136
|
+
# caller should offer +offset = start_line + lines.length+).
|
|
137
|
+
# * +byte_capped+ — +true+ if the byte cap (not the line limit)
|
|
138
|
+
# was the stopping criterion.
|
|
139
|
+
# * +kind+ — the matched extractor's +kind+ tag (+:text+ /
|
|
140
|
+
# +:pdf+ / +:html+); lets the caller word format-specific
|
|
141
|
+
# trailers and the empty-document message.
|
|
142
|
+
#
|
|
143
|
+
# An empty document yields +lines: []+, +total_lines: 0+; an
|
|
144
|
+
# +offset+ past EOF yields +lines: []+ with +total_lines+ set to
|
|
145
|
+
# the real (non-zero) count — the caller distinguishes the two.
|
|
146
|
+
Page = Data.define(:lines, :start_line, :total_lines, :more, :byte_capped, :kind)
|
|
147
|
+
|
|
148
|
+
# Ordered list of extractors; lookup walks it front to back and the
# first entry whose +matches?+ returns truthy wins.
#
# Core seeds it with {HTML} followed by {Passthrough} (the terminal
# plain-text arm). A gem that adds a format chooses where to insert
# itself according to how strong its claim is:
#
# * a magic-byte sniff that never misfires on text belongs at the
#   *front* (+registry.unshift(X)+), so it wins over {HTML}'s
#   content-type match even when the header lies;
# * a content-type or weaker-sniff claimer belongs just before the
#   terminal entry (+registry.insert(-2, X)+).
#
# The list is built on first access and memoised in +@registry+.
#
# @return [Array<#matches?>] the live, mutable array — mutation is
#   the intended plug-in mechanism.
def registry
  @registry = [HTML, Passthrough] unless @registry
  @registry
end
|
|
164
|
+
|
|
165
|
+
# Return the entire document behind +io+ as a single
# Markdown-flavoured UTF-8 String. The result may be empty (an empty
# text file, a scanned-image PDF with no extractable text).
#
# @param io [IO, StringIO] seekable IO positioned at the start of
#   the content; a leading sample is read for the +matches?+ sniff
#   and the IO is rewound before the extractor sees it.
# @param content_type [String, nil] normalized content-type when the
#   transport supplies one; +nil+ when unknown, in which case
#   extractors rely on their byte sniffs.
# @return [String]
# @raise [Unsupported] when no registry entry claims the content.
# @raise [Error] when the matched extractor cannot parse it.
def extract(io, content_type: nil)
  chosen = extractor_for(io, content_type)
  chosen.extract(io)
end
|
|
182
|
+
|
|
183
|
+
# Extract +io+ and hand back one windowed {Page}: lines starting at
# +offset+ (1-indexed), at most +limit+ of them, cut short if
# +max_bytes+ would be exceeded, with over-long lines truncated at
# +max_line_length+.
#
# Two strategies, picked by duck type:
#
# * the extractor responds to +extract_lines+ — its lazy line stream
#   is fed straight into the window logic and consumed only as far
#   as needed;
# * otherwise the document is extracted in full, split on newlines,
#   and the resulting Array is windowed with its length as the exact
#   total.
#
# @param io [IO, StringIO] seekable IO positioned at the start.
# @param content_type [String, nil] as for {.extract}.
# @param offset [Integer] 1-indexed first line to include; the
#   caller is responsible for validating +offset >= 1+.
# @param limit [Integer] maximum lines to collect; the caller
#   validates +limit >= 1+.
# @param max_bytes [Integer] hard byte cap on collected content.
# @param max_line_length [Integer] per-line truncation threshold.
# @return [Page] the windowed slice.
# @raise [Unsupported] when no registry entry claims the content.
# @raise [Error] when the matched extractor cannot parse it.
def extract_paged(io, content_type: nil, offset: 1, limit: PAGE_DEFAULT_LIMIT,
                  max_bytes: PAGE_MAX_BYTES, max_line_length: PAGE_MAX_LINE_LENGTH)
  extractor = extractor_for(io, content_type)

  if extractor.respond_to?(:extract_lines)
    source = extractor.extract_lines(io)
    total = nil
    # Counting past a full window is a per-format cost decision: only
    # the built-in plain-text extractor is known to make the remaining
    # read cheap, so only it keeps counting for an exact total. Every
    # other lazy extractor stops early and reports the total as unknown.
    keep_counting = extractor.equal?(Passthrough)
  else
    # Note: String#split discards trailing empty fields, so trailing
    # blank lines do not contribute to the total here.
    source = extractor.extract(io).split("\n")
    total = source.length
    keep_counting = false
  end

  window(source,
         offset: offset, limit: limit, max_bytes: max_bytes,
         max_line_length: max_line_length, kind: extractor.kind,
         known_total: total, count_tail: keep_counting)
end
|
|
229
|
+
|
|
230
|
+
# Pick the extractor responsible for +io+: read up to
# {FileType::SAMPLE_BYTES} leading bytes as the sniff sample, rewind
# the IO so extraction starts from the beginning, then return the
# first {.registry} entry whose +matches?+ accepts the sample and
# content-type.
#
# @param io [IO, StringIO] seekable IO positioned at the start.
# @param content_type [String, nil]
# @return [#extract] the matched extractor.
# @raise [Unsupported] when nothing matches; the message names the
#   content-type when a non-empty one was supplied.
def extractor_for(io, content_type)
  # IO#read(n) returns nil at EOF; normalise that to an empty sample.
  head = io.read(FileType::SAMPLE_BYTES) || +''
  io.rewind

  match = registry.find do |candidate|
    candidate.matches?(sample: head, content_type: content_type)
  end
  return match if match

  hint = content_type && !content_type.empty? ? " (content-type #{content_type.inspect})" : ''
  raise Unsupported, "no extractor for this content#{hint}"
end
|
|
245
|
+
private_class_method :extractor_for
|
|
246
|
+
|
|
247
|
+
# Walk +lines+ once and cut a {Page} out of it.
#
# +lines+ is either an Array (document extracted up front) or a lazy
# Enumerator of already-+chomp+ed lines. Lines before +offset+ are
# counted but skipped; after that up to +limit+ lines are collected,
# each passed through {.truncate_line}. Collection stops *before* the
# first line that would push the running byte total (line bytes plus
# one for the joining newline) past +max_bytes+.
#
# The reported +total_lines+ is +known_total+ when the caller supplies
# it. For a lazy stream (+known_total: nil+) it is the number of lines
# iterated, but only if iteration ran to the end of the stream:
# +count_tail+ lets the loop keep counting (without collecting) once
# the line limit is reached; without it the loop breaks and the total
# is +nil+. Hitting the byte cap always ends iteration.
#
# @param lines [Enumerable<String>]
# @param offset [Integer] 1-indexed first line to collect.
# @param limit [Integer] maximum number of lines to collect.
# @param max_bytes [Integer] byte budget for the collected lines.
# @param max_line_length [Integer] per-line truncation threshold.
# @param kind [Symbol] extractor tag copied onto the {Page}.
# @param known_total [Integer, nil]
# @param count_tail [Boolean]
# @return [Page]
def window(lines, offset:, limit:, max_bytes:, max_line_length:, kind:,
           known_total:, count_tail: false)
  skip = offset - 1
  picked = []
  position = 0
  used_bytes = 0
  hit_byte_cap = false
  has_more = false
  aborted = false

  lines.each do |raw_line|
    position += 1
    next unless position > skip

    if picked.size >= limit
      # A line exists beyond the full window, so there is more to page.
      has_more = true
      next if count_tail # keep iterating purely to learn the total

      aborted = true
      break
    end

    clipped = truncate_line(raw_line, max_line_length)
    cost = clipped.bytesize + 1 # one extra byte for the joining newline
    if used_bytes + cost > max_bytes
      hit_byte_cap = true
      has_more = true
      aborted = true
      break
    end

    picked << clipped
    used_bytes += cost
  end

  # An up-front total always wins; otherwise the running count is only
  # trustworthy when the stream was iterated to its end.
  total = known_total
  total ||= position unless aborted

  Page.new(lines: picked, start_line: offset, total_lines: total,
           more: has_more, byte_capped: hit_byte_cap, kind: kind)
end
|
|
299
|
+
private_class_method :window
|
|
300
|
+
|
|
301
|
+
# Truncate +line+ to +max_line_length+ characters and append a
# truncation marker when it overflows; lines within the cap are
# returned untouched (same object).
#
# With the default cap ({PAGE_MAX_LINE_LENGTH}) the marker is exactly
# {PAGE_LINE_TRUNCATION_MARKER}. With any other cap the marker is
# built from the cap actually applied, so the text never claims a
# length that was not used.
#
# @param line [String]
# @param max_line_length [Integer] maximum number of characters
#   (not bytes) kept from +line+.
# @return [String]
def truncate_line(line, max_line_length)
  return line if line.length <= max_line_length

  marker =
    if max_line_length == PAGE_MAX_LINE_LENGTH
      PAGE_LINE_TRUNCATION_MARKER
    else
      "... (line truncated to #{max_line_length} chars)"
    end

  line[0, max_line_length] + marker
end
|
|
312
|
+
private_class_method :truncate_line
|
|
313
|
+
end
|
|
314
|
+
end
|