pikuri-core 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,314 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pikuri
4
+ # The format→text extraction seam: one registry of extractors that
5
+ # turn an +IO+ of some recognised format (HTML and plain text out
6
+ # of the box; PDF / office formats via the pikuri-pdf /
7
+ # pikuri-extractors plug-in gems) into Markdown-flavoured UTF-8
8
+ # text, consumed through two front doors:
9
+ #
10
+ # * {.extract} — the whole document as one String. The shape the
11
+ # indexing / caching callers want ({Pikuri::VectorDb}'s indexer,
12
+ # {Tool::WebScrape}'s URL cache): no windowing, no presentation.
13
+ # * {.extract_paged} — the LLM-tool shape: the same extraction,
14
+ # windowed to a line range with a byte cap, returned as a {Page}
15
+ # the caller renders. Backs +Workspace::Read+ and
16
+ # +VectorDb::Tools::Read+ (via the {FileType} path wrappers) so
17
+ # the offset/limit/byte-cap logic lives in one tested place.
18
+ #
19
+ # Both dispatchers — +Tool::Scraper+ dispatching on the HTTP
20
+ # +Content-Type+ header for the web tools, and {FileType} resolving
21
+ # local paths — route through this one registry, so both share one
22
+ # set of format truths and "support a new format" is a registry
23
+ # entry (pikuri-pdf and pikuri-extractors plug PDF and office
24
+ # formats in without pikuri-core knowing), not a new special case
25
+ # in two dispatchers.
26
+ #
27
+ # == The extractor duck type
28
+ #
29
+ # Each {.registry} entry implements three methods:
30
+ #
31
+ # * +matches?(sample:, content_type:)+ → +Boolean+ — claim the
32
+ # content. +sample+ is the leading {FileType::SAMPLE_BYTES} bytes
33
+ # (for magic-byte sniffs); +content_type+ is the normalized HTTP
34
+ # +Content-Type+ for web content, the {FileType.detect_mime}
35
+ # result for local files, and may be +nil+ ("no transport
36
+ # metadata — sniff if you can").
37
+ # * +extract(io)+ → +String+ — the whole document as
38
+ # Markdown-flavoured UTF-8 text. Raises {Error} on content the
39
+ # extractor claimed but cannot parse (malformed PDF, ...).
40
+ # * +kind+ → +Symbol+ — a short tag (+:text+ / +:pdf+ / +:html+)
41
+ # carried on {Page#kind} so rendering callers can word
42
+ # format-specific trailers ("End of PDF", the scanned-image
43
+ # hint) without re-sniffing.
44
+ #
45
+ # plus one *optional* method for formats whose lines can be
46
+ # produced incrementally:
47
+ #
48
+ # * +extract_lines(io)+ → +Enumerator<String>+ — the same content
49
+ # as +extract+, as a lazy stream of already-+chomp+ed lines.
50
+ # {.extract_paged} prefers this when present and stops consuming
51
+ # the moment the window fills, so the rest of the document is
52
+ # never parsed (pikuri-pdf's extractor: pdf-reader's page list
53
+ # parses on access; {Passthrough}: the IO is read line-by-line).
54
+ # The enumerator
55
+ # must be consumed while +io+ is still open, and may raise
56
+ # {Error} mid-iteration. Extractors that need the whole document
57
+ # to produce anything ({HTML}: Readability walks the full DOM —
58
+ # true of any subprocess-based extractor too) simply omit it;
59
+ # {.extract_paged} then extracts in full and windows the result.
60
+ #
61
+ # Windowing itself (offset / limit / byte cap / line truncation) is
62
+ # presentation and deliberately lives once in {.extract_paged}, not
63
+ # per extractor — +extract_lines+ is line *production*, the only
64
+ # genuinely format-specific half of paging.
65
+ #
66
+ # == Errors
67
+ #
68
+ # Both failure modes are failures the *caller's* LLM can react to,
69
+ # so they share one rescuable root:
70
+ #
71
+ # * {Unsupported} — nothing in {.registry} claimed the content
72
+ # (opaque binary, an unhandled content-type).
73
+ # * {Error} (the root) — an extractor claimed the content but the
74
+ # parse failed (malformed PDF, ...).
75
+ #
76
+ # Callers map them to their own conventions:
77
+ # +Tool::Scraper+ re-raises both as +FetchError+;
78
+ # {FileType.read_as_text} maps {Unsupported} to the +ArgumentError+
79
+ # binary refusal and {Error} to a +RuntimeError+ carrying the path.
80
+ module Extractor
81
+ module_function
82
+
83
# Root of every failure this module raises. Raised directly when an
# extractor claimed the content but could not parse it (for example a
# malformed PDF). The message is meant to be shown to an LLM as-is.
class Error < StandardError; end

# Raised by {.extract} / {.extract_paged} when nothing in {.registry}
# claims the content. It inherits from {Error}, so a caller that does
# not care which of the two failures happened rescues a single class.
class Unsupported < Error; end
91
+
92
# @return [Integer] number of lines {.extract_paged} collects when the
#   caller does not pass +limit+.
PAGE_DEFAULT_LIMIT = 2_000

# @return [Integer] default hard cap, in bytes, on the content one
#   {.extract_paged} call collects (50 KiB). A caller gets past it by
#   paging with +offset+. Whatever the caller renders on top (line
#   numbers, a trailer) makes the final output slightly larger; that
#   is the caller's concern.
PAGE_MAX_BYTES = 51_200

# @return [Integer] default per-line cap, in characters. Lines longer
#   than this are truncated by {.extract_paged} and get a truncation
#   marker appended.
PAGE_MAX_LINE_LENGTH = 2_000

# @return [String] suffix appended to a line truncated at
#   {PAGE_MAX_LINE_LENGTH}.
PAGE_LINE_TRUNCATION_MARKER = "... (line truncated to #{PAGE_MAX_LINE_LENGTH} chars)"
110
+
111
# Immutable value object describing one window of a document, as
# returned by {.extract_paged}. It carries everything a caller needs
# to render an observation and its trailer without reading the
# document a second time.
#
# == Fields
#
# * +lines+ — +Array<String>+ holding the collected window. Over-long
#   lines are already truncated (with a truncation marker appended).
#   Lines are *not* numbered: numbering is presentation and belongs to
#   the caller. For a PDF the array also contains the
#   +"--- Page N ---"+ marker lines emitted by pikuri-pdf's extractor;
#   they count toward +limit+ and the byte cap like any other line.
# * +start_line+ — 1-indexed line number of +lines.first+, i.e. the
#   +offset+ that was requested. +lines.last+ therefore sits at
#   +start_line + lines.length - 1+.
# * +total_lines+ — the document's total line count, or +nil+ when it
#   is not known. It is known when iteration reached EOF, when the
#   format was extracted in full (no +extract_lines+, e.g. HTML), or
#   when the lazy stream is cheap enough to keep counting to the end
#   (plain text). It is +nil+ when a lazy stream was abandoned early:
#   the byte cap fired, or a PDF filled the window before its last
#   page (counting on would mean parsing every remaining page, which
#   defeats the laziness).
# * +more+ — +true+ when content remains beyond this window; the
#   caller should offer +offset = start_line + lines.length+.
# * +byte_capped+ — +true+ when the byte cap, rather than the line
#   limit, is what ended the window.
# * +kind+ — the matched extractor's +kind+ tag (+:text+ / +:pdf+ /
#   +:html+), so the caller can word format-specific trailers and the
#   empty-document message.
#
# An empty document produces +lines: []+ with +total_lines: 0+. An
# +offset+ past EOF produces +lines: []+ with +total_lines+ set to the
# real, non-zero count, which is how the caller tells the two apart.
Page = Data.define(
  :lines,
  :start_line,
  :total_lines,
  :more,
  :byte_capped,
  :kind
)
147
+
148
# The ordered list of extractors; lookups take the first entry whose
# +matches?+ returns true.
#
# Core seeds it with two entries: {HTML}, which matches on
# content-type, and {Passthrough}, the terminal plain-text arm. A gem
# that adds a format chooses where to insert itself according to how
# strong its claim is:
#
# * a magic-byte sniff that never misfires on text goes at the
#   *front*, so it wins over {HTML}'s content-type match even when the
#   header lies (+registry.unshift(X)+ — what pikuri-pdf does);
# * a content-type or weaker-sniff claimer goes just before the
#   terminal entry (+registry.insert(-2, X)+ — what pikuri-extractors
#   does).
#
# The array is created on first call and the same object is returned
# afterwards, so in-place mutations persist.
#
# @return [Array<#matches?>] mutable on purpose — this is the plug-in
#   seam.
def registry
  return @registry if @registry

  @registry = [HTML, Passthrough]
end
164
+
165
# Return the entire document behind +io+ as a single
# Markdown-flavoured UTF-8 String. The result may be empty (an empty
# text file, or a scanned-image PDF with no extractable text).
#
# @param io [IO, StringIO] seekable IO positioned at the start of the
#   content. A leading sample is read for the +matches?+ sniff and the
#   IO is rewound before the extractor runs.
# @param content_type [String, nil] normalized content-type when the
#   transport provides one (HTTP header, {FileType.detect_mime}
#   result); +nil+ when unknown, in which case extractors fall back on
#   their byte sniffs.
# @return [String]
# @raise [Unsupported] when no registry entry claims the content.
# @raise [Error] when the matched extractor cannot parse it.
def extract(io, content_type: nil)
  extractor = extractor_for(io, content_type)
  extractor.extract(io)
end
182
+
183
# Extract +io+ and hand back one windowed {Page}: the lines starting
# at +offset+ (1-indexed), at most +limit+ of them, ending sooner if
# +max_bytes+ would be exceeded, with over-long lines cut at
# +max_line_length+.
#
# The work is lazy where the format permits it. An extractor that
# implements +extract_lines+ (plain text; pikuri-pdf's PDF) is only
# consumed until the window is full, so the first window of a
# 500-page PDF parses a handful of pages and the first page of a
# gigabyte log never loads the whole file. An extractor without it
# (HTML) is extracted completely and then windowed, which is also why
# its +total_lines+ is always exact.
#
# @param io [IO, StringIO] seekable IO positioned at the start.
# @param content_type [String, nil] as for {.extract}.
# @param offset [Integer] 1-indexed first line to include. The caller
#   is responsible for validating +offset >= 1+.
# @param limit [Integer] maximum number of lines to collect. The
#   caller validates +limit >= 1+.
# @param max_bytes [Integer] hard byte cap on the collected content.
# @param max_line_length [Integer] per-line truncation threshold.
# @return [Page] the windowed slice.
# @raise [Unsupported] when no registry entry claims the content.
# @raise [Error] when the matched extractor cannot parse it.
def extract_paged(io, content_type: nil, offset: 1, limit: PAGE_DEFAULT_LIMIT,
                  max_bytes: PAGE_MAX_BYTES, max_line_length: PAGE_MAX_LINE_LENGTH)
  extractor = extractor_for(io, content_type)
  paging = { offset: offset, limit: limit, max_bytes: max_bytes,
             max_line_length: max_line_length }

  # No incremental producer: extract everything, then window the Array.
  # The full line count is known up front in this case.
  unless extractor.respond_to?(:extract_lines)
    all_lines = extractor.extract(io).split("\n")
    return window(all_lines, **paging, kind: extractor.kind,
                             known_total: all_lines.length)
  end

  # count_tail is a per-format cost decision. Once the window is full,
  # counting the remainder of a plain-text stream is a cheap sequential
  # read (so the trailer can say "of N"); for a PDF it would parse
  # every remaining page, which is exactly what extract_lines exists to
  # avoid. Only Passthrough opts in; plugged-in extractors (pikuri-pdf's
  # included) get the conservative default: stop early, total unknown.
  window(extractor.extract_lines(io), **paging, kind: extractor.kind,
                                      known_total: nil,
                                      count_tail: extractor.equal?(Passthrough))
end
229
+
230
# Pick the extractor responsible for +io+: read the leading
# {FileType::SAMPLE_BYTES} bytes as a sniff sample, rewind the IO, then
# ask each {.registry} entry in order and return the first that claims
# the content.
#
# @param io [IO, StringIO] seekable IO positioned at the start.
# @param content_type [String, nil]
# @return [#extract] the matched extractor.
# @raise [Unsupported] when nothing matches. The message names the
#   content-type when a non-empty one was supplied.
def extractor_for(io, content_type)
  # IO#read(length) returns nil at EOF; an empty IO is sniffed as "".
  sample = io.read(FileType::SAMPLE_BYTES) || +''
  io.rewind

  claimed = registry.find do |candidate|
    candidate.matches?(sample: sample, content_type: content_type)
  end
  return claimed if claimed

  detail = if content_type && !content_type.empty?
             " (content-type #{content_type.inspect})"
           else
             ''
           end
  raise Unsupported, "no extractor for this content#{detail}"
end
245
+ private_class_method :extractor_for
246
+
247
# Build a {Page} from +lines+, which is either an Array or a lazy
# Enumerator of already-+chomp+ed lines.
#
# +known_total+ is the full line count when the caller extracted
# everything up front (the Array case) and +nil+ for a lazy stream. In
# the lazy case +total_lines+ is exact only when iteration ran to EOF.
# With +count_tail+ the loop keeps counting (without collecting) after
# the line limit is hit, for streams whose remainder is cheap to
# consume; without it the loop breaks there and the total stays
# unknown. Hitting the byte cap always ends the iteration, and with it
# the count.
#
# @param lines [Enumerable<String>]
# @param offset [Integer] 1-indexed first line to collect.
# @param limit [Integer] maximum number of lines to collect.
# @param max_bytes [Integer] cap on collected bytes; each line is
#   charged its bytesize plus one for the joining newline.
# @param max_line_length [Integer] per-line truncation threshold.
# @param kind [Symbol] extractor tag copied onto the {Page}.
# @param known_total [Integer, nil]
# @param count_tail [Boolean]
# @return [Page]
def window(lines, offset:, limit:, max_bytes:, max_line_length:, kind:,
           known_total:, count_tail: false)
  skip = offset - 1
  picked = []
  used_bytes = 0
  position = 0
  hit_byte_cap = false
  remaining = false
  cut_short = false

  lines.each do |raw|
    position += 1
    next unless position > skip

    if picked.length < limit
      clipped = truncate_line(raw, max_line_length)
      cost = clipped.bytesize + 1 # +1 for the joining newline
      if used_bytes + cost <= max_bytes
        picked << clipped
        used_bytes += cost
        next
      end
      hit_byte_cap = true
    end

    # Reaching this point means this line did not fit: either the line
    # limit was already full or the byte cap just fired.
    remaining = true
    # Only the line-limit case may keep counting so total_lines stays
    # exact; the byte cap always stops the iteration.
    next if count_tail && !hit_byte_cap

    cut_short = true
    break
  end

  total = known_total
  total ||= position unless cut_short

  Page.new(lines: picked, start_line: offset, total_lines: total,
           more: remaining, byte_capped: hit_byte_cap, kind: kind)
end
299
+ private_class_method :window
300
+
301
# Cut +line+ down to +max_line_length+ characters and append a marker
# stating the cap that was actually applied. Lines at or under the cap
# are returned unchanged (the same object, not a copy).
#
# The marker is built from +max_line_length+ rather than taken from
# {PAGE_LINE_TRUNCATION_MARKER}: that constant hard-codes the default
# cap, so using it would mislabel lines whenever a caller passes a
# different +max_line_length+. At the default cap the output is
# identical to appending the constant.
#
# @param line [String]
# @param max_line_length [Integer] cap in characters (not bytes).
# @return [String]
def truncate_line(line, max_line_length)
  return line if line.length <= max_line_length

  line[0, max_line_length] + "... (line truncated to #{max_line_length} chars)"
end
312
+ private_class_method :truncate_line
313
+ end
314
+ end