pikuri-core 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,314 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pikuri
4
+ # The format→text extraction seam: one registry of extractors that
5
+ # turn an +IO+ of some recognised format (HTML and plain text out
6
+ # of the box; PDF / office formats via the pikuri-pdf /
7
+ # pikuri-extractors plug-in gems) into Markdown-flavoured UTF-8
8
+ # text, consumed through two front doors:
9
+ #
10
+ # * {.extract} — the whole document as one String. The shape the
11
+ # indexing / caching callers want ({Pikuri::VectorDb}'s indexer,
12
+ # {Tool::WebScrape}'s URL cache): no windowing, no presentation.
13
+ # * {.extract_paged} — the LLM-tool shape: the same extraction,
14
+ # windowed to a line range with a byte cap, returned as a {Page}
15
+ # the caller renders. Backs +Workspace::Read+ and
16
+ # +VectorDb::Tools::Read+ (via the {FileType} path wrappers) so
17
+ # the offset/limit/byte-cap logic lives in one tested place.
18
+ #
19
+ # Both callers — +Tool::Scraper+ dispatching on the HTTP
20
+ # +Content-Type+ header for the web tools, and {FileType} resolving
21
+ # local paths — route through this one registry, so both share one
22
+ # set of format truths and "support a new format" is a registry
23
+ # entry (pikuri-pdf and pikuri-extractors plug PDF and office
24
+ # formats in without pikuri-core knowing), not a new special case
25
+ # in two dispatchers.
26
+ #
27
+ # == The extractor duck type
28
+ #
29
+ # Each {.registry} entry implements three methods:
30
+ #
31
+ # * +matches?(sample:, content_type:)+ → +Boolean+ — claim the
32
+ # content. +sample+ is the leading {FileType::SAMPLE_BYTES} bytes
33
+ # (for magic-byte sniffs); +content_type+ is the normalized HTTP
34
+ # +Content-Type+ for web content, the {FileType.detect_mime}
35
+ # result for local files, and may be +nil+ ("no transport
36
+ # metadata — sniff if you can").
37
+ # * +extract(io)+ → +String+ — the whole document as
38
+ # Markdown-flavoured UTF-8 text. Raises {Error} on content the
39
+ # extractor claimed but cannot parse (malformed PDF, ...).
40
+ # * +kind+ → +Symbol+ — a short tag (+:text+ / +:pdf+ / +:html+)
41
+ # carried on {Page#kind} so rendering callers can word
42
+ # format-specific trailers ("End of PDF", the scanned-image
43
+ # hint) without re-sniffing.
44
+ #
45
+ # plus one *optional* method for formats whose lines can be
46
+ # produced incrementally:
47
+ #
48
+ # * +extract_lines(io)+ → +Enumerator<String>+ — the same content
49
+ # as +extract+, as a lazy stream of already-+chomp+ed lines.
50
+ # {.extract_paged} prefers this when present and stops consuming
51
+ # the moment the window fills, so the rest of the document is
52
+ # never parsed (pikuri-pdf's extractor: pdf-reader's page list
53
+ # parses on access; {Passthrough}: the IO is read line-by-line).
54
+ # The enumerator
55
+ # must be consumed while +io+ is still open, and may raise
56
+ # {Error} mid-iteration. Extractors that need the whole document
57
+ # to produce anything ({HTML}: Readability walks the full DOM —
58
+ # true of any subprocess-based extractor too) simply omit it;
59
+ # {.extract_paged} then extracts in full and windows the result.
60
+ #
61
+ # Windowing itself (offset / limit / byte cap / line truncation) is
62
+ # presentation and deliberately lives once in {.extract_paged}, not
63
+ # per extractor — +extract_lines+ is line *production*, the only
64
+ # genuinely format-specific half of paging.
65
+ #
66
+ # == Errors
67
+ #
68
+ # Both failure modes are failures the *caller's* LLM can react to,
69
+ # so they share one rescuable root:
70
+ #
71
+ # * {Unsupported} — nothing in {.registry} claimed the content
72
+ # (opaque binary, an unhandled content-type).
73
+ # * {Error} (the root) — an extractor claimed the content but the
74
+ # parse failed (malformed PDF, ...).
75
+ #
76
+ # Callers map them to their own conventions:
77
+ # +Tool::Scraper+ re-raises both as +FetchError+;
78
+ # {FileType.read_as_text} maps {Unsupported} to the +ArgumentError+
79
+ # binary refusal and {Error} to a +RuntimeError+ carrying the path.
80
+ module Extractor
81
+ module_function
82
+
83
+ # Raised when an extractor claims content but fails to parse it
84
+ # (e.g. a malformed PDF). Message is LLM-presentable.
85
+ Error = Class.new(StandardError)
86
+
87
+ # Raised by {.extract} / {.extract_paged} when no registry entry
88
+ # claims the content. Subclass of {Error} so callers that don't
89
+ # care about the distinction rescue one class.
90
+ Unsupported = Class.new(Error)
91
+
92
+ # @return [Integer] default line-window size for {.extract_paged}
93
+ # when the caller omits +limit+.
94
+ PAGE_DEFAULT_LIMIT = 2000
95
+
96
+ # @return [Integer] default hard byte cap on the content collected
97
+ # by a single {.extract_paged} call. Bypassable by paging via
98
+ # +offset+. The rendered output is slightly larger (line
99
+ # numbering, trailer) — that's the caller's concern.
100
+ PAGE_MAX_BYTES = 50 * 1024
101
+
102
+ # @return [Integer] default per-line character cap;
103
+ # {.extract_paged} truncates longer lines and appends
104
+ # {PAGE_LINE_TRUNCATION_MARKER}.
105
+ PAGE_MAX_LINE_LENGTH = 2000
106
+
107
+ # @return [String] suffix appended to a line truncated at
108
+ # {PAGE_MAX_LINE_LENGTH}.
109
+ PAGE_LINE_TRUNCATION_MARKER = "... (line truncated to #{PAGE_MAX_LINE_LENGTH} chars)"
110
+
111
+ # One windowed slice of a document, returned by {.extract_paged}.
112
+ # The caller turns this into an observation; this struct carries
113
+ # everything a trailer needs without the caller re-reading the
114
+ # document.
115
+ #
116
+ # == Fields
117
+ #
118
+ # * +lines+ — +Array<String>+, the collected window. Already
119
+ # per-line truncated (with {PAGE_LINE_TRUNCATION_MARKER}); *not*
120
+ # line-numbered — numbering is presentation the caller adds. For
121
+ # a PDF the array includes the +"--- Page N ---"+ marker lines
122
+ # pikuri-pdf's extractor emits, which count toward +limit+ / the
123
+ # byte cap like any other line.
124
+ # * +start_line+ — the 1-indexed line number of +lines.first+
125
+ # (i.e. the +offset+ the caller asked for). +lines.last+ is at
126
+ # +start_line + lines.length - 1+.
127
+ # * +total_lines+ — total line count of the document when known,
128
+ # else +nil+. Known when the read reached EOF, when the format
129
+ # was extracted in full (no +extract_lines+ — e.g. HTML), or
130
+ # when the lazy stream is cheap enough to count to the end
131
+ # (plain text). +nil+ when a lazy stream stopped early — the
132
+ # byte cap fired, or a PDF filled the window before its last
133
+ # page (counting the rest would mean parsing every page,
134
+ # defeating the laziness).
135
+ # * +more+ — +true+ if content remains past this window (the
136
+ # caller should offer +offset = start_line + lines.length+).
137
+ # * +byte_capped+ — +true+ if the byte cap (not the line limit)
138
+ # was the stopping criterion.
139
+ # * +kind+ — the matched extractor's +kind+ tag (+:text+ /
140
+ # +:pdf+ / +:html+); lets the caller word format-specific
141
+ # trailers and the empty-document message.
142
+ #
143
+ # An empty document yields +lines: []+, +total_lines: 0+; an
144
+ # +offset+ past EOF yields +lines: []+ with +total_lines+ set to
145
+ # the real (non-zero) count — the caller distinguishes the two.
146
+ Page = Data.define(:lines, :start_line, :total_lines, :more, :byte_capped, :kind)
147
+
148
+ # The extractor registry, consulted in order — first match wins.
149
+ # Core ships two entries: {HTML} matches on content-type, and
150
+ # {Passthrough} is the terminal plain-text arm. A gem adding a
151
+ # format picks its insertion point by the strength of its claim:
152
+ # a magic-byte sniff that never misfires on text goes at the
153
+ # *front* so it beats {HTML}'s content-type match even under a
154
+ # lying header (+registry.unshift(X)+ — pikuri-pdf does this);
155
+ # a content-type / weaker-sniff claimer inserts before the
156
+ # terminal entry (+registry.insert(-2, X)+ — pikuri-extractors
157
+ # does this).
158
+ #
159
+ # @return [Array<#matches?>] mutable, deliberately — this is the
160
+ # plug-in seam.
161
+ def registry
162
+ @registry ||= [HTML, Passthrough]
163
+ end
164
+
165
+ # Extract the whole document behind +io+ as one Markdown-flavoured
166
+ # UTF-8 String. May be empty (empty text file, scanned-image PDF
167
+ # with no extractable text).
168
+ #
169
+ # @param io [IO, StringIO] seekable IO positioned at the start of
170
+ # the content; this method reads a leading sample for the
171
+ # +matches?+ sniff and rewinds before extracting.
172
+ # @param content_type [String, nil] normalized content-type when
173
+ # the transport supplies one (HTTP header, {FileType.detect_mime}
174
+ # result); +nil+ when unknown — extractors then rely on their
175
+ # byte sniffs.
176
+ # @return [String]
177
+ # @raise [Unsupported] when no registry entry claims the content.
178
+ # @raise [Error] when the matched extractor cannot parse it.
179
+ def extract(io, content_type: nil)
180
+ extractor_for(io, content_type).extract(io)
181
+ end
182
+
183
+ # Extract +io+ and return a windowed {Page}: the lines from
184
+ # +offset+ (1-indexed) up to +limit+ of them, stopping early if
185
+ # +max_bytes+ is reached, with over-long lines truncated at
186
+ # +max_line_length+.
187
+ #
188
+ # Lazy where the format allows: extractors that implement
189
+ # +extract_lines+ (plain text; pikuri-pdf's PDF) are consumed
190
+ # only until the window fills — reading the first window of a
191
+ # 500-page PDF parses a handful of pages, and the first page of
192
+ # a gigabyte log never loads it. Extractors without it (HTML) are extracted
193
+ # in full and then windowed, which is also what makes their
194
+ # +total_lines+ always exact.
195
+ #
196
+ # @param io [IO, StringIO] seekable IO positioned at the start.
197
+ # @param content_type [String, nil] as for {.extract}.
198
+ # @param offset [Integer] 1-indexed first line to include. The
199
+ # caller is responsible for validating +offset >= 1+.
200
+ # @param limit [Integer] maximum lines to collect. Caller
201
+ # validates +limit >= 1+.
202
+ # @param max_bytes [Integer] hard byte cap on collected content.
203
+ # @param max_line_length [Integer] per-line truncation threshold.
204
+ # @return [Page] the windowed slice.
205
+ # @raise [Unsupported] when no registry entry claims the content.
206
+ # @raise [Error] when the matched extractor cannot parse it.
207
+ def extract_paged(io, content_type: nil, offset: 1, limit: PAGE_DEFAULT_LIMIT,
208
+ max_bytes: PAGE_MAX_BYTES, max_line_length: PAGE_MAX_LINE_LENGTH)
209
+ extractor = extractor_for(io, content_type)
210
+ if extractor.respond_to?(:extract_lines)
211
+ # count_tail is a per-format economics call: once the window
212
+ # fills, counting the rest of a plain-text stream is a cheap
213
+ # sequential read (so the trailer can say "of N"), while for
214
+ # a PDF it would mean parsing every remaining page — exactly
215
+ # what extract_lines exists to avoid. Plugged-in extractors
216
+ # (pikuri-pdf's included) get the conservative default (stop
217
+ # early, total unknown).
218
+ window(extractor.extract_lines(io),
219
+ offset: offset, limit: limit, max_bytes: max_bytes,
220
+ max_line_length: max_line_length, kind: extractor.kind,
221
+ known_total: nil, count_tail: extractor.equal?(Passthrough))
222
+ else
223
+ lines = extractor.extract(io).split("\n")
224
+ window(lines, offset: offset, limit: limit, max_bytes: max_bytes,
225
+ max_line_length: max_line_length, kind: extractor.kind,
226
+ known_total: lines.length)
227
+ end
228
+ end
229
+
230
+ # Find the first registry entry claiming +io+'s content: read the
231
+ # leading {FileType::SAMPLE_BYTES} for the sniff, rewind, and ask
232
+ # each extractor in order.
233
+ #
234
+ # @param io [IO, StringIO] seekable IO positioned at the start.
235
+ # @param content_type [String, nil]
236
+ # @return [#extract] the matched extractor.
237
+ # @raise [Unsupported] when nothing matches.
238
+ def extractor_for(io, content_type)
239
+ sample = io.read(FileType::SAMPLE_BYTES) || +''
240
+ io.rewind
241
+ registry.find { |ex| ex.matches?(sample: sample, content_type: content_type) } ||
242
+ raise(Unsupported, 'no extractor for this content' \
243
+ "#{content_type && !content_type.empty? ? " (content-type #{content_type.inspect})" : ''}")
244
+ end
245
+ private_class_method :extractor_for
246
+
247
+ # Collect a {Page} window out of +lines+ (an Array or a lazy
248
+ # Enumerator of already-+chomp+ed lines). +known_total+ is the
249
+ # full line count when the caller extracted everything up front
250
+ # (Array case), +nil+ for a lazy stream — then +total_lines+ is
251
+ # exact only if the iteration reached EOF: +count_tail+ keeps
252
+ # the loop counting (without collecting) past the line limit
253
+ # when consuming the rest of the stream is cheap; without it the
254
+ # loop breaks and leaves the total unknown. The byte cap always
255
+ # aborts the count.
256
+ #
257
+ # @param lines [Enumerable<String>]
258
+ # @param known_total [Integer, nil]
259
+ # @param count_tail [Boolean]
260
+ # @return [Page]
261
+ def window(lines, offset:, limit:, max_bytes:, max_line_length:, kind:,
262
+ known_total:, count_tail: false)
263
+ start_index = offset - 1
264
+ collected = []
265
+ seen = 0
266
+ bytes = 0
267
+ byte_capped = false
268
+ more = false
269
+ stopped_early = false
270
+
271
+ lines.each do |raw|
272
+ seen += 1
273
+ next if seen <= start_index
274
+
275
+ if collected.length >= limit
276
+ more = true
277
+ next if count_tail # keep counting so total_lines stays exact
278
+
279
+ stopped_early = true
280
+ break
281
+ end
282
+
283
+ line = truncate_line(raw, max_line_length)
284
+ size = line.bytesize + 1 # +1 for the joining newline
285
+ if bytes + size > max_bytes
286
+ byte_capped = true
287
+ more = true
288
+ stopped_early = true
289
+ break
290
+ end
291
+ collected << line
292
+ bytes += size
293
+ end
294
+
295
+ Page.new(lines: collected, start_line: offset,
296
+ total_lines: known_total || (stopped_early ? nil : seen),
297
+ more: more, byte_capped: byte_capped, kind: kind)
298
+ end
299
+ private_class_method :window
300
+
301
+ # Truncate +line+ to +max_line_length+ chars, appending
302
+ # {PAGE_LINE_TRUNCATION_MARKER} when it overflows.
303
+ #
304
+ # @param line [String]
305
+ # @param max_line_length [Integer]
306
+ # @return [String]
307
+ def truncate_line(line, max_line_length)
308
+ return line if line.length <= max_line_length
309
+
310
+ line[0, max_line_length] + PAGE_LINE_TRUNCATION_MARKER
311
+ end
312
+ private_class_method :truncate_line
313
+ end
314
+ end
@@ -1,10 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'pdf-reader'
4
-
5
3
  module Pikuri
6
- # Magic-byte content sniffing + text extraction, centralised. Three
7
- # responsibilities:
4
+ # Magic-byte content sniffing, plus the path-aware front over the
5
+ # {Extractor} registry. Two responsibilities:
8
6
  #
9
7
  # * {.detect_mime} — recognise a file from its leading bytes. Returns
10
8
  # a MIME String for formats pikuri knows how to handle specially
@@ -15,12 +13,16 @@ module Pikuri
15
13
  # {.detect_mime}: a file can be both recognised (e.g. PDF) *and*
16
14
  # binary. {.detect_mime} tells you what the bytes are;
17
15
  # {.binary?} tells you whether they're safe to render as text.
18
- # * {.read_as_text} — read a file and return its content as plain
19
- # UTF-8 text. PDFs go through +pdf-reader+ page-by-page; plain
20
- # text passes through; images / binaries / missing files raise.
21
- # The pure-extraction shape consumers like +Pikuri::VectorDb+'s
22
- # indexer want (no LLM-tool concerns — no paging, no line
23
- # numbering, no byte caps; just bytes-in-text-out).
16
+ #
17
+ # On top of those sit the two +Pathname+ conveniences,
18
+ # {.read_as_text} (whole document, the {Pikuri::VectorDb} indexer's
19
+ # shape) and {.read_as_text_paged} (line-windowed, the Read tools'
20
+ # shape). Both are thin wrappers: they own the *path-level* refusals
21
+ # (missing file, directory, image) and the exception mapping, then
22
+ # hand the opened IO to {Extractor.extract} /
23
+ # {Extractor.extract_paged} — which format the bytes are and how
24
+ # they become text is entirely the registry's business, so a gem
25
+ # plugging a new extractor in extends these wrappers for free.
24
26
  #
25
27
  # {.detect_mime} and {.binary?} accept either a +String+ of bytes
26
28
  # (sample taken by the caller) or a +Pathname+ — when given a path,
@@ -28,8 +30,7 @@ module Pikuri
28
30
  # for the sniff itself. The Pathname form is the convenience path;
29
31
  # the bytes form is for callers that already have the sample or are
30
32
  # calling both methods on the same file and want to avoid a second
31
- # open. {.read_as_text} takes a +Pathname+ only — there's no
32
- # bytes-in shortcut because the PDF case needs to seek the file.
33
+ # open.
33
34
  #
34
35
  # == Why a separate module
35
36
  #
@@ -39,8 +40,7 @@ module Pikuri
39
40
  # {.binary?} reached for by {Workspace::Edit}. Collecting the
40
41
  # detection logic here lets {Read} focus on routing
41
42
  # (mime-to-formatter), {Edit} drop its cross-tool reach, and new
42
- # tools (a future +Workspace::Diff+, an attachment-aware web fetcher,
43
- # ...) share one set of magic-byte truths.
43
+ # tools share one set of magic-byte truths.
44
44
  #
45
45
  # == Deliberate non-goals
46
46
  #
@@ -136,19 +136,13 @@ module Pikuri
136
136
  non_printable.to_f / bytes.bytesize > BINARY_NONPRINTABLE_THRESHOLD
137
137
  end
138
138
 
139
- # Read +path+ and return its content as plain UTF-8 text. Two
140
- # extraction paths, picked by {.detect_mime}:
141
- #
142
- # * **PDF** — walked page-by-page via +pdf-reader+; each page's
143
- # extracted text is stripped and pages are joined with a blank
144
- # line. A scanned-image PDF (no extractable text) comes back as
145
- # the empty String — a deliberate silent skip, callers detect by
146
- # length if they care.
147
- # * **Plain text** — anything that {.detect_mime} doesn't
148
- # recognise and that {.binary?} accepts. Read with UTF-8
149
- # encoding; behaviour on non-UTF-8 bytes is whatever +File.read+
150
- # does with +encoding: Encoding::UTF_8+ (which is "leave invalid
151
- # bytes in, let downstream decide").
139
+ # Read +path+ and return its content as plain UTF-8 text, routed
140
+ # through the {Extractor} registry: anything
141
+ # unrecognised-but-textual passes through verbatim
142
+ # ({Extractor::Passthrough}); with pikuri-pdf registered, PDFs
143
+ # are extracted with +"--- Page N ---"+ markers (a scanned-image
144
+ # PDF with no extractable text comes back as the empty String, a
145
+ # deliberate silent skip callers detect by length if they care).
152
146
  #
153
147
  # Refusal cases — all raise rather than returning a sentinel
154
148
  # because the callers are internal pikuri code, not an LLM
@@ -159,13 +153,11 @@ module Pikuri
159
153
  # * Path is a directory → +ArgumentError+.
160
154
  # * Image (PNG / JPEG / GIF / WebP per {.detect_mime}) →
161
155
  # +ArgumentError+; images aren't text.
162
- # * Binary content (per {.binary?}) and not a recognised MIME →
163
- # +ArgumentError+.
164
- # * Malformed PDF — +pdf-reader+'s
165
- # +MalformedPDFError+ / +UnsupportedFeatureError+ /
166
- # +InvalidPageError+ are re-raised as a +RuntimeError+ with the
167
- # path included so callers don't need to know pdf-reader's
168
- # exception hierarchy.
156
+ # * Content no extractor claims (opaque binary) →
157
+ # +ArgumentError+, mapped from {Extractor::Unsupported}.
158
+ # * Extraction failure (malformed PDF, ...) → +RuntimeError+ with
159
+ # the path included, mapped from {Extractor::Error} so callers
160
+ # don't need to know any extractor's exception hierarchy.
169
161
  #
170
162
  # @param path [Pathname] file to read.
171
163
  # @return [String] UTF-8 text. May be empty (empty text file, or
@@ -173,40 +165,76 @@ module Pikuri
173
165
  # @raise [ArgumentError] if +path+ isn't a +Pathname+, points at
174
166
  # a directory, is an image, or is binary.
175
167
  # @raise [Errno::ENOENT] if +path+ doesn't exist.
176
- # @raise [RuntimeError] on a malformed / unsupported PDF.
168
+ # @raise [RuntimeError] on an extraction failure (malformed /
169
+ # unsupported PDF, ...).
177
170
  def read_as_text(path)
171
+ mime = guard_extractable(path)
172
+ path.open('rb') { |io| Extractor.extract(io, content_type: mime) }
173
+ rescue Extractor::Unsupported
174
+ raise ArgumentError, "#{path} appears to be binary; cannot extract as text"
175
+ rescue Extractor::Error => e
176
+ raise "Cannot extract text from #{path}: #{e.message}"
177
+ end
178
+
179
+ # Extract +path+ and return a windowed {Extractor::Page}: the
180
+ # lines from +offset+ (1-indexed) up to +limit+ of them, stopping
181
+ # early if +max_bytes+ is reached, with over-long lines truncated
182
+ # at +max_line_length+. Same routing and refusal contract as
183
+ # {.read_as_text}; the windowing semantics (including the lazy
184
+ # +extract_lines+ consumption that stops parsing once the window
185
+ # fills) are {Extractor.extract_paged}'s.
186
+ # The LLM-facing callers map the exceptions into +"Error: ..."+
187
+ # observations themselves.
188
+ #
189
+ # @param path [Pathname] file to read.
190
+ # @param offset [Integer] 1-indexed first line to include. The
191
+ # caller is responsible for validating +offset >= 1+.
192
+ # @param limit [Integer] maximum lines to collect. Caller
193
+ # validates +limit >= 1+.
194
+ # @param max_bytes [Integer] hard byte cap on collected content.
195
+ # @param max_line_length [Integer] per-line truncation threshold.
196
+ # @return [Extractor::Page] the windowed slice.
197
+ # @raise [ArgumentError] if +path+ isn't a +Pathname+, is a
198
+ # directory, an image, or binary.
199
+ # @raise [Errno::ENOENT] if +path+ doesn't exist.
200
+ # @raise [RuntimeError] on an extraction failure (malformed /
201
+ # unsupported PDF, ...).
202
+ def read_as_text_paged(path, offset: 1, limit: Extractor::PAGE_DEFAULT_LIMIT,
203
+ max_bytes: Extractor::PAGE_MAX_BYTES,
204
+ max_line_length: Extractor::PAGE_MAX_LINE_LENGTH)
205
+ mime = guard_extractable(path)
206
+ path.open('rb') do |io|
207
+ Extractor.extract_paged(io, content_type: mime, offset: offset, limit: limit,
208
+ max_bytes: max_bytes, max_line_length: max_line_length)
209
+ end
210
+ rescue Extractor::Unsupported
211
+ raise ArgumentError, "#{path} appears to be binary; cannot extract as text"
212
+ rescue Extractor::Error => e
213
+ raise "Cannot extract text from #{path}: #{e.message}"
214
+ end
215
+
216
+ # The shared path-level refusals for {.read_as_text} /
217
+ # {.read_as_text_paged}: must be an existing non-directory
218
+ # +Pathname+, and not an image (images are data for a vision
219
+ # model, never text). Returns the {.detect_mime} result so the
220
+ # caller can pass it to the {Extractor} as the content-type hint.
221
+ #
222
+ # @param path [Pathname]
223
+ # @return [String, nil] the sniffed MIME type.
224
+ # @raise [ArgumentError] on a non-Pathname, a directory, or an
225
+ # image.
226
+ # @raise [Errno::ENOENT] if +path+ doesn't exist.
227
+ def guard_extractable(path)
178
228
  raise ArgumentError, "expected Pathname, got #{path.class}" unless path.is_a?(Pathname)
179
229
  raise Errno::ENOENT, path.to_s unless path.exist?
180
230
  raise ArgumentError, "#{path} is a directory" if path.directory?
181
231
 
182
232
  mime = detect_mime(path)
183
- return read_pdf_text(path) if mime == 'application/pdf'
184
233
  raise ArgumentError, "#{path} is an image (#{mime}); cannot extract as text" if mime&.start_with?('image/')
185
- raise ArgumentError, "#{path} appears to be binary; cannot extract as text" if binary?(path)
186
234
 
187
- path.read(encoding: Encoding::UTF_8)
188
- end
189
-
190
- # Walk a PDF page-by-page via +pdf-reader+, returning a single
191
- # String with non-empty page texts joined by blank lines. Catches
192
- # the three +PDF::Reader+ exceptions Workspace::Read also handles
193
- # and re-raises them as +RuntimeError+ with the path included.
194
- #
195
- # @param path [Pathname]
196
- # @return [String]
197
- # @raise [RuntimeError] on malformed / unsupported PDF.
198
- def read_pdf_text(path)
199
- pages = path.open('rb') do |io|
200
- ::PDF::Reader.new(io).pages.map { |p| p.text.strip }
201
- end
202
- pages.reject(&:empty?).join("\n\n")
203
- rescue ::PDF::Reader::MalformedPDFError,
204
- ::PDF::Reader::UnsupportedFeatureError,
205
- ::PDF::Reader::InvalidPageError => e
206
- raise "Cannot extract PDF text from #{path}: " \
207
- "#{e.class.name.split('::').last}: #{e.message}"
235
+ mime
208
236
  end
209
- private_class_method :read_pdf_text
237
+ private_class_method :guard_extractable
210
238
 
211
239
  # Coerce an +input+ argument into a bytes String for the sniffs.
212
240
  # +String+ inputs are returned as-is (caller already sampled);
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pikuri
4
+ # Process-global teardown registry: one +at_exit+ for the whole
5
+ # process, with everything that owns a resource needing orderly
6
+ # shutdown (agents, {VectorDb::Server::Chroma}, future background
7
+ # workers) registering here instead of growing its own +at_exit+.
8
+ # It is {Agent#on_close} promoted from per-agent to per-process —
9
+ # the same LIFO + per-handler-rescue + idempotent shape, one level
10
+ # up.
11
+ #
12
+ # == Why one chokepoint
13
+ #
14
+ # Independent +at_exit+ hooks fire in an order decided by file load
15
+ # order, which is invisible and fragile. Routing every teardown
16
+ # through one registry makes the order explicit and controllable:
17
+ # the SIGTERM-the-strays backstop ({Subprocess.cleanup!}) registers
18
+ # at *load* time, so it sits at the bottom of the LIFO stack and runs
19
+ # *last* — after agents and servers (which register at *construction*
20
+ # time) have closed gracefully, while the subprocess machinery they
21
+ # shell out to during close (e.g. {VectorDb::Server::Chroma#close}'s
22
+ # +docker stop+) is still live.
23
+ #
24
+ # == Contract
25
+ #
26
+ # A registrant MUST respond to +#close+, and +#close+ MUST be
27
+ # idempotent and tolerant of running at process exit — the host may
28
+ # also have closed it explicitly earlier. Pass a block instead for
29
+ # teardown that has no natural +#close+ (e.g.
30
+ # +Finalizers.register { Pikuri::Subprocess.cleanup! }+).
31
+ #
32
+ # == Order: LIFO
33
+ #
34
+ # Last registered, first closed — Ruby +ensure+ semantics. A
35
+ # registrant that depends on an earlier one (a background indexer
36
+ # writing into {VectorDb::Server::Chroma}) is registered later and so
37
+ # tears down first. Registration order is therefore dependency order;
38
+ # register the dependency before its dependents.
39
+ #
40
+ # == Errors are contained
41
+ #
42
+ # Each +#close+ runs inside its own +rescue+: a raise is logged via
43
+ # {Pikuri.logger_for} and the sweep continues, so one botched
44
+ # teardown can't strand the rest. {.run!} drains the registry, so a
45
+ # second call (an explicit one, then the +at_exit+) closes nothing.
46
+ module Finalizers
47
+ # @return [Logger] subsystem logger for contained teardown failures.
48
+ LOGGER = Pikuri.logger_for('Finalizers')
49
+
50
+ # Adapts a teardown block to the +#close+ protocol, so a block and a
51
+ # closeable object can share one registry.
52
+ Closer = Struct.new(:block) do
53
+ # @return [void]
54
+ def close
55
+ block.call
56
+ end
57
+ end
58
+
59
+ @registered = []
60
+ @mutex = Mutex.new
61
+
62
+ class << self
63
+ # Register a closeable (or a block) to be torn down at process
64
+ # exit. Returns the registered handle so the caller can later
65
+ # {.unregister} it — a resource closed explicitly before exit
66
+ # should drop out so it can be garbage-collected rather than
67
+ # pinned alive until the process dies.
68
+ #
69
+ # @param closeable [#close, nil] resource to close at exit; omit
70
+ # when passing a block
71
+ # @yield teardown to run at exit, for resources with no +#close+
72
+ # @return [#close] the registered handle — the object itself, or
73
+ # the {Closer} wrapping the block; pass it to {.unregister}
74
+ # @raise [ArgumentError] if neither an object nor a block is given
75
+ def register(closeable = nil, &block)
76
+ unless closeable || block
77
+ raise ArgumentError, 'Finalizers.register requires an object or a block'
78
+ end
79
+
80
+ handle = closeable || Closer.new(block)
81
+ @mutex.synchronize { @registered << handle }
82
+ handle
83
+ end
84
+
85
+ # Drop a previously-registered handle. Idempotent — unregistering
86
+ # something already gone (or never registered) is a no-op.
87
+ #
88
+ # @param handle [#close] the value returned by {.register}
89
+ # @return [void]
90
+ def unregister(handle)
91
+ @mutex.synchronize { @registered.delete(handle) }
92
+ nil
93
+ end
94
+
95
+ # Close every registrant in LIFO order, each guarded by its own
96
+ # +rescue+. Wired to +at_exit+ at the bottom of this file.
97
+ # Draining the registry under the lock makes a repeat call a
98
+ # no-op and keeps it safe against a concurrent caller.
99
+ #
100
+ # @return [void]
101
+ def run!
102
+ handles = @mutex.synchronize do
103
+ taken = @registered.reverse
104
+ @registered.clear
105
+ taken
106
+ end
107
+
108
+ handles.each do |handle|
109
+ handle.close
110
+ rescue StandardError => e
111
+ LOGGER.warn("finalizer #{handle.class} raised #{e.class}: #{e.message}")
112
+ end
113
+ end
114
+ end
115
+ end
116
+ end
117
+
118
+ at_exit { Pikuri::Finalizers.run! }
@@ -0,0 +1,29 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+
5
+ module Pikuri
6
+ # Standardized on-disk locations for pikuri's local state. Centralizes
7
+ # the XDG resolution so every component that caches to disk roots under
8
+ # one place instead of each re-deriving it — currently
9
+ # {Pikuri::VectorDb::Server::Chroma} (its corpus volume) and
10
+ # {Pikuri::Memory::Mem0Server} (its mem0 checkout + data volume).
11
+ module Paths
12
+ # Pikuri's cache root: +$XDG_CACHE_HOME/pikuri+ when +XDG_CACHE_HOME+
13
+ # is set and non-empty, else +~/.cache/pikuri+.
14
+ #
15
+ # A method, not a frozen constant, on purpose: a constant would
16
+ # snapshot +XDG_CACHE_HOME+ at +require+ time, which breaks
17
+ # env-stubbing in tests and ignores a runtime change in a long-lived
18
+ # process. The directory is *not* created — callers +mkdir_p+ the
19
+ # subdirectory they need (e.g. +Paths.cache.join('chroma')+,
20
+ # +Paths.cache.join('mem0')+).
21
+ #
22
+ # @return [Pathname] the +<cache home>/pikuri+ directory
23
+ def self.cache
24
+ home = ENV['XDG_CACHE_HOME']
25
+ home = File.expand_path('~/.cache') if home.nil? || home.empty?
26
+ Pathname.new(home).join('pikuri')
27
+ end
28
+ end
29
+ end