pikuri-core 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +10 -0
- data/lib/pikuri/agent/chat_transport.rb +6 -5
- data/lib/pikuri/agent/configurator.rb +59 -77
- data/lib/pikuri/agent/context_window_detector.rb +70 -10
- data/lib/pikuri/agent/control/cancellable.rb +7 -17
- data/lib/pikuri/agent/control/interloper.rb +20 -23
- data/lib/pikuri/agent/control/step_limit.rb +0 -14
- data/lib/pikuri/agent/event.rb +15 -0
- data/lib/pikuri/agent/extension.rb +49 -23
- data/lib/pikuri/agent/listener/terminal.rb +5 -1
- data/lib/pikuri/agent/listener/token_log.rb +20 -21
- data/lib/pikuri/agent/listener_list.rb +7 -5
- data/lib/pikuri/agent/synthesizer.rb +2 -2
- data/lib/pikuri/agent.rb +257 -164
- data/lib/pikuri/file_type.rb +457 -0
- data/lib/pikuri/finalizers.rb +118 -0
- data/lib/pikuri/paths.rb +29 -0
- data/lib/pikuri/subprocess.rb +45 -12
- data/lib/pikuri/tool/parameters.rb +64 -3
- data/lib/pikuri/tool.rb +15 -7
- data/lib/pikuri/version.rb +1 -1
- metadata +5 -3
- data/lib/pikuri/tool/sub_agent.rb +0 -150
|
@@ -0,0 +1,457 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pdf-reader'
|
|
4
|
+
|
|
5
|
+
module Pikuri
|
|
6
|
+
# Magic-byte content sniffing + text extraction, centralised. Four
|
|
7
|
+
# responsibilities:
|
|
8
|
+
#
|
|
9
|
+
# * {.detect_mime} — recognise a file from its leading bytes. Returns
|
|
10
|
+
# a MIME String for formats pikuri knows how to handle specially
|
|
11
|
+
# ({+application/pdf+}, the four image formats), or +nil+ for
|
|
12
|
+
# "unrecognised — could be text, could be opaque binary; caller
|
|
13
|
+
# decides".
|
|
14
|
+
# * {.binary?} — heuristic text-vs-binary classifier. Independent of
|
|
15
|
+
# {.detect_mime}: a file can be both recognised (e.g. PDF) *and*
|
|
16
|
+
# binary. {.detect_mime} tells you what the bytes are;
|
|
17
|
+
# {.binary?} tells you whether they're safe to render as text.
|
|
18
|
+
# * {.read_as_text} — read a file and return its content as plain
|
|
19
|
+
# UTF-8 text. PDFs go through +pdf-reader+ page-by-page; plain
|
|
20
|
+
# text passes through; images / binaries / missing files raise.
|
|
21
|
+
# The pure-extraction shape consumers like +Pikuri::VectorDb+'s
|
|
22
|
+
# indexer want (no LLM-tool concerns — no paging, no line
|
|
23
|
+
# numbering, no byte caps; just bytes-in-text-out).
|
|
24
|
+
# * {.read_as_text_paged} — the LLM-tool shape: the same
|
|
25
|
+
# extraction as {.read_as_text}, but lazily windowed to a
|
|
26
|
+
# line range with a byte cap, returning a {Page} value the
|
|
27
|
+
# caller renders. Shared by +Workspace::Read+ and
|
|
28
|
+
# +VectorDb::Tools::Read+ so the offset/limit/byte-cap windowing lives
|
|
29
|
+
# in one tested place; each tool keeps its own presentation
|
|
30
|
+
# (cat-n numbering, trailer wording, citation vs. path). Same
|
|
31
|
+
# refusal contract as {.read_as_text} (raises on image / binary
|
|
32
|
+
# / missing / malformed-PDF).
|
|
33
|
+
#
|
|
34
|
+
# {.detect_mime} and {.binary?} accept either a +String+ of bytes
|
|
35
|
+
# (sample taken by the caller) or a +Pathname+ — when given a path,
|
|
36
|
+
# the module opens the file in binary mode and reads {SAMPLE_BYTES}
|
|
37
|
+
# for the sniff itself. The Pathname form is the convenience path;
|
|
38
|
+
# the bytes form is for callers that already have the sample or are
|
|
39
|
+
# calling both methods on the same file and want to avoid a second
|
|
40
|
+
# open. {.read_as_text} takes a +Pathname+ only — there's no
|
|
41
|
+
# bytes-in shortcut because the PDF case needs to seek the file.
|
|
42
|
+
#
|
|
43
|
+
# == Why a separate module
|
|
44
|
+
#
|
|
45
|
+
# Without this module, magic-byte tables and the binary heuristic
|
|
46
|
+
# ended up scattered through whichever tool needed them — first PDF
|
|
47
|
+
# in {Workspace::Read}, then images alongside it, then a copy of
|
|
48
|
+
# {.binary?} reached for by {Workspace::Edit}. Collecting the
|
|
49
|
+
# detection logic here lets {Read} focus on routing
|
|
50
|
+
# (mime-to-formatter), {Edit} drop its cross-tool reach, and new
|
|
51
|
+
# tools (a future +Workspace::Diff+, an attachment-aware web fetcher,
|
|
52
|
+
# ...) share one set of magic-byte truths.
|
|
53
|
+
#
|
|
54
|
+
# == Deliberate non-goals
|
|
55
|
+
#
|
|
56
|
+
# * *Not a full MIME database.* The set grows when a pikuri tool
|
|
57
|
+
# needs a new format, not speculatively. Keeps the "audit in an
|
|
58
|
+
# evening" ceiling honest.
|
|
59
|
+
# * *No path / extension fallback.* Extensions lie (a renamed
|
|
60
|
+
# +.png+ → opaque garbage); magic-byte detection on the actual
|
|
61
|
+
# content is the source of truth. Callers that need
|
|
62
|
+
# extension-based behaviour can layer it themselves.
|
|
63
|
+
# * *No convenience predicates* like +image?+ / +pdf?+. Callers do
|
|
64
|
+
# +mime == 'application/pdf'+ or +mime&.start_with?('image/')+ —
|
|
65
|
+
# one extra character, zero added API surface.
|
|
66
|
+
module FileType
|
|
67
|
+
module_function
|
|
68
|
+
|
|
69
|
+
# @return [Integer] recommended number of bytes to sample for
|
|
70
|
+
# {.detect_mime} and {.binary?}. Big enough to catch every
|
|
71
|
+
# prefix pikuri sniffs today (the largest is WebP's 12-byte
|
|
72
|
+
# container header) with comfortable slack; small enough that
|
|
73
|
+
# reading it off any reasonable filesystem is effectively free.
|
|
74
|
+
SAMPLE_BYTES = 4096
|
|
75
|
+
|
|
76
|
+
# @return [Float] fraction of the sample that may be non-printable
|
|
77
|
+
# before {.binary?} flags the bytes as binary. Matches opencode's
|
|
78
|
+
# threshold.
|
|
79
|
+
BINARY_NONPRINTABLE_THRESHOLD = 0.30
|
|
80
|
+
|
|
81
|
+
# @return [Hash{String => String}] magic-byte prefixes → MIME types
|
|
82
|
+
# for the image formats with flat (offset-zero, fixed-length)
|
|
83
|
+
# signatures. WebP isn't here — its signature is split across the
|
|
84
|
+
# RIFF container header — and is handled directly in
|
|
85
|
+
# {.detect_mime}.
|
|
86
|
+
IMAGE_MAGIC_BYTES = {
|
|
87
|
+
"\x89PNG\r\n\x1a\n".b => 'image/png',
|
|
88
|
+
"\xff\xd8\xff".b => 'image/jpeg',
|
|
89
|
+
"GIF87a".b => 'image/gif',
|
|
90
|
+
"GIF89a".b => 'image/gif'
|
|
91
|
+
}.freeze
|
|
92
|
+
|
|
93
|
+
# @return [String] PDF magic prefix. Every conformant PDF starts
|
|
94
|
+
# with this five-byte ASCII sequence per ISO 32000-1 §7.5.2.
|
|
95
|
+
PDF_MAGIC = '%PDF-'
|
|
96
|
+
|
|
97
|
+
# @return [Integer] default line-window size for
|
|
98
|
+
# {.read_as_text_paged} when the caller omits +limit+.
|
|
99
|
+
PAGE_DEFAULT_LIMIT = 2000
|
|
100
|
+
|
|
101
|
+
# @return [Integer] default hard byte cap on the content collected
|
|
102
|
+
# by a single {.read_as_text_paged} call. Bypassable by paging
|
|
103
|
+
# via +offset+. The rendered output is slightly larger (line
|
|
104
|
+
# numbering, trailer) — that's the caller's concern.
|
|
105
|
+
PAGE_MAX_BYTES = 50 * 1024
|
|
106
|
+
|
|
107
|
+
# @return [Integer] default per-line character cap;
|
|
108
|
+
# {.read_as_text_paged} truncates longer lines and appends
|
|
109
|
+
# {PAGE_LINE_TRUNCATION_MARKER}.
|
|
110
|
+
PAGE_MAX_LINE_LENGTH = 2000
|
|
111
|
+
|
|
112
|
+
# @return [String] suffix appended to a line truncated at
|
|
113
|
+
# {PAGE_MAX_LINE_LENGTH}.
|
|
114
|
+
PAGE_LINE_TRUNCATION_MARKER = "... (line truncated to #{PAGE_MAX_LINE_LENGTH} chars)"
|
|
115
|
+
|
|
116
|
+
# One windowed slice of a document, returned by
|
|
117
|
+
# {.read_as_text_paged}. The caller turns this into an
|
|
118
|
+
# observation; this struct carries everything a trailer needs
|
|
119
|
+
# without the caller re-reading the file.
|
|
120
|
+
#
|
|
121
|
+
# == Fields
|
|
122
|
+
#
|
|
123
|
+
# * +lines+ — +Array<String>+, the collected window. Already
|
|
124
|
+
# per-line truncated (with {PAGE_LINE_TRUNCATION_MARKER}); *not*
|
|
125
|
+
# line-numbered — numbering is presentation the caller adds. For
|
|
126
|
+
# a PDF the array includes +"--- Page N ---"+ marker lines (one
|
|
127
|
+
# per page that contributed text), which count toward +limit+ /
|
|
128
|
+
# the byte cap like any other line.
|
|
129
|
+
# * +start_line+ — the 1-indexed line number of +lines.first+
|
|
130
|
+
# (i.e. the +offset+ the caller asked for). +lines.last+ is at
|
|
131
|
+
# +start_line + lines.length - 1+.
|
|
132
|
+
# * +total_lines+ — total line count of the document when known,
|
|
133
|
+
# else +nil+. Known when extraction reached EOF (so the caller
|
|
134
|
+
# can print "of N"); +nil+ when the read stopped early — the
|
|
135
|
+
# byte cap fired, or a PDF filled the window before its last
|
|
136
|
+
# page (counting the rest would defeat the laziness).
|
|
137
|
+
# * +more+ — +true+ if content remains past this window (the
|
|
138
|
+
# caller should offer +offset = start_line + lines.length+).
|
|
139
|
+
# * +byte_capped+ — +true+ if {PAGE_MAX_BYTES} (not the line
|
|
140
|
+
# limit) was the stopping criterion.
|
|
141
|
+
# * +kind+ — +:text+ or +:pdf+; lets the caller word PDF-specific
|
|
142
|
+
# trailers and the empty-document message.
|
|
143
|
+
#
|
|
144
|
+
# An empty document yields +lines: []+, +total_lines: 0+; an
|
|
145
|
+
# +offset+ past EOF yields +lines: []+ with +total_lines+ set to
|
|
146
|
+
# the real (non-zero) count — the caller distinguishes the two.
|
|
147
|
+
Page = Data.define(:lines, :start_line, :total_lines, :more, :byte_capped, :kind)
|
|
148
|
+
|
|
149
|
+
# Recognise a file from its leading bytes. Returns the MIME type
|
|
150
|
+
# as a String for formats pikuri handles specially, or +nil+ for
|
|
151
|
+
# "unrecognised" — callers interpret +nil+ themselves (text,
|
|
152
|
+
# opaque binary, ...).
|
|
153
|
+
#
|
|
154
|
+
# @param input [String, Pathname] the bytes to inspect, or a
|
|
155
|
+
# +Pathname+ that this method opens in binary mode and reads up
|
|
156
|
+
# to {SAMPLE_BYTES} from. Caller is responsible for verifying the
|
|
157
|
+
# path exists; missing-file errors propagate as +Errno::ENOENT+.
|
|
158
|
+
# @return [String, nil]
|
|
159
|
+
def detect_mime(input)
  head = sample_of(input)

  # PDF is checked first, ahead of the image signature table.
  return 'application/pdf' if head.start_with?(PDF_MAGIC)

  # Flat, offset-zero image signatures: first matching prefix wins.
  _magic, image_mime = IMAGE_MAGIC_BYTES.find { |magic, _mime| head.start_with?(magic) }
  return image_mime if image_mime

  # WebP: "RIFF" at bytes 0-3 and "WEBP" at bytes 8-11, so at least 12
  # bytes must be present before the two slices are compared.
  webp = head.bytesize >= 12 &&
         head.byteslice(0, 4) == 'RIFF'.b &&
         head.byteslice(8, 4) == 'WEBP'.b
  webp ? 'image/webp' : nil
end
|
|
172
|
+
|
|
173
|
+
# Heuristic text-vs-binary classifier matching opencode's: any
|
|
174
|
+
# +NUL+ byte forces +true+; otherwise count bytes outside the
|
|
175
|
+
# printable +\t \n \v \f \r+ + ASCII-32..126 range and ratio
|
|
176
|
+
# against the sample size. UTF-8 continuation bytes (0x80-0xBF)
|
|
177
|
+
# are >127 so they sit outside the non-printable ranges and pass
|
|
178
|
+
# through unflagged, letting UTF-8 text read fine. An empty
|
|
179
|
+
# sample is treated as not-binary (callers reading an empty file
|
|
180
|
+
# take the empty-text path).
|
|
181
|
+
#
|
|
182
|
+
# @param input [String, Pathname] the bytes to inspect, or a
|
|
183
|
+
# +Pathname+ that this method opens in binary mode and reads up
|
|
184
|
+
# to {SAMPLE_BYTES} from. Caller is responsible for verifying
|
|
185
|
+
# the path exists.
|
|
186
|
+
# @return [Boolean]
|
|
187
|
+
def binary?(input)
  sample = sample_of(input)
  return false if sample.empty?

  suspicious = 0
  sample.each_byte do |byte|
    # A single NUL byte settles it immediately.
    return true if byte.zero?

    # Bytes 9..13 (tab, LF, VT, FF, CR) and everything from 32 upwards
    # are not counted; the remaining control bytes are.
    suspicious += 1 unless byte.between?(9, 13) || byte >= 32
  end

  suspicious.fdiv(sample.bytesize) > BINARY_NONPRINTABLE_THRESHOLD
end
|
|
199
|
+
|
|
200
|
+
# Read +path+ and return its content as plain UTF-8 text. Two
|
|
201
|
+
# extraction paths, picked by {.detect_mime}:
|
|
202
|
+
#
|
|
203
|
+
# * **PDF** — walked page-by-page via +pdf-reader+; each page's
|
|
204
|
+
# extracted text is stripped and pages are joined with a blank
|
|
205
|
+
# line. A scanned-image PDF (no extractable text) comes back as
|
|
206
|
+
# the empty String — a deliberate silent skip, callers detect by
|
|
207
|
+
# length if they care.
|
|
208
|
+
# * **Plain text** — anything that {.detect_mime} doesn't
|
|
209
|
+
# recognise and that {.binary?} accepts. Read with UTF-8
|
|
210
|
+
# encoding; behaviour on non-UTF-8 bytes is whatever +File.read+
|
|
211
|
+
# does with +encoding: Encoding::UTF_8+ (which is "leave invalid
|
|
212
|
+
# bytes in, let downstream decide").
|
|
213
|
+
#
|
|
214
|
+
# Refusal cases — all raise rather than returning a sentinel
|
|
215
|
+
# because the callers are internal pikuri code, not an LLM
|
|
216
|
+
# tool. The LLM-facing +Workspace::Read+ does its own routing and
|
|
217
|
+
# returns "Error: ..." observations; that's a separate concern.
|
|
218
|
+
#
|
|
219
|
+
# * Path doesn't exist → +Errno::ENOENT+.
|
|
220
|
+
# * Path is a directory → +ArgumentError+.
|
|
221
|
+
# * Image (PNG / JPEG / GIF / WebP per {.detect_mime}) →
|
|
222
|
+
# +ArgumentError+; images aren't text.
|
|
223
|
+
# * Binary content (per {.binary?}) and not a recognised MIME →
|
|
224
|
+
# +ArgumentError+.
|
|
225
|
+
# * Malformed PDF — +pdf-reader+'s
|
|
226
|
+
# +MalformedPDFError+ / +UnsupportedFeatureError+ /
|
|
227
|
+
# +InvalidPageError+ are re-raised as a +RuntimeError+ with the
|
|
228
|
+
# path included so callers don't need to know pdf-reader's
|
|
229
|
+
# exception hierarchy.
|
|
230
|
+
#
|
|
231
|
+
# @param path [Pathname] file to read.
|
|
232
|
+
# @return [String] UTF-8 text. May be empty (empty text file, or
|
|
233
|
+
# scanned-image PDF).
|
|
234
|
+
# @raise [ArgumentError] if +path+ isn't a +Pathname+, points at
|
|
235
|
+
# a directory, is an image, or is binary.
|
|
236
|
+
# @raise [Errno::ENOENT] if +path+ doesn't exist.
|
|
237
|
+
# @raise [RuntimeError] on a malformed / unsupported PDF.
|
|
238
|
+
def read_as_text(path)
  # Argument and filesystem guards, in the documented order.
  raise ArgumentError, "expected Pathname, got #{path.class}" unless path.is_a?(Pathname)
  raise Errno::ENOENT, path.to_s unless path.exist?
  raise ArgumentError, "#{path} is a directory" if path.directory?

  mime = detect_mime(path)
  case mime
  when 'application/pdf'
    read_pdf_text(path)
  when %r{\Aimage/}
    raise ArgumentError, "#{path} is an image (#{mime}); cannot extract as text"
  else
    # Unrecognised content: only read it when the heuristic accepts it as text.
    raise ArgumentError, "#{path} appears to be binary; cannot extract as text" if binary?(path)

    path.read(encoding: Encoding::UTF_8)
  end
end
|
|
250
|
+
|
|
251
|
+
# Walk a PDF page-by-page via +pdf-reader+, returning a single
|
|
252
|
+
# String with non-empty page texts joined by blank lines. Catches
|
|
253
|
+
# the three +PDF::Reader+ exceptions Workspace::Read also handles
|
|
254
|
+
# and re-raises them as +RuntimeError+ with the path included.
|
|
255
|
+
#
|
|
256
|
+
# @param path [Pathname]
|
|
257
|
+
# @return [String]
|
|
258
|
+
# @raise [RuntimeError] on malformed / unsupported PDF.
|
|
259
|
+
def read_pdf_text(path)
  # Strip each page and drop the ones that come back empty, all while
  # the file handle is still open.
  page_texts = path.open('rb') do |io|
    ::PDF::Reader.new(io).pages.filter_map do |page|
      stripped = page.text.strip
      stripped unless stripped.empty?
    end
  end

  page_texts.join("\n\n")
rescue ::PDF::Reader::MalformedPDFError,
       ::PDF::Reader::UnsupportedFeatureError,
       ::PDF::Reader::InvalidPageError => e
  # Re-raise as a RuntimeError naming the path and the short class name.
  reason = e.class.name.split('::').last
  raise "Cannot extract PDF text from #{path}: #{reason}: #{e.message}"
end
|
|
270
|
+
private_class_method :read_pdf_text
|
|
271
|
+
|
|
272
|
+
# Extract +path+ as text and return a windowed {Page}: the lines
|
|
273
|
+
# from +offset+ (1-indexed) up to +limit+ of them, stopping early
|
|
274
|
+
# if +max_bytes+ is reached, with over-long lines truncated at
|
|
275
|
+
# +max_line_length+. Lazy by design — a text file is streamed
|
|
276
|
+
# line-by-line and a PDF is parsed page-by-page only until the
|
|
277
|
+
# window fills, so reading the first page of a 500-page PDF parses
|
|
278
|
+
# a handful of pages, not all of them.
|
|
279
|
+
#
|
|
280
|
+
# Same routing and refusal contract as {.read_as_text}: PDFs are
|
|
281
|
+
# extracted (with +"--- Page N ---"+ marker lines, unlike
|
|
282
|
+
# {.read_as_text}'s marker-free join — paging is a display path,
|
|
283
|
+
# the marker-free form stays the indexing path); images, binaries,
|
|
284
|
+
# directories, missing files, and malformed PDFs all raise rather
|
|
285
|
+
# than returning a sentinel. The LLM-facing callers map those into
|
|
286
|
+
# +"Error: ..."+ observations themselves.
|
|
287
|
+
#
|
|
288
|
+
# @param path [Pathname] file to read.
|
|
289
|
+
# @param offset [Integer] 1-indexed first line to include. The
|
|
290
|
+
# caller is responsible for validating +offset >= 1+.
|
|
291
|
+
# @param limit [Integer] maximum lines to collect. Caller
|
|
292
|
+
# validates +limit >= 1+.
|
|
293
|
+
# @param max_bytes [Integer] hard byte cap on collected content.
|
|
294
|
+
# @param max_line_length [Integer] per-line truncation threshold.
|
|
295
|
+
# @return [Page] the windowed slice.
|
|
296
|
+
# @raise [ArgumentError] if +path+ isn't a +Pathname+, is a
|
|
297
|
+
# directory, an image, or binary.
|
|
298
|
+
# @raise [Errno::ENOENT] if +path+ doesn't exist.
|
|
299
|
+
# @raise [RuntimeError] on a malformed / unsupported PDF.
|
|
300
|
+
def read_as_text_paged(path, offset: 1, limit: PAGE_DEFAULT_LIMIT,
                       max_bytes: PAGE_MAX_BYTES, max_line_length: PAGE_MAX_LINE_LENGTH)
  raise ArgumentError, "expected Pathname, got #{path.class}" unless path.is_a?(Pathname)
  raise Errno::ENOENT, path.to_s unless path.exist?
  raise ArgumentError, "#{path} is a directory" if path.directory?

  # The same windowing options are forwarded to whichever extractor is chosen.
  window = { offset: offset, limit: limit, max_bytes: max_bytes, max_line_length: max_line_length }

  mime = detect_mime(path)
  case mime
  when 'application/pdf'
    paged_pdf(path, **window)
  when %r{\Aimage/}
    raise ArgumentError, "#{path} is an image (#{mime}); cannot extract as text"
  else
    raise ArgumentError, "#{path} appears to be binary; cannot extract as text" if binary?(path)

    paged_text(path, **window)
  end
end
|
|
317
|
+
|
|
318
|
+
# Stream a text file line-by-line into a {Page}. Keeps counting
|
|
319
|
+
# lines past the collection window so +total_lines+ can report the
|
|
320
|
+
# real total when the line limit (not the byte cap) stopped
|
|
321
|
+
# collection; on the byte cap it breaks and leaves +total_lines+
|
|
322
|
+
# +nil+ (the rest of the file is never read).
|
|
323
|
+
#
|
|
324
|
+
# @return [Page] +kind: :text+.
|
|
325
|
+
def paged_text(path, offset:, limit:, max_bytes:, max_line_length:)
  skip = offset - 1 # number of leading lines that precede the window
  window = []
  seen = 0
  used = 0
  capped = false
  remaining = false

  path.each_line do |raw|
    seen += 1
    next if seen <= skip

    # Window already full: keep iterating only to finish the line count.
    if window.length >= limit
      remaining = true
      next
    end

    line = truncate_line(raw.chomp, max_line_length)
    cost = line.bytesize + 1 # +1 for the joining newline
    if used + cost > max_bytes
      # Byte cap: stop reading, so the total line count stays unknown.
      capped = true
      remaining = true
      break
    end

    window << line
    used += cost
  end

  Page.new(lines: window, start_line: offset,
           total_lines: capped ? nil : seen,
           more: remaining, byte_capped: capped, kind: :text)
end
|
|
357
|
+
private_class_method :paged_text
|
|
358
|
+
|
|
359
|
+
# PDF counterpart to {paged_text}: walk +pdf-reader+'s lazy page
|
|
360
|
+
# iterator, emitting a +"--- Page N ---"+ header line then each
|
|
361
|
+
# line of the page's text, applying the same offset / limit /
|
|
362
|
+
# byte-cap contract. The +throw :done+ short-circuits both loops
|
|
363
|
+
# the moment the window fills, so parsing stops — which is why a
|
|
364
|
+
# PDF that stops early can't report +total_lines+ (it would have
|
|
365
|
+
# to parse every page to count).
|
|
366
|
+
#
|
|
367
|
+
# @return [Page] +kind: :pdf+.
|
|
368
|
+
# @raise [RuntimeError] on a malformed / unsupported PDF.
|
|
369
|
+
def paged_pdf(path, offset:, limit:, max_bytes:, max_line_length:)
  skip = offset - 1 # number of leading lines that precede the window
  window = []
  seen = 0
  used = 0
  capped = false
  remaining = false

  # :done is thrown from the innermost loop as soon as the window is
  # full or the byte cap fires, abandoning the remaining pages.
  catch(:done) do
    path.open('rb') do |io|
      ::PDF::Reader.new(io).pages.each.with_index(1) do |page, number|
        body = page.text.strip
        next if body.empty?

        # Each contributing page is a marker line followed by its text lines.
        ["--- Page #{number} ---", *body.split("\n")].each do |raw|
          seen += 1
          next if seen <= skip

          if window.length >= limit
            remaining = true
            throw :done
          end

          line = truncate_line(raw, max_line_length)
          cost = line.bytesize + 1
          if used + cost > max_bytes
            capped = true
            remaining = true
            throw :done
          end

          window << line
          used += cost
        end
      end
    end
  end

  # The total is only reported when every page was walked to the end.
  Page.new(lines: window, start_line: offset,
           total_lines: remaining ? nil : seen,
           more: remaining, byte_capped: capped, kind: :pdf)
rescue ::PDF::Reader::MalformedPDFError,
       ::PDF::Reader::InvalidPageError,
       ::PDF::Reader::UnsupportedFeatureError => e
  reason = e.class.name.split('::').last
  raise "Cannot extract PDF text from #{path}: #{reason}: #{e.message}"
end
|
|
416
|
+
private_class_method :paged_pdf
|
|
417
|
+
|
|
418
|
+
# Truncate +line+ to +max_line_length+ chars, appending
|
|
419
|
+
# {PAGE_LINE_TRUNCATION_MARKER} when it overflows.
|
|
420
|
+
#
|
|
421
|
+
# @param line [String]
|
|
422
|
+
# @param max_line_length [Integer]
|
|
423
|
+
# @return [String]
|
|
424
|
+
def truncate_line(line, max_line_length)
  # Lines within the limit are handed back untouched (same object).
  overflowing = line.length > max_line_length
  overflowing ? line[0, max_line_length] + PAGE_LINE_TRUNCATION_MARKER : line
end
|
|
429
|
+
private_class_method :truncate_line
|
|
430
|
+
|
|
431
|
+
# Coerce an +input+ argument into a binary-encoded bytes String for
# the sniffs.
#
# * +String+ inputs are treated as a sample the caller already took
#   and are returned as an ASCII-8BIT copy of the same bytes
#   (+String#b+; the caller's string is left untouched). The
#   normalisation matters: {.detect_mime} compares the sample against
#   binary magic prefixes with +start_with?+, and Ruby raises
#   +Encoding::CompatibilityError+ when a UTF-8 sample containing any
#   non-ASCII byte is compared against a binary prefix containing
#   high bytes (the PNG and JPEG signatures).
# * +Pathname+ inputs are opened in binary mode and up to
#   {SAMPLE_BYTES} are read off the front. Empty files come back as
#   an empty String — {.binary?} treats that as not-binary and
#   {.detect_mime} returns +nil+ for it, which is what the
#   empty-text path wants.
#
# @param input [String, Pathname]
# @return [String] raw bytes, always ASCII-8BIT encoded
# @raise [ArgumentError] if +input+ is neither a +String+ nor a
#   +Pathname+ — refuses to guess, since a bare String could be
#   either a path or actual bytes.
def sample_of(input)
  case input
  when String
    input.b
  when Pathname
    # IO#read(n) returns nil at EOF, i.e. for an empty file.
    input.open('rb') { |io| io.read(SAMPLE_BYTES) || ''.b }
  else
    raise ArgumentError, "expected String bytes or Pathname, got #{input.class}"
  end
end
|
|
455
|
+
private_class_method :sample_of
|
|
456
|
+
end
|
|
457
|
+
end
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
|
|
4
|
+
# Process-global teardown registry: one +at_exit+ for the whole
|
|
5
|
+
# process, with everything that owns a resource needing orderly
|
|
6
|
+
# shutdown (agents, {VectorDb::Server::Chroma}, future background
|
|
7
|
+
# workers) registering here instead of growing its own +at_exit+.
|
|
8
|
+
# It is {Agent#on_close} promoted from per-agent to per-process —
|
|
9
|
+
# the same LIFO + per-handler-rescue + idempotent shape, one level
|
|
10
|
+
# up.
|
|
11
|
+
#
|
|
12
|
+
# == Why one chokepoint
|
|
13
|
+
#
|
|
14
|
+
# Independent +at_exit+ hooks fire in an order decided by file load
|
|
15
|
+
# order, which is invisible and fragile. Routing every teardown
|
|
16
|
+
# through one registry makes the order explicit and controllable:
|
|
17
|
+
# the SIGTERM-the-strays backstop ({Subprocess.cleanup!}) registers
|
|
18
|
+
# at *load* time, so it sits at the bottom of the LIFO stack and runs
|
|
19
|
+
# *last* — after agents and servers (which register at *construction*
|
|
20
|
+
# time) have closed gracefully, while the subprocess machinery they
|
|
21
|
+
# shell out to during close (e.g. {VectorDb::Server::Chroma#close}'s
|
|
22
|
+
# +docker stop+) is still live.
|
|
23
|
+
#
|
|
24
|
+
# == Contract
|
|
25
|
+
#
|
|
26
|
+
# A registrant MUST respond to +#close+, and +#close+ MUST be
|
|
27
|
+
# idempotent and tolerant of running at process exit — the host may
|
|
28
|
+
# also have closed it explicitly earlier. Pass a block instead for
|
|
29
|
+
# teardown that has no natural +#close+ (e.g.
|
|
30
|
+
# +Finalizers.register { Pikuri::Subprocess.cleanup! }+).
|
|
31
|
+
#
|
|
32
|
+
# == Order: LIFO
|
|
33
|
+
#
|
|
34
|
+
# Last registered, first closed — Ruby +ensure+ semantics. A
|
|
35
|
+
# registrant that depends on an earlier one (a background indexer
|
|
36
|
+
# writing into {VectorDb::Server::Chroma}) is registered later and so
|
|
37
|
+
# tears down first. Registration order is therefore dependency order;
|
|
38
|
+
# register the dependency before its dependents.
|
|
39
|
+
#
|
|
40
|
+
# == Errors are contained
|
|
41
|
+
#
|
|
42
|
+
# Each +#close+ runs inside its own +rescue+: a raise is logged via
|
|
43
|
+
# {Pikuri.logger_for} and the sweep continues, so one botched
|
|
44
|
+
# teardown can't strand the rest. {.run!} drains the registry, so a
|
|
45
|
+
# second call (an explicit one, then the +at_exit+) closes nothing.
|
|
46
|
+
module Finalizers
|
|
47
|
+
# @return [Logger] subsystem logger for contained teardown failures.
|
|
48
|
+
LOGGER = Pikuri.logger_for('Finalizers')
|
|
49
|
+
|
|
50
|
+
# Adapts a teardown block to the +#close+ protocol, so a block and a
|
|
51
|
+
# closeable object can share one registry.
|
|
52
|
+
Closer = Struct.new(:block) do
  # Run the wrapped teardown block.
  #
  # @return [void]
  def close
    block.()
  end
end
|
|
58
|
+
|
|
59
|
+
@registered = []
|
|
60
|
+
@mutex = Mutex.new
|
|
61
|
+
|
|
62
|
+
class << self
|
|
63
|
+
# Register a closeable (or a block) to be torn down at process
|
|
64
|
+
# exit. Returns the registered handle so the caller can later
|
|
65
|
+
# {.unregister} it — a resource closed explicitly before exit
|
|
66
|
+
# should drop out so it can be garbage-collected rather than
|
|
67
|
+
# pinned alive until the process dies.
|
|
68
|
+
#
|
|
69
|
+
# @param closeable [#close, nil] resource to close at exit; omit
|
|
70
|
+
# when passing a block
|
|
71
|
+
# @yield teardown to run at exit, for resources with no +#close+
|
|
72
|
+
# @return [#close] the registered handle — the object itself, or
|
|
73
|
+
# the {Closer} wrapping the block; pass it to {.unregister}
|
|
74
|
+
# @raise [ArgumentError] if neither an object nor a block is given
|
|
75
|
+
def register(closeable = nil, &block)
  # An explicit object wins; otherwise a given block is wrapped in a Closer.
  handle = closeable || (block && Closer.new(block))
  raise ArgumentError, 'Finalizers.register requires an object or a block' unless handle

  @mutex.synchronize { @registered.push(handle) }
  handle
end
|
|
84
|
+
|
|
85
|
+
# Drop a previously-registered handle. Idempotent — unregistering
# something already gone (or never registered) is a no-op.
#
# Matching is by object identity (+equal?+), not +==+. {Closer} is a
# Struct, so two handles wrapping the same proc compare equal with
# +==+; an equality-based delete would drop both registrations when
# only one of them was unregistered.
#
# @param handle [#close] the value returned by {.register}
# @return [void]
def unregister(handle)
  @mutex.synchronize { @registered.delete_if { |entry| entry.equal?(handle) } }
  nil
end
|
|
94
|
+
|
|
95
|
+
# Close every registrant in LIFO order, each guarded by its own
|
|
96
|
+
# +rescue+. Wired to +at_exit+ at the bottom of this file.
|
|
97
|
+
# Draining the registry under the lock makes a repeat call a
|
|
98
|
+
# no-op and keeps it safe against a concurrent caller.
|
|
99
|
+
#
|
|
100
|
+
# @return [void]
|
|
101
|
+
def run!
  # Snapshot in reverse (LIFO) order and empty the registry under the lock.
  pending = @mutex.synchronize { @registered.reverse.tap { @registered.clear } }

  pending.each do |closeable|
    begin
      closeable.close
    rescue StandardError => error
      # Log and carry on with the remaining handles.
      LOGGER.warn("finalizer #{closeable.class} raised #{error.class}: #{error.message}")
    end
  end
end
|
|
114
|
+
end
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
at_exit { Pikuri::Finalizers.run! }
|
data/lib/pikuri/paths.rb
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'pathname'
|
|
4
|
+
|
|
5
|
+
module Pikuri
|
|
6
|
+
# Standardized on-disk locations for pikuri's local state. Centralizes
|
|
7
|
+
# the XDG resolution so every component that caches to disk roots under
|
|
8
|
+
# one place instead of each re-deriving it — currently
|
|
9
|
+
# {Pikuri::VectorDb::Server::Chroma} (its corpus volume) and
|
|
10
|
+
# {Pikuri::Memory::Mem0Server} (its mem0 checkout + data volume).
|
|
11
|
+
module Paths
|
|
12
|
+
# Pikuri's cache root: +$XDG_CACHE_HOME/pikuri+ when +XDG_CACHE_HOME+
|
|
13
|
+
# is set and non-empty, else +~/.cache/pikuri+.
|
|
14
|
+
#
|
|
15
|
+
# A method, not a frozen constant, on purpose: a constant would
|
|
16
|
+
# snapshot +XDG_CACHE_HOME+ at +require+ time, which breaks
|
|
17
|
+
# env-stubbing in tests and ignores a runtime change in a long-lived
|
|
18
|
+
# process. The directory is *not* created — callers +mkdir_p+ the
|
|
19
|
+
# subdirectory they need (e.g. +Paths.cache.join('chroma')+,
|
|
20
|
+
# +Paths.cache.join('mem0')+).
|
|
21
|
+
#
|
|
22
|
+
# @return [Pathname] the +<cache home>/pikuri+ directory
|
|
23
|
+
def self.cache
  # Read the environment on every call; unset and empty both fall back to ~/.cache.
  configured = ENV['XDG_CACHE_HOME'].to_s
  base = configured.empty? ? File.expand_path('~/.cache') : configured
  Pathname.new(base).join('pikuri')
end
|
|
28
|
+
end
|
|
29
|
+
end
|