pikuri-core 0.0.5 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -3
- data/lib/pikuri/agent/chat_transport.rb +135 -11
- data/lib/pikuri/agent/configurator.rb +4 -4
- data/lib/pikuri/agent/context_window_detector.rb +103 -52
- data/lib/pikuri/agent/control/step_limit.rb +39 -7
- data/lib/pikuri/agent/event.rb +43 -16
- data/lib/pikuri/agent/extension.rb +31 -17
- data/lib/pikuri/agent/extension_context.rb +147 -0
- data/lib/pikuri/agent/listener/terminal.rb +30 -37
- data/lib/pikuri/agent/listener/token_log.rb +60 -13
- data/lib/pikuri/agent/listener.rb +12 -5
- data/lib/pikuri/agent/listener_list.rb +7 -17
- data/lib/pikuri/agent/synthesizer.rb +93 -67
- data/lib/pikuri/agent.rb +358 -403
- data/lib/pikuri/extractor/html.rb +303 -0
- data/lib/pikuri/extractor/passthrough.rb +64 -0
- data/lib/pikuri/extractor.rb +314 -0
- data/lib/pikuri/file_type.rb +74 -266
- data/lib/pikuri/sanitizer.rb +179 -0
- data/lib/pikuri/subprocess.rb +73 -2
- data/lib/pikuri/tool/calculator.rb +213 -41
- data/lib/pikuri/tool/fetch.rb +10 -9
- data/lib/pikuri/tool/parameters.rb +65 -2
- data/lib/pikuri/tool/scraper.rb +186 -0
- data/lib/pikuri/tool/search/brave.rb +32 -18
- data/lib/pikuri/tool/search/duckduckgo.rb +18 -7
- data/lib/pikuri/tool/search/engines.rb +72 -49
- data/lib/pikuri/tool/search/exa.rb +34 -22
- data/lib/pikuri/tool/web_scrape.rb +5 -5
- data/lib/pikuri/tool/web_search.rb +45 -26
- data/lib/pikuri/version.rb +1 -1
- data/lib/pikuri-core.rb +11 -10
- metadata +9 -66
- data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
- data/lib/pikuri/tool/scraper/html.rb +0 -285
- data/lib/pikuri/tool/scraper/pdf.rb +0 -54
- data/lib/pikuri/tool/scraper/simple.rb +0 -183
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'nokogiri'
|
|
5
|
+
require 'readability'
|
|
6
|
+
require 'reverse_markdown'
|
|
7
|
+
|
|
8
|
+
module Pikuri
|
|
9
|
+
module Extractor
|
|
10
|
+
# HTML → Markdown extractor.
|
|
11
|
+
#
|
|
12
|
+
# Matched by content-type only (+text/html+ /
|
|
13
|
+
# +application/xhtml+xml+) — deliberately no byte sniff. The web
|
|
14
|
+
# path always has the header; for local files a sniff would route
|
|
15
|
+
# +Workspace::Read+ of an +.html+ source file through readability
|
|
16
|
+
# extraction, when a developer reading an HTML file wants the
|
|
17
|
+
# source. Local HTML stays on the {Passthrough} arm until a
|
|
18
|
+
# consumer genuinely needs otherwise.
|
|
19
|
+
#
|
|
20
|
+
# Always renders both views of the page when available:
|
|
21
|
+
#
|
|
22
|
+
# 1. JSON-LD section. Any +<script type="application/ld+json">+ node
|
|
23
|
+
# whose +@type+ matches a substantive schema.org content type
|
|
24
|
+
# (Product, Article, Recipe, ...) is rendered as a header — title,
|
|
25
|
+
# metadata bullets (brand, SKU, price, rating, author, published),
|
|
26
|
+
# and the +articleBody+/+description+ copy when present.
|
|
27
|
+
# 2. Readability section. The page is run through +Readability+ +
|
|
28
|
+
# +reverse_markdown+, with a +<main>+/+<article>+ fallback for
|
|
29
|
+
# pages whose content sits mostly outside +<p>+ tags.
|
|
30
|
+
#
|
|
31
|
+
# Concatenated with a horizontal rule, so the LLM gets both the
|
|
32
|
+
# structured metadata and the rendered body and can pick whichever
|
|
33
|
+
# is more useful for the task. Trades some duplication (when a
|
|
34
|
+
# publisher embeds the article body in JSON-LD AND in HTML) for
|
|
35
|
+
# fewer type-based heuristics on which branch should win — the
|
|
36
|
+
# earlier "is this Article's +description+ a teaser or the real
|
|
37
|
+
# body?" carve-out is no longer needed because both end up in
|
|
38
|
+
# the output regardless.
|
|
39
|
+
module HTML
|
|
40
|
+
# @return [Array<String>] content-types this extractor claims.
|
|
41
|
+
CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
|
|
42
|
+
|
|
43
|
+
# @return [Array<String>] schema.org +@type+ values that we treat
|
|
44
|
+
# as "the primary entity of this page" when picking a JSON-LD
|
|
45
|
+
# node to render. Order does not matter — the first matching
|
|
46
|
+
# node wins. Skips noise nodes (Organization, BreadcrumbList,
|
|
47
|
+
# WebSite, ...) that ship on most pages but carry no page
|
|
48
|
+
# content.
|
|
49
|
+
INTERESTING_TYPES = %w[
|
|
50
|
+
Product Article NewsArticle BlogPosting Recipe Event Book Movie
|
|
51
|
+
].freeze
|
|
52
|
+
|
|
53
|
+
# @return [Array<String>] HTML tags preserved by the readability
|
|
54
|
+
# pass. Anything outside this list is stripped before Markdown
|
|
55
|
+
# conversion.
|
|
56
|
+
READABILITY_TAGS = %w[
|
|
57
|
+
h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
|
|
58
|
+
strong em b i br hr table thead tbody tr td th
|
|
59
|
+
].freeze
|
|
60
|
+
|
|
61
|
+
# @return [Array<String>] HTML attributes preserved by the
|
|
62
|
+
# readability pass; everything else (class, id, style, data-*)
|
|
63
|
+
# is dropped before Markdown conversion
|
|
64
|
+
READABILITY_ATTRS = %w[href src alt title].freeze
|
|
65
|
+
|
|
66
|
+
# @return [Float] minimum +<main>+/+<article>+ to Readability
|
|
67
|
+
# text-length ratio that triggers the semantic-container
|
|
68
|
+
# fallback in {.readability_to_markdown}. Picked low enough to
|
|
69
|
+
# catch the failure mode (Readability collapsing a page that
|
|
70
|
+
# uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
|
|
71
|
+
# ~5x) but high enough that pages where both produce
|
|
72
|
+
# comparable output keep Readability's noise filtering.
|
|
73
|
+
MAIN_FALLBACK_RATIO = 2.0
|
|
74
|
+
|
|
75
|
+
# @return [Integer] minimum text length the
|
|
76
|
+
# +<main>+/+<article>+ container must hold before the fallback
|
|
77
|
+
# in {.readability_to_markdown} can fire. Below this, the
|
|
78
|
+
# ratio comparison is dominated by noise and we'd swap on
|
|
79
|
+
# tiny pages where Readability is doing the right thing.
|
|
80
|
+
MAIN_FALLBACK_MIN_CHARS = 500
|
|
81
|
+
|
|
82
|
+
# Short format tag for content handled by this extractor.
#
# @return [Symbol] always +:html+; this is the {Page#kind} tag.
def self.kind = :html
|
|
86
|
+
|
|
87
|
+
# Decide whether this extractor claims the content. The decision is made
# from the declared content-type alone; the byte sample is accepted for
# interface compatibility but never inspected (no sniffing for HTML).
#
# @param sample [String] leading bytes of the content (unused).
# @param content_type [String, nil] normalized content-type.
# @return [Boolean] +true+ when +content_type+ is one of {CONTENT_TYPES}.
def self.matches?(sample:, content_type:)
  CONTENT_TYPES.any? { |claimed| claimed == content_type }
end
|
|
94
|
+
|
|
95
|
+
# Convert the HTML document read from +io+ into Markdown.
#
# Two renderings are produced from the same markup: the JSON-LD section
# (present only when an interesting node exists) and the readability /
# +<main>+ section. Whichever of them are non-blank are joined with a
# horizontal rule. A page without JSON-LD therefore yields only the
# readability output, and a page with no extractable body yields only the
# JSON-LD render.
#
# @param io [IO, StringIO] IO over the HTML document.
# @return [String] Markdown representation
def self.extract(io)
  html = io.read
  rendered = [jsonld_section(html), readability_to_markdown(html)]
  # Keep only sections that carry visible text.
  kept = rendered.select { |section| section && !section.strip.empty? }
  kept.join("\n\n---\n\n")
end
|
|
110
|
+
|
|
111
|
+
# Render the first JSON-LD node whose +@type+ (a single value or a list)
# names one of {INTERESTING_TYPES}. Returns +nil+ when the page carries no
# such node, in which case {.extract} emits only the readability section.
#
# The node is not required to carry body copy: one holding just
# +name+/+author+/+datePublished+ still renders as a metadata-only header,
# since the readability pass produces the page body independently. Some
# duplication between the two views is the accepted price for not needing
# type-based "teaser or article copy?" heuristics.
#
# @param html [String] HTML document body
# @return [String, nil] Markdown render of the picked JSON-LD node, or
#   +nil+ when nothing matched
def self.jsonld_section(html)
  parse_jsonld(html).each do |candidate|
    declared = Array(candidate['@type'])
    next unless declared.any? { |type| INTERESTING_TYPES.include?(type) }

    return jsonld_to_markdown(candidate)
  end
  nil
end
|
|
133
|
+
|
|
134
|
+
# Collect every JSON-LD object embedded in +html+ as one flat array of
# schema.org nodes, expanding +@graph+ wrappers one level so callers never
# see the wrapper itself.
#
# Two kinds of bad payload are skipped silently, because sites frequently
# ship broken JSON-LD and one usable block is all we need:
#
# * blocks that are not valid JSON at all;
# * values that parse but are not JSON objects — a bare number, boolean or
#   string, a nested array, or a non-object entry inside +@graph+. Indexing
#   such a value with a string key would raise (or yield a non-Hash node),
#   so only Hash nodes are kept.
#
# @param html [String] HTML document body
# @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
def self.parse_jsonld(html)
  doc = Nokogiri::HTML(html)
  blobs = doc.css('script[type="application/ld+json"]').map(&:text)

  blobs.flat_map do |raw|
    parsed = begin
      JSON.parse(raw)
    rescue JSON::ParserError
      nil
    end

    # A block may hold one object or an array of them; anything that is
    # not an object (including the +nil+ from a failed parse) is dropped.
    top_level = parsed.is_a?(Array) ? parsed : [parsed]
    top_level.grep(Hash).flat_map do |node|
      graph = node['@graph']
      graph.is_a?(Array) ? graph.grep(Hash) : [node]
    end
  end
end
|
|
158
|
+
|
|
159
|
+
# Render a single JSON-LD +node+ as Markdown, in this order:
#
# 1. a top-level heading taken from +name+, falling back to +headline+;
# 2. a bullet list of whichever common fields are present (brand, SKU,
#    GTIN, price, availability, rating, review count, author, published
#    date) — blank values are omitted;
# 3. the body copy: +articleBody+ (the full publisher-supplied text) wins
#    over +description+, which is typically a lede teaser that would only
#    repeat the article's opening lines;
# 4. the lead image as a Markdown image, using the title as alt text.
#    +image+ may be a URL string, an object with a +url+, or a list of
#    either (the first entry is used).
#
# @param node [Hash] JSON-LD node, typically picked by {.jsonld_section}
# @return [String] Markdown representation
def self.jsonld_to_markdown(node)
  out = +''
  name = node['name'] || node['headline']
  out << "# #{name}\n\n" if name

  offer = first_obj(node['offers'])
  rating = first_obj(node['aggregateRating'])
  brand = first_obj_or_string(node['brand'])
  author = first_obj_or_string(node['author'])

  # brand / author may be a bare string or an object carrying a name.
  brand_name = brand.is_a?(Hash) ? brand['name'] : brand
  author_name = author.is_a?(Hash) ? author['name'] : author

  fields = {
    'Brand' => brand_name,
    'SKU' => node['sku'],
    'GTIN' => node['gtin13'] || node['gtin'],
    'Price' => [offer['price'], offer['priceCurrency']].compact.join(' '),
    'Availability' => offer['availability'],
    'Rating' => rating['ratingValue'],
    'Reviews' => rating['reviewCount'],
    'Author' => author_name,
    'Published' => node['datePublished']
  }.reject { |_, v| v.nil? || v.to_s.strip.empty? }

  unless fields.empty?
    fields.each { |k, v| out << "- **#{k}:** #{v}\n" }
    out << "\n"
  end

  if (body = node['articleBody'] || node['description'])
    out << "#{body}\n\n"
  end

  if (img = node['image'])
    img = img.first if img.is_a?(Array)
    img = img['url'] if img.is_a?(Hash)
    # Emit the image itself; previously only blank lines were appended, so
    # the URL never reached the output.
    out << "![#{name}](#{img})\n\n" if img && !img.to_s.strip.empty?
  end

  out
end
|
|
214
|
+
|
|
215
|
+
# Isolate the main content of +html+ with +Readability+ and convert it to
# Markdown with +reverse_markdown+, prefixed by the page +<title>+ as a
# top-level heading when the title is non-blank.
#
# Pages built on semantic HTML5 (+<main>+ or +<article>+) that keep most
# of their content outside +<p>+ tags — divs, lists, spans — defeat
# Readability's paragraph-density scoring, which then returns only a
# sliver of the page. For those, the +<main>+/+<article>+ container is
# rendered directly instead. That swap happens only when the container
# holds substantially more text than Readability found (see
# {MAIN_FALLBACK_RATIO} / {MAIN_FALLBACK_MIN_CHARS}); when the two agree,
# Readability's output is kept so its nav/ads filtering still applies.
#
# @param html [String] HTML document body
# @return [String] Markdown representation
def self.readability_to_markdown(html)
  options = {
    tags: READABILITY_TAGS,
    attributes: READABILITY_ATTRS,
    remove_empty_nodes: true
  }
  rdoc = Readability::Document.new(html, **options)
  extracted_html = rdoc.content
  page_title = rdoc.title

  # Prefer the semantic container only when the fallback says so.
  chosen_html = main_fallback_html(html, extracted_html) || extracted_html
  markdown = ReverseMarkdown.convert(chosen_html, unknown_tags: :bypass, github_flavored: true)

  heading = page_title&.strip
  prefix = heading.nil? || heading.empty? ? '' : "# #{heading}\n\n"
  prefix + markdown
end
|
|
249
|
+
|
|
250
|
+
# Decide whether the page's semantic container should replace
# Readability's extraction, and return its HTML if so.
#
# The first +<main>+ element is considered, else the first +<article>+.
# Text lengths are compared after collapsing whitespace runs. +nil+ is
# returned (the fallback does not fire) when:
#
# * the page has no +<main>+/+<article>+ element;
# * the container holds fewer than {MAIN_FALLBACK_MIN_CHARS} characters;
# * the container holds less than {MAIN_FALLBACK_RATIO} times the text
#   Readability extracted, i.e. Readability's output is comparable.
#
# @param html [String] full HTML document body, used to locate the
#   +<main>+/+<article>+ container
# @param readability_html [String] HTML produced by
#   +Readability::Document#content+, used as the comparison baseline
# @return [String, nil] container HTML when the fallback should fire,
#   +nil+ otherwise
def self.main_fallback_html(html, readability_html)
  visible_length = ->(node) { node.text.gsub(/\s+/, ' ').strip.length }

  page = Nokogiri::HTML(html)
  semantic = page.at_css('main')
  semantic ||= page.at_css('article')
  return unless semantic

  semantic_len = visible_length.call(semantic)
  return if semantic_len < MAIN_FALLBACK_MIN_CHARS

  # Only parse Readability's output once the cheap checks have passed.
  baseline_len = visible_length.call(Nokogiri::HTML(readability_html))
  semantic_len < MAIN_FALLBACK_RATIO * baseline_len ? nil : semantic.to_html
end
|
|
277
|
+
private_class_method :main_fallback_html
|
|
278
|
+
|
|
279
|
+
# Reduce a JSON-LD field to a single Hash so callers can index into it
# without type checks. Such fields may hold a string, an object, or a list
# of either; for a list only its first entry is considered.
#
# @param value [Object] raw JSON-LD field value
# @return [Hash] the object (or the list's first entry) when it is a Hash;
#   an empty hash otherwise
def self.first_obj(value)
  candidate = value.is_a?(Array) ? value.first : value
  return candidate if candidate.is_a?(Hash)

  {}
end
|
|
289
|
+
private_class_method :first_obj
|
|
290
|
+
|
|
291
|
+
# Like {.first_obj}, but without coercing to a Hash: a list is reduced to
# its first entry and anything else is returned untouched, so a bare
# string such as +brand: "Apple"+ survives instead of becoming +{}+.
#
# @param value [Object] raw JSON-LD field value
# @return [String, Hash, nil]
def self.first_obj_or_string(value)
  value.is_a?(Array) ? value.first : value
end
|
|
300
|
+
private_class_method :first_obj_or_string
|
|
301
|
+
end
|
|
302
|
+
end
|
|
303
|
+
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
|
|
4
|
+
module Extractor
|
|
5
|
+
# The terminal plain-text arm of the registry: content that *is*
|
|
6
|
+
# already text needs no extraction, so it passes through verbatim
|
|
7
|
+
# (forced to UTF-8 — invalid bytes are left in for downstream to
|
|
8
|
+
# deal with, matching what +File.read+ with a UTF-8 encoding does).
|
|
9
|
+
# Markdown, source files, JSON, robots.txt all land here.
|
|
10
|
+
#
|
|
11
|
+
# Matching is split by whether the transport supplied a
|
|
12
|
+
# content-type:
|
|
13
|
+
#
|
|
14
|
+
# * With a content-type (the web path): claim +text/*+ only.
|
|
15
|
+
# A non-text type that no earlier extractor claimed is *not*
|
|
16
|
+
# second-guessed by sniffing — a server declaring
|
|
17
|
+
# +application/octet-stream+ gets the {Unsupported} refusal the
|
|
18
|
+
# LLM can react to, same as before this registry existed.
|
|
19
|
+
# * Without one (the local-file path, where {FileType.detect_mime}
|
|
20
|
+
# returned +nil+ for "unrecognised"): claim anything that passes
|
|
21
|
+
# the {FileType.binary?} heuristic on the sample. Opaque
|
|
22
|
+
# binaries stay unclaimed and surface as {Unsupported}.
|
|
23
|
+
module Passthrough
|
|
24
|
+
# Short format tag for content handled by this extractor.
#
# @return [Symbol] always +:text+; this is the {Page#kind} tag.
def self.kind = :text
|
|
28
|
+
|
|
29
|
+
# Decide whether the content is plain text this extractor should claim.
#
# When a content-type is supplied it is authoritative: only +text/*+ is
# claimed and the sample is not consulted. Without one, the claim rests on
# the {FileType.binary?} heuristic applied to the sample.
#
# @param sample [String] leading bytes of the content.
# @param content_type [String, nil] normalized content-type, +nil+ when
#   the transport has none.
# @return [Boolean]
def self.matches?(sample:, content_type:)
  if content_type.nil?
    !FileType.binary?(sample)
  else
    content_type.start_with?('text/')
  end
end
|
|
38
|
+
|
|
39
|
+
# Read everything from +io+ and return it untouched apart from being
# tagged as UTF-8 (the bytes are relabelled, not transcoded).
#
# This is intentionally not built from {.extract_lines}: a passthrough has
# to stay verbatim — trailing newline and CRLF line endings included — and
# joining chomped lines would silently normalize those away.
#
# @param io [IO, StringIO] IO over the text content.
# @return [String] the content, tagged UTF-8.
def self.extract(io)
  raw = io.read
  raw.force_encoding(Encoding::UTF_8)
end
|
|
47
|
+
|
|
48
|
+
# Lazy line stream used by {Extractor.extract_paged}. Lines are pulled
# from +io+ one at a time, so taking a window from the head of a gigabyte
# log never loads the remainder. Draining the whole stream is a cheap
# sequential read, which is why the paging window counts this stream's
# tail to report an exact +total_lines+ (see {Extractor.extract_paged}).
#
# @param io [IO, StringIO] IO over the text content; must remain open
#   while the enumerator is consumed.
# @return [Enumerator::Lazy<String>] chomped lines, tagged UTF-8.
def self.extract_lines(io)
  utf8 = Encoding::UTF_8
  raw_lines = io.each_line
  raw_lines.lazy.map { |line| line.chomp.force_encoding(utf8) }
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
|
|
4
|
+
# The format→text extraction seam: one registry of extractors that
|
|
5
|
+
# turn an +IO+ of some recognised format (HTML and plain text out
|
|
6
|
+
# of the box; PDF / office formats via the pikuri-pdf /
|
|
7
|
+
# pikuri-extractors plug-in gems) into Markdown-flavoured UTF-8
|
|
8
|
+
# text, consumed through two front doors:
|
|
9
|
+
#
|
|
10
|
+
# * {.extract} — the whole document as one String. The shape the
|
|
11
|
+
# indexing / caching callers want ({Pikuri::VectorDb}'s indexer,
|
|
12
|
+
# {Tool::WebScrape}'s URL cache): no windowing, no presentation.
|
|
13
|
+
# * {.extract_paged} — the LLM-tool shape: the same extraction,
|
|
14
|
+
# windowed to a line range with a byte cap, returned as a {Page}
|
|
15
|
+
# the caller renders. Backs +Workspace::Read+ and
|
|
16
|
+
# +VectorDb::Tools::Read+ (via the {FileType} path wrappers) so
|
|
17
|
+
# the offset/limit/byte-cap logic lives in one tested place.
|
|
18
|
+
#
|
|
19
|
+
# Both front doors — +Tool::Scraper+ dispatching on the HTTP
|
|
20
|
+
# +Content-Type+ header for the web tools, and {FileType} resolving
|
|
21
|
+
# local paths — route through this one registry, so both share one
|
|
22
|
+
# set of format truths and "support a new format" is a registry
|
|
23
|
+
# entry (pikuri-pdf and pikuri-extractors plug PDF and office
|
|
24
|
+
# formats in without pikuri-core knowing), not a new special case
|
|
25
|
+
# in two dispatchers.
|
|
26
|
+
#
|
|
27
|
+
# == The extractor duck type
|
|
28
|
+
#
|
|
29
|
+
# Each {.registry} entry implements three methods:
|
|
30
|
+
#
|
|
31
|
+
# * +matches?(sample:, content_type:)+ → +Boolean+ — claim the
|
|
32
|
+
# content. +sample+ is the leading {FileType::SAMPLE_BYTES} bytes
|
|
33
|
+
# (for magic-byte sniffs); +content_type+ is the normalized HTTP
|
|
34
|
+
# +Content-Type+ for web content, the {FileType.detect_mime}
|
|
35
|
+
# result for local files, and may be +nil+ ("no transport
|
|
36
|
+
# metadata — sniff if you can").
|
|
37
|
+
# * +extract(io)+ → +String+ — the whole document as
|
|
38
|
+
# Markdown-flavoured UTF-8 text. Raises {Error} on content the
|
|
39
|
+
# extractor claimed but cannot parse (malformed PDF, ...).
|
|
40
|
+
# * +kind+ → +Symbol+ — a short tag (+:text+ / +:pdf+ / +:html+)
|
|
41
|
+
# carried on {Page#kind} so rendering callers can word
|
|
42
|
+
# format-specific trailers ("End of PDF", the scanned-image
|
|
43
|
+
# hint) without re-sniffing.
|
|
44
|
+
#
|
|
45
|
+
# plus one *optional* method for formats whose lines can be
|
|
46
|
+
# produced incrementally:
|
|
47
|
+
#
|
|
48
|
+
# * +extract_lines(io)+ → +Enumerator<String>+ — the same content
|
|
49
|
+
# as +extract+, as a lazy stream of already-+chomp+ed lines.
|
|
50
|
+
# {.extract_paged} prefers this when present and stops consuming
|
|
51
|
+
# the moment the window fills, so the rest of the document is
|
|
52
|
+
# never parsed (pikuri-pdf's extractor: pdf-reader's page list
|
|
53
|
+
# parses on access; {Passthrough}: the IO is read line-by-line).
|
|
54
|
+
# The enumerator
|
|
55
|
+
# must be consumed while +io+ is still open, and may raise
|
|
56
|
+
# {Error} mid-iteration. Extractors that need the whole document
|
|
57
|
+
# to produce anything ({HTML}: Readability walks the full DOM —
|
|
58
|
+
# true of any subprocess-based extractor too) simply omit it;
|
|
59
|
+
# {.extract_paged} then extracts in full and windows the result.
|
|
60
|
+
#
|
|
61
|
+
# Windowing itself (offset / limit / byte cap / line truncation) is
|
|
62
|
+
# presentation and deliberately lives once in {.extract_paged}, not
|
|
63
|
+
# per extractor — +extract_lines+ is line *production*, the only
|
|
64
|
+
# genuinely format-specific half of paging.
|
|
65
|
+
#
|
|
66
|
+
# == Errors
|
|
67
|
+
#
|
|
68
|
+
# Both failure modes are failures the *caller's* LLM can react to,
|
|
69
|
+
# so they share one rescuable root:
|
|
70
|
+
#
|
|
71
|
+
# * {Unsupported} — nothing in {.registry} claimed the content
|
|
72
|
+
# (opaque binary, an unhandled content-type).
|
|
73
|
+
# * {Error} (the root) — an extractor claimed the content but the
|
|
74
|
+
# parse failed (malformed PDF, ...).
|
|
75
|
+
#
|
|
76
|
+
# Callers map them to their own conventions:
|
|
77
|
+
# +Tool::Scraper+ re-raises both as +FetchError+;
|
|
78
|
+
# {FileType.read_as_text} maps {Unsupported} to the +ArgumentError+
|
|
79
|
+
# binary refusal and {Error} to a +RuntimeError+ carrying the path.
|
|
80
|
+
module Extractor
|
|
81
|
+
module_function
|
|
82
|
+
|
|
83
|
+
# Raised when an extractor claims content but fails to parse it
|
|
84
|
+
# (e.g. a malformed PDF). Message is LLM-presentable.
|
|
85
|
+
Error = Class.new(StandardError)
|
|
86
|
+
|
|
87
|
+
# Raised by {.extract} / {.extract_paged} when no registry entry
|
|
88
|
+
# claims the content. Subclass of {Error} so callers that don't
|
|
89
|
+
# care about the distinction rescue one class.
|
|
90
|
+
Unsupported = Class.new(Error)
|
|
91
|
+
|
|
92
|
+
# @return [Integer] default line-window size for {.extract_paged}
|
|
93
|
+
# when the caller omits +limit+.
|
|
94
|
+
PAGE_DEFAULT_LIMIT = 2000
|
|
95
|
+
|
|
96
|
+
# @return [Integer] default hard byte cap on the content collected
|
|
97
|
+
# by a single {.extract_paged} call. Bypassable by paging via
|
|
98
|
+
# +offset+. The rendered output is slightly larger (line
|
|
99
|
+
# numbering, trailer) — that's the caller's concern.
|
|
100
|
+
PAGE_MAX_BYTES = 50 * 1024
|
|
101
|
+
|
|
102
|
+
# @return [Integer] default per-line character cap;
|
|
103
|
+
# {.extract_paged} truncates longer lines and appends
|
|
104
|
+
# {PAGE_LINE_TRUNCATION_MARKER}.
|
|
105
|
+
PAGE_MAX_LINE_LENGTH = 2000
|
|
106
|
+
|
|
107
|
+
# @return [String] suffix appended to a line truncated at
|
|
108
|
+
# {PAGE_MAX_LINE_LENGTH}.
|
|
109
|
+
PAGE_LINE_TRUNCATION_MARKER = "... (line truncated to #{PAGE_MAX_LINE_LENGTH} chars)"
|
|
110
|
+
|
|
111
|
+
# One windowed slice of a document, returned by {.extract_paged}.
|
|
112
|
+
# The caller turns this into an observation; this struct carries
|
|
113
|
+
# everything a trailer needs without the caller re-reading the
|
|
114
|
+
# document.
|
|
115
|
+
#
|
|
116
|
+
# == Fields
|
|
117
|
+
#
|
|
118
|
+
# * +lines+ — +Array<String>+, the collected window. Already
|
|
119
|
+
# per-line truncated (with {PAGE_LINE_TRUNCATION_MARKER}); *not*
|
|
120
|
+
# line-numbered — numbering is presentation the caller adds. For
|
|
121
|
+
# a PDF the array includes the +"--- Page N ---"+ marker lines
|
|
122
|
+
# pikuri-pdf's extractor emits, which count toward +limit+ / the
|
|
123
|
+
# byte cap like any other line.
|
|
124
|
+
# * +start_line+ — the 1-indexed line number of +lines.first+
|
|
125
|
+
# (i.e. the +offset+ the caller asked for). +lines.last+ is at
|
|
126
|
+
# +start_line + lines.length - 1+.
|
|
127
|
+
# * +total_lines+ — total line count of the document when known,
|
|
128
|
+
# else +nil+. Known when the read reached EOF, when the format
|
|
129
|
+
# was extracted in full (no +extract_lines+ — e.g. HTML), or
|
|
130
|
+
# when the lazy stream is cheap enough to count to the end
|
|
131
|
+
# (plain text). +nil+ when a lazy stream stopped early — the
|
|
132
|
+
# byte cap fired, or a PDF filled the window before its last
|
|
133
|
+
# page (counting the rest would mean parsing every page,
|
|
134
|
+
# defeating the laziness).
|
|
135
|
+
# * +more+ — +true+ if content remains past this window (the
|
|
136
|
+
# caller should offer +offset = start_line + lines.length+).
|
|
137
|
+
# * +byte_capped+ — +true+ if the byte cap (not the line limit)
|
|
138
|
+
# was the stopping criterion.
|
|
139
|
+
# * +kind+ — the matched extractor's +kind+ tag (+:text+ /
|
|
140
|
+
# +:pdf+ / +:html+); lets the caller word format-specific
|
|
141
|
+
# trailers and the empty-document message.
|
|
142
|
+
#
|
|
143
|
+
# An empty document yields +lines: []+, +total_lines: 0+; an
|
|
144
|
+
# +offset+ past EOF yields +lines: []+ with +total_lines+ set to
|
|
145
|
+
# the real (non-zero) count — the caller distinguishes the two.
|
|
146
|
+
Page = Data.define(:lines, :start_line, :total_lines, :more, :byte_capped, :kind)
|
|
147
|
+
|
|
148
|
+
# The ordered list of extractors; lookups take the first entry that
# matches. Core starts with two: {HTML}, which matches on content-type, and
# {Passthrough}, the terminal plain-text arm.
#
# A gem adding a format chooses its position by how strong its claim is:
#
# * a magic-byte sniff that never misfires on text goes at the *front*,
#   so it beats {HTML}'s content-type match even under a lying header
#   (+registry.unshift(X)+ — pikuri-pdf does this);
# * a content-type or weaker-sniff claimer goes just before the terminal
#   entry (+registry.insert(-2, X)+ — pikuri-extractors does this).
#
# The array is created on first call and the same instance is returned
# afterwards.
#
# @return [Array<#matches?>] mutable, deliberately — this is the plug-in
#   seam.
def registry
  return @registry if @registry

  @registry = [HTML, Passthrough]
end
|
|
164
|
+
|
|
165
|
+
# Return the entire document behind +io+ as a single Markdown-flavoured
# UTF-8 String. The result may be empty (an empty text file, a
# scanned-image PDF with no extractable text).
#
# @param io [IO, StringIO] seekable IO positioned at the start of the
#   content; a leading sample is read for the +matches?+ sniff and the IO
#   is rewound before extracting.
# @param content_type [String, nil] normalized content-type when the
#   transport supplies one (HTTP header, {FileType.detect_mime} result);
#   +nil+ when unknown — extractors then rely on their byte sniffs.
# @return [String]
# @raise [Unsupported] when no registry entry claims the content.
# @raise [Error] when the matched extractor cannot parse it.
def extract(io, content_type: nil)
  matched = extractor_for(io, content_type)
  matched.extract(io)
end
|
|
182
|
+
|
|
183
|
+
# Extract +io+ and return a windowed {Page}: the lines from
|
|
184
|
+
# +offset+ (1-indexed) up to +limit+ of them, stopping early if
|
|
185
|
+
# +max_bytes+ is reached, with over-long lines truncated at
|
|
186
|
+
# +max_line_length+.
|
|
187
|
+
#
|
|
188
|
+
# Lazy where the format allows: extractors that implement
|
|
189
|
+
# +extract_lines+ (plain text; pikuri-pdf's PDF) are consumed
|
|
190
|
+
# only until the window fills — reading the first window of a
|
|
191
|
+
# 500-page PDF parses a handful of pages, and the first page of
|
|
192
|
+
# a gigabyte log never loads it. Extractors without it (HTML) are extracted
|
|
193
|
+
# in full and then windowed, which is also what makes their
|
|
194
|
+
# +total_lines+ always exact.
|
|
195
|
+
#
|
|
196
|
+
# @param io [IO, StringIO] seekable IO positioned at the start.
|
|
197
|
+
# @param content_type [String, nil] as for {.extract}.
|
|
198
|
+
# @param offset [Integer] 1-indexed first line to include. The
|
|
199
|
+
# caller is responsible for validating +offset >= 1+.
|
|
200
|
+
# @param limit [Integer] maximum lines to collect. Caller
|
|
201
|
+
# validates +limit >= 1+.
|
|
202
|
+
# @param max_bytes [Integer] hard byte cap on collected content.
|
|
203
|
+
# @param max_line_length [Integer] per-line truncation threshold.
|
|
204
|
+
# @return [Page] the windowed slice.
|
|
205
|
+
# @raise [Unsupported] when no registry entry claims the content.
|
|
206
|
+
# @raise [Error] when the matched extractor cannot parse it.
|
|
207
|
+
def extract_paged(io, content_type: nil, offset: 1, limit: PAGE_DEFAULT_LIMIT,
                  max_bytes: PAGE_MAX_BYTES, max_line_length: PAGE_MAX_LINE_LENGTH)
  extractor = extractor_for(io, content_type)

  # Extractors exposing +extract_lines+ are consumed lazily; all
  # others are extracted in full and split into an Array, which is
  # why their line total is known before windowing starts.
  streaming = extractor.respond_to?(:extract_lines)
  source = streaming ? extractor.extract_lines(io) : extractor.extract(io).split("\n")

  # count_tail is a per-format economics call. Once the window has
  # filled, counting what is left of a plain-text stream is a cheap
  # sequential read (and lets the trailer say "of N"); for a PDF it
  # would mean parsing every remaining page, which is exactly what
  # extract_lines exists to avoid. So only Passthrough opts in, and
  # plugged-in extractors (pikuri-pdf's included) get the
  # conservative default: stop early, total unknown.
  window(source,
         offset: offset, limit: limit, max_bytes: max_bytes,
         max_line_length: max_line_length, kind: extractor.kind,
         known_total: streaming ? nil : source.length,
         count_tail: streaming && extractor.equal?(Passthrough))
end
|
|
229
|
+
|
|
230
|
+
# Find the first registry entry claiming +io+'s content: read the
|
|
231
|
+
# leading {FileType::SAMPLE_BYTES} for the sniff, rewind, and ask
|
|
232
|
+
# each extractor in order.
|
|
233
|
+
#
|
|
234
|
+
# @param io [IO, StringIO] seekable IO positioned at the start.
|
|
235
|
+
# @param content_type [String, nil]
|
|
236
|
+
# @return [#extract] the matched extractor.
|
|
237
|
+
# @raise [Unsupported] when nothing matches.
|
|
238
|
+
def extractor_for(io, content_type)
  # Sniff the leading bytes, then rewind so the chosen extractor
  # starts from the beginning. A nil read is treated as an empty
  # (mutable) sample.
  head = io.read(FileType::SAMPLE_BYTES) || +''
  io.rewind

  # Registry order matters: the first entry that claims the content wins.
  claimed = registry.find do |candidate|
    candidate.matches?(sample: head, content_type: content_type)
  end
  return claimed if claimed

  # Mention the content type in the error only when one was given.
  hint = content_type && !content_type.empty? ? " (content-type #{content_type.inspect})" : ''
  raise Unsupported, "no extractor for this content#{hint}"
end
|
|
245
|
+
private_class_method :extractor_for
|
|
246
|
+
|
|
247
|
+
# Collect a {Page} window out of +lines+ (an Array or a lazy
|
|
248
|
+
# Enumerator of already-+chomp+ed lines). +known_total+ is the
|
|
249
|
+
# full line count when the caller extracted everything up front
|
|
250
|
+
# (Array case), +nil+ for a lazy stream — then +total_lines+ is
|
|
251
|
+
# exact only if the iteration reached EOF: +count_tail+ keeps
|
|
252
|
+
# the loop counting (without collecting) past the line limit
|
|
253
|
+
# when consuming the rest of the stream is cheap; without it the
|
|
254
|
+
# loop breaks and leaves the total unknown. The byte cap always
|
|
255
|
+
# aborts the count.
|
|
256
|
+
#
|
|
257
|
+
# @param lines [Enumerable<String>]
|
|
258
|
+
# @param known_total [Integer, nil]
|
|
259
|
+
# @param count_tail [Boolean]
|
|
260
|
+
# @return [Page]
|
|
261
|
+
def window(lines, offset:, limit:, max_bytes:, max_line_length:, kind:,
           known_total:, count_tail: false)
  skip = offset - 1       # number of leading lines before the window
  kept = []
  line_no = 0             # lines pulled from +lines+ so far
  used_bytes = 0
  hit_byte_cap = false
  has_more = false
  aborted = false         # true when iteration stopped before EOF

  lines.each do |raw|
    line_no += 1
    next unless line_no > skip

    if kept.length < limit
      clipped = truncate_line(raw, max_line_length)
      cost = clipped.bytesize + 1 # +1 for the joining newline
      if used_bytes + cost <= max_bytes
        kept << clipped
        used_bytes += cost
        next
      end
      # This line would overflow the byte budget: fall through to
      # the shared abort path below.
      hit_byte_cap = true
    elsif count_tail
      # Window is full, but keep counting (without collecting) so
      # the total stays exact.
      has_more = true
      next
    end

    # Reached when the byte cap was hit, or when the line limit was
    # hit and the tail is not being counted.
    has_more = true
    aborted = true
    break
  end

  # Without an up-front total, the running count is only exact when
  # the iteration ran to EOF.
  total = known_total || (aborted ? nil : line_no)
  Page.new(lines: kept, start_line: offset, total_lines: total,
           more: has_more, byte_capped: hit_byte_cap, kind: kind)
end
|
|
299
|
+
private_class_method :window
|
|
300
|
+
|
|
301
|
+
# Truncate +line+ to +max_line_length+ chars, appending
|
|
302
|
+
# {PAGE_LINE_TRUNCATION_MARKER} when it overflows.
|
|
303
|
+
#
|
|
304
|
+
# @param line [String]
|
|
305
|
+
# @param max_line_length [Integer]
|
|
306
|
+
# @return [String]
|
|
307
|
+
def truncate_line(line, max_line_length)
  if line.length > max_line_length
    # Keep the first max_line_length characters and flag the cut.
    line.slice(0, max_line_length) + PAGE_LINE_TRUNCATION_MARKER
  else
    # Short enough: hand back the very same string object.
    line
  end
end
|
|
312
|
+
private_class_method :truncate_line
|
|
313
|
+
end
|
|
314
|
+
end
|