pikuri-core 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pikuri/agent/listener/terminal.rb +18 -36
- data/lib/pikuri/extractor/html.rb +303 -0
- data/lib/pikuri/extractor/passthrough.rb +64 -0
- data/lib/pikuri/extractor.rb +314 -0
- data/lib/pikuri/file_type.rb +74 -266
- data/lib/pikuri/subprocess.rb +73 -2
- data/lib/pikuri/tool/calculator.rb +213 -41
- data/lib/pikuri/tool/fetch.rb +10 -9
- data/lib/pikuri/tool/scraper.rb +186 -0
- data/lib/pikuri/tool/web_scrape.rb +5 -5
- data/lib/pikuri/version.rb +1 -1
- data/lib/pikuri-core.rb +0 -1
- metadata +5 -61
- data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
- data/lib/pikuri/tool/scraper/html.rb +0 -285
- data/lib/pikuri/tool/scraper/pdf.rb +0 -54
- data/lib/pikuri/tool/scraper/simple.rb +0 -183
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: pikuri-core
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.5
|
|
4
|
+
version: 0.0.6
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Martin Vysny
|
|
@@ -10,20 +10,6 @@ bindir: bin
|
|
|
10
10
|
cert_chain: []
|
|
11
11
|
date: 2026-06-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
|
-
- !ruby/object:Gem::Dependency
|
|
14
|
-
name: dentaku
|
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
|
16
|
-
requirements:
|
|
17
|
-
- - "~>"
|
|
18
|
-
- !ruby/object:Gem::Version
|
|
19
|
-
version: '3.5'
|
|
20
|
-
type: :runtime
|
|
21
|
-
prerelease: false
|
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
-
requirements:
|
|
24
|
-
- - "~>"
|
|
25
|
-
- !ruby/object:Gem::Version
|
|
26
|
-
version: '3.5'
|
|
27
13
|
- !ruby/object:Gem::Dependency
|
|
28
14
|
name: faraday
|
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -52,20 +38,6 @@ dependencies:
|
|
|
52
38
|
- - "~>"
|
|
53
39
|
- !ruby/object:Gem::Version
|
|
54
40
|
version: '1.19'
|
|
55
|
-
- !ruby/object:Gem::Dependency
|
|
56
|
-
name: pdf-reader
|
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
|
58
|
-
requirements:
|
|
59
|
-
- - "~>"
|
|
60
|
-
- !ruby/object:Gem::Version
|
|
61
|
-
version: '2.15'
|
|
62
|
-
type: :runtime
|
|
63
|
-
prerelease: false
|
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
-
requirements:
|
|
66
|
-
- - "~>"
|
|
67
|
-
- !ruby/object:Gem::Version
|
|
68
|
-
version: '2.15'
|
|
69
41
|
- !ruby/object:Gem::Dependency
|
|
70
42
|
name: rainbow
|
|
71
43
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -122,34 +94,6 @@ dependencies:
|
|
|
122
94
|
- - "~>"
|
|
123
95
|
- !ruby/object:Gem::Version
|
|
124
96
|
version: '1.15'
|
|
125
|
-
- !ruby/object:Gem::Dependency
|
|
126
|
-
name: tsort
|
|
127
|
-
requirement: !ruby/object:Gem::Requirement
|
|
128
|
-
requirements:
|
|
129
|
-
- - "~>"
|
|
130
|
-
- !ruby/object:Gem::Version
|
|
131
|
-
version: '0.2'
|
|
132
|
-
type: :runtime
|
|
133
|
-
prerelease: false
|
|
134
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
135
|
-
requirements:
|
|
136
|
-
- - "~>"
|
|
137
|
-
- !ruby/object:Gem::Version
|
|
138
|
-
version: '0.2'
|
|
139
|
-
- !ruby/object:Gem::Dependency
|
|
140
|
-
name: tty-markdown
|
|
141
|
-
requirement: !ruby/object:Gem::Requirement
|
|
142
|
-
requirements:
|
|
143
|
-
- - "~>"
|
|
144
|
-
- !ruby/object:Gem::Version
|
|
145
|
-
version: '0.7'
|
|
146
|
-
type: :runtime
|
|
147
|
-
prerelease: false
|
|
148
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
149
|
-
requirements:
|
|
150
|
-
- - "~>"
|
|
151
|
-
- !ruby/object:Gem::Version
|
|
152
|
-
version: '0.7'
|
|
153
97
|
- !ruby/object:Gem::Dependency
|
|
154
98
|
name: zeitwerk
|
|
155
99
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -199,6 +143,9 @@ files:
|
|
|
199
143
|
- lib/pikuri/agent/listener/token_log.rb
|
|
200
144
|
- lib/pikuri/agent/listener_list.rb
|
|
201
145
|
- lib/pikuri/agent/synthesizer.rb
|
|
146
|
+
- lib/pikuri/extractor.rb
|
|
147
|
+
- lib/pikuri/extractor/html.rb
|
|
148
|
+
- lib/pikuri/extractor/passthrough.rb
|
|
202
149
|
- lib/pikuri/file_type.rb
|
|
203
150
|
- lib/pikuri/finalizers.rb
|
|
204
151
|
- lib/pikuri/paths.rb
|
|
@@ -207,10 +154,7 @@ files:
|
|
|
207
154
|
- lib/pikuri/tool/calculator.rb
|
|
208
155
|
- lib/pikuri/tool/fetch.rb
|
|
209
156
|
- lib/pikuri/tool/parameters.rb
|
|
210
|
-
- lib/pikuri/tool/scraper
|
|
211
|
-
- lib/pikuri/tool/scraper/html.rb
|
|
212
|
-
- lib/pikuri/tool/scraper/pdf.rb
|
|
213
|
-
- lib/pikuri/tool/scraper/simple.rb
|
|
157
|
+
- lib/pikuri/tool/scraper.rb
|
|
214
158
|
- lib/pikuri/tool/search/brave.rb
|
|
215
159
|
- lib/pikuri/tool/search/duckduckgo.rb
|
|
216
160
|
- lib/pikuri/tool/search/engines.rb
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Pikuri
|
|
4
|
-
class Tool
|
|
5
|
-
module Scraper
|
|
6
|
-
# Raised by anything in the scraper stack when a URL cannot be
|
|
7
|
-
# rendered into Markdown text — HTTP non-2xx, network failure,
|
|
8
|
-
# redirect-loop, missing +Location+, unsupported content-type, or a
|
|
9
|
-
# parse failure that reads as "try a different URL" to the LLM.
|
|
10
|
-
# Catching this in {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the
|
|
11
|
-
# failure into an +"Error: ..."+ observation; anything else bubbles
|
|
12
|
-
# up so genuine bugs stay visible.
|
|
13
|
-
class FetchError < StandardError; end
|
|
14
|
-
end
|
|
15
|
-
end
|
|
16
|
-
end
|
|
@@ -1,285 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'json'
|
|
4
|
-
require 'nokogiri'
|
|
5
|
-
require 'readability'
|
|
6
|
-
require 'reverse_markdown'
|
|
7
|
-
|
|
8
|
-
module Pikuri
|
|
9
|
-
class Tool
|
|
10
|
-
module Scraper
|
|
11
|
-
# HTML → Markdown extractor used by {Simple.visit} when the fetched
|
|
12
|
-
# response carries an HTML content-type.
|
|
13
|
-
#
|
|
14
|
-
# Always renders both views of the page when available:
|
|
15
|
-
#
|
|
16
|
-
# 1. JSON-LD section. Any +<script type="application/ld+json">+ node
|
|
17
|
-
# whose +@type+ matches a substantive schema.org content type
|
|
18
|
-
# (Product, Article, Recipe, ...) is rendered as a header — title,
|
|
19
|
-
# metadata bullets (brand, SKU, price, rating, author, published),
|
|
20
|
-
# and the +articleBody+/+description+ copy when present.
|
|
21
|
-
# 2. Readability section. The page is run through +Readability+ +
|
|
22
|
-
# +reverse_markdown+, with a +<main>+/+<article>+ fallback for
|
|
23
|
-
# pages whose content sits mostly outside +<p>+ tags.
|
|
24
|
-
#
|
|
25
|
-
# Concatenated with a horizontal rule, so the LLM gets both the
|
|
26
|
-
# structured metadata and the rendered body and can pick whichever
|
|
27
|
-
# is more useful for the task. Trades some duplication (when a
|
|
28
|
-
# publisher embeds the article body in JSON-LD AND in HTML) for
|
|
29
|
-
# fewer type-based heuristics on which branch should win — the
|
|
30
|
-
# earlier "is this Article's +description+ a teaser or the real
|
|
31
|
-
# body?" carve-out is no longer needed because both end up in
|
|
32
|
-
# the output regardless.
|
|
33
|
-
#
|
|
34
|
-
# Pure parser — no I/O. {.extract} takes an HTML string and returns
|
|
35
|
-
# Markdown, so tests can drive it against fixture HTML without a
|
|
36
|
-
# network round-trip.
|
|
37
|
-
module HTML
|
|
38
|
-
# @return [Array<String>] schema.org +@type+ values that we treat
|
|
39
|
-
# as "the primary entity of this page" when picking a JSON-LD
|
|
40
|
-
# node to render. Order does not matter — the first matching
|
|
41
|
-
# node wins. Skips noise nodes (Organization, BreadcrumbList,
|
|
42
|
-
# WebSite, ...) that ship on most pages but carry no page
|
|
43
|
-
# content.
|
|
44
|
-
INTERESTING_TYPES = %w[
|
|
45
|
-
Product Article NewsArticle BlogPosting Recipe Event Book Movie
|
|
46
|
-
].freeze
|
|
47
|
-
|
|
48
|
-
# @return [Array<String>] HTML tags preserved by the readability
|
|
49
|
-
# pass. Anything outside this list is stripped before Markdown
|
|
50
|
-
# conversion.
|
|
51
|
-
READABILITY_TAGS = %w[
|
|
52
|
-
h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
|
|
53
|
-
strong em b i br hr table thead tbody tr td th
|
|
54
|
-
].freeze
|
|
55
|
-
|
|
56
|
-
# @return [Array<String>] HTML attributes preserved by the
|
|
57
|
-
# readability pass; everything else (class, id, style, data-*)
|
|
58
|
-
# is dropped before Markdown conversion
|
|
59
|
-
READABILITY_ATTRS = %w[href src alt title].freeze
|
|
60
|
-
|
|
61
|
-
# @return [Float] minimum +<main>+/+<article>+ to Readability
|
|
62
|
-
# text-length ratio that triggers the semantic-container
|
|
63
|
-
# fallback in {.readability_to_markdown}. Picked low enough to
|
|
64
|
-
# catch the failure mode (Readability collapsing a page that
|
|
65
|
-
# uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
|
|
66
|
-
# ~5x) but high enough that pages where both produce
|
|
67
|
-
# comparable output keep Readability's noise filtering.
|
|
68
|
-
MAIN_FALLBACK_RATIO = 2.0
|
|
69
|
-
|
|
70
|
-
# @return [Integer] minimum text length the
|
|
71
|
-
# +<main>+/+<article>+ container must hold before the fallback
|
|
72
|
-
# in {.readability_to_markdown} can fire. Below this, the
|
|
73
|
-
# ratio comparison is dominated by noise and we'd swap on
|
|
74
|
-
# tiny pages where Readability is doing the right thing.
|
|
75
|
-
MAIN_FALLBACK_MIN_CHARS = 500
|
|
76
|
-
|
|
77
|
-
# Render +html+ as Markdown by emitting both the JSON-LD section
|
|
78
|
-
# (when an interesting node is present) and the readability /
|
|
79
|
-
# +<main>+ section, joined by a horizontal rule. Either section
|
|
80
|
-
# may be missing — pages with no JSON-LD return only the
|
|
81
|
-
# readability output, and a malformed page with no extractable
|
|
82
|
-
# body returns only the JSON-LD render.
|
|
83
|
-
#
|
|
84
|
-
# @param html [String] HTML document body
|
|
85
|
-
# @return [String] Markdown representation
|
|
86
|
-
def self.extract(html)
|
|
87
|
-
sections = [jsonld_section(html), readability_to_markdown(html)]
|
|
88
|
-
sections.reject! { |s| s.nil? || s.strip.empty? }
|
|
89
|
-
sections.join("\n\n---\n\n")
|
|
90
|
-
end
|
|
91
|
-
|
|
92
|
-
# Pick the first JSON-LD node whose +@type+ matches one of
|
|
93
|
-
# {INTERESTING_TYPES} and render it as Markdown. Returns +nil+
|
|
94
|
-
# when no such node exists, in which case {.extract} emits only
|
|
95
|
-
# the readability section.
|
|
96
|
-
#
|
|
97
|
-
# No content-field gating: a node carrying just +name+/+author+/
|
|
98
|
-
# +datePublished+ still renders (as a metadata-only header),
|
|
99
|
-
# because the readability pass independently produces the page
|
|
100
|
-
# body. That is the trade-off that lets us drop the type-based
|
|
101
|
-
# "is this teaser or article copy?" heuristics — duplication is
|
|
102
|
-
# acceptable when both views are available, and the LLM can
|
|
103
|
-
# pick whichever it needs.
|
|
104
|
-
#
|
|
105
|
-
# @param html [String] HTML document body
|
|
106
|
-
# @return [String, nil] Markdown render of the picked JSON-LD
|
|
107
|
-
# node, or +nil+ when nothing matched
|
|
108
|
-
def self.jsonld_section(html)
|
|
109
|
-
node = parse_jsonld(html).find do |n|
|
|
110
|
-
Array(n['@type']).any? { |t| INTERESTING_TYPES.include?(t) }
|
|
111
|
-
end
|
|
112
|
-
node ? jsonld_to_markdown(node) : nil
|
|
113
|
-
end
|
|
114
|
-
|
|
115
|
-
# Collect every JSON-LD payload embedded in +html+, flattening
|
|
116
|
-
# +@graph+ wrappers so callers see one flat array of schema.org
|
|
117
|
-
# nodes. Malformed JSON blocks are silently skipped — sites
|
|
118
|
-
# frequently ship broken JSON-LD and we only need at least one
|
|
119
|
-
# parseable block.
|
|
120
|
-
#
|
|
121
|
-
# @param html [String] HTML document body
|
|
122
|
-
# @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
|
|
123
|
-
def self.parse_jsonld(html)
|
|
124
|
-
doc = Nokogiri::HTML(html)
|
|
125
|
-
blobs = doc.css('script[type="application/ld+json"]').map(&:text)
|
|
126
|
-
|
|
127
|
-
blobs.flat_map do |raw|
|
|
128
|
-
parsed = begin
|
|
129
|
-
JSON.parse(raw)
|
|
130
|
-
rescue JSON::ParserError
|
|
131
|
-
nil
|
|
132
|
-
end
|
|
133
|
-
next [] unless parsed
|
|
134
|
-
|
|
135
|
-
nodes = parsed.is_a?(Array) ? parsed : [parsed]
|
|
136
|
-
nodes.flat_map { |n| n['@graph'].is_a?(Array) ? n['@graph'] : [n] }
|
|
137
|
-
end
|
|
138
|
-
end
|
|
139
|
-
|
|
140
|
-
# Render a single JSON-LD +node+ as Markdown: a top-level title
|
|
141
|
-
# from +name+/+headline+, a bullet list of common useful fields
|
|
142
|
-
# (brand, SKU, price, rating, author, published date, ...), the
|
|
143
|
-
# body copy, and the lead image.
|
|
144
|
-
#
|
|
145
|
-
# When the node carries +articleBody+ (the full publisher-supplied
|
|
146
|
-
# article text), that wins over +description+ — the description
|
|
147
|
-
# is typically a lede teaser and would just repeat the article's
|
|
148
|
-
# opening lines.
|
|
149
|
-
#
|
|
150
|
-
# @param node [Hash] JSON-LD node, typically picked by
|
|
151
|
-
# {.jsonld_section}
|
|
152
|
-
# @return [String] Markdown representation
|
|
153
|
-
def self.jsonld_to_markdown(node)
|
|
154
|
-
out = +''
|
|
155
|
-
name = node['name'] || node['headline']
|
|
156
|
-
out << "# #{name}\n\n" if name
|
|
157
|
-
|
|
158
|
-
offer = first_obj(node['offers'])
|
|
159
|
-
rating = first_obj(node['aggregateRating'])
|
|
160
|
-
brand = first_obj_or_string(node['brand'])
|
|
161
|
-
author = first_obj_or_string(node['author'])
|
|
162
|
-
|
|
163
|
-
brand_name = brand.is_a?(Hash) ? brand['name'] : brand
|
|
164
|
-
author_name = author.is_a?(Hash) ? author['name'] : author
|
|
165
|
-
|
|
166
|
-
fields = {
|
|
167
|
-
'Brand' => brand_name,
|
|
168
|
-
'SKU' => node['sku'],
|
|
169
|
-
'GTIN' => node['gtin13'] || node['gtin'],
|
|
170
|
-
'Price' => [offer['price'], offer['priceCurrency']].compact.join(' '),
|
|
171
|
-
'Availability' => offer['availability'],
|
|
172
|
-
'Rating' => rating['ratingValue'],
|
|
173
|
-
'Reviews' => rating['reviewCount'],
|
|
174
|
-
'Author' => author_name,
|
|
175
|
-
'Published' => node['datePublished']
|
|
176
|
-
}.reject { |_, v| v.nil? || v.to_s.strip.empty? }
|
|
177
|
-
|
|
178
|
-
unless fields.empty?
|
|
179
|
-
fields.each { |k, v| out << "- **#{k}:** #{v}\n" }
|
|
180
|
-
out << "\n"
|
|
181
|
-
end
|
|
182
|
-
|
|
183
|
-
if (body = node['articleBody'] || node['description'])
|
|
184
|
-
out << "#{body}\n\n"
|
|
185
|
-
end
|
|
186
|
-
|
|
187
|
-
if (img = node['image'])
|
|
188
|
-
img = img.first if img.is_a?(Array)
|
|
189
|
-
img = img['url'] if img.is_a?(Hash)
|
|
190
|
-
out << "\n\n" if img
|
|
191
|
-
end
|
|
192
|
-
|
|
193
|
-
out
|
|
194
|
-
end
|
|
195
|
-
|
|
196
|
-
# Run +Readability+ over +html+ to isolate the main content node,
|
|
197
|
-
# then convert that to Markdown via +reverse_markdown+. The page
|
|
198
|
-
# +<title>+ is rendered as a top-level heading.
|
|
199
|
-
#
|
|
200
|
-
# When the page uses semantic HTML5 (+<main>+ or +<article>+) but
|
|
201
|
-
# leaves most of its content outside +<p>+ tags — divs, lists,
|
|
202
|
-
# spans — Readability's paragraph-density scoring collapses the
|
|
203
|
-
# extraction to a sliver of the page. In that case we render the
|
|
204
|
-
# +<main>+/+<article>+ container directly. The fallback only
|
|
205
|
-
# fires when the container holds substantially more text than
|
|
206
|
-
# Readability picked up (see {MAIN_FALLBACK_RATIO} /
|
|
207
|
-
# {MAIN_FALLBACK_MIN_CHARS}); on pages where both agree we keep
|
|
208
|
-
# Readability so its noise filtering still strips nav/ads/etc.
|
|
209
|
-
#
|
|
210
|
-
# @param html [String] HTML document body
|
|
211
|
-
# @return [String] Markdown representation
|
|
212
|
-
def self.readability_to_markdown(html)
|
|
213
|
-
rdoc = Readability::Document.new(
|
|
214
|
-
html,
|
|
215
|
-
tags: READABILITY_TAGS,
|
|
216
|
-
attributes: READABILITY_ATTRS,
|
|
217
|
-
remove_empty_nodes: true
|
|
218
|
-
)
|
|
219
|
-
readability_html = rdoc.content
|
|
220
|
-
title = rdoc.title
|
|
221
|
-
|
|
222
|
-
body_html = main_fallback_html(html, readability_html) || readability_html
|
|
223
|
-
body = ReverseMarkdown.convert(body_html, unknown_tags: :bypass, github_flavored: true)
|
|
224
|
-
|
|
225
|
-
out = +''
|
|
226
|
-
out << "# #{title.strip}\n\n" if title && !title.strip.empty?
|
|
227
|
-
out << body
|
|
228
|
-
out
|
|
229
|
-
end
|
|
230
|
-
|
|
231
|
-
# If +html+ has a +<main>+ or +<article>+ element holding
|
|
232
|
-
# substantially more text than Readability extracted, return that
|
|
233
|
-
# container's HTML so the caller can render it instead. Returns
|
|
234
|
-
# +nil+ when the fallback should not fire — when there is no
|
|
235
|
-
# semantic container, when it's too small to be meaningful, or
|
|
236
|
-
# when Readability's output is already comparable.
|
|
237
|
-
#
|
|
238
|
-
# @param html [String] full HTML document body, used to locate
|
|
239
|
-
# the +<main>+/+<article>+ container
|
|
240
|
-
# @param readability_html [String] HTML produced by
|
|
241
|
-
# +Readability::Document#content+, used as the comparison
|
|
242
|
-
# baseline
|
|
243
|
-
# @return [String, nil] container HTML when the fallback should
|
|
244
|
-
# fire, +nil+ otherwise
|
|
245
|
-
def self.main_fallback_html(html, readability_html)
|
|
246
|
-
doc = Nokogiri::HTML(html)
|
|
247
|
-
container = doc.at_css('main') || doc.at_css('article')
|
|
248
|
-
return nil unless container
|
|
249
|
-
|
|
250
|
-
container_text_len = container.text.gsub(/\s+/, ' ').strip.length
|
|
251
|
-
return nil if container_text_len < MAIN_FALLBACK_MIN_CHARS
|
|
252
|
-
|
|
253
|
-
readability_text_len = Nokogiri::HTML(readability_html).text.gsub(/\s+/, ' ').strip.length
|
|
254
|
-
return nil if container_text_len < MAIN_FALLBACK_RATIO * readability_text_len
|
|
255
|
-
|
|
256
|
-
container.to_html
|
|
257
|
-
end
|
|
258
|
-
private_class_method :main_fallback_html
|
|
259
|
-
|
|
260
|
-
# JSON-LD fields can be a string, hash, or array of either.
|
|
261
|
-
# Normalize to a single hash (the first one if it's a list) so
|
|
262
|
-
# callers can +.dig+ safely.
|
|
263
|
-
#
|
|
264
|
-
# @param value [Object] raw JSON-LD field value
|
|
265
|
-
# @return [Hash] empty hash when +value+ does not contain a hash
|
|
266
|
-
def self.first_obj(value)
|
|
267
|
-
value = value.first if value.is_a?(Array)
|
|
268
|
-
value.is_a?(Hash) ? value : {}
|
|
269
|
-
end
|
|
270
|
-
private_class_method :first_obj
|
|
271
|
-
|
|
272
|
-
# Same idea as {.first_obj} but preserves a bare string (e.g.
|
|
273
|
-
# +brand: "Apple"+) instead of replacing it with +{}+.
|
|
274
|
-
#
|
|
275
|
-
# @param value [Object] raw JSON-LD field value
|
|
276
|
-
# @return [String, Hash, nil]
|
|
277
|
-
def self.first_obj_or_string(value)
|
|
278
|
-
value = value.first if value.is_a?(Array)
|
|
279
|
-
value
|
|
280
|
-
end
|
|
281
|
-
private_class_method :first_obj_or_string
|
|
282
|
-
end
|
|
283
|
-
end
|
|
284
|
-
end
|
|
285
|
-
end
|
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'pdf-reader'
|
|
4
|
-
require 'stringio'
|
|
5
|
-
|
|
6
|
-
module Pikuri
|
|
7
|
-
class Tool
|
|
8
|
-
module Scraper
|
|
9
|
-
# PDF → text extractor used by {Simple.visit} when the fetched
|
|
10
|
-
# response carries +application/pdf+. Wraps the +pdf-reader+ gem:
|
|
11
|
-
# walk every page, concatenate the extracted text, hand the result
|
|
12
|
-
# back as a single string the LLM can read.
|
|
13
|
-
#
|
|
14
|
-
# Best-effort by design. +pdf-reader+ produces clean text from PDFs
|
|
15
|
-
# generated from a digital source (LaTeX, Word export, ...) but
|
|
16
|
-
# returns nothing useful from scanned documents — there is no OCR
|
|
17
|
-
# in this path. When extraction yields no text we still return an
|
|
18
|
-
# empty string rather than raising, so the caller's cache stores a
|
|
19
|
-
# consistent result and the LLM sees an empty observation it can
|
|
20
|
-
# react to.
|
|
21
|
-
#
|
|
22
|
-
# Pure parser — no I/O. {.extract} takes PDF bytes and returns text,
|
|
23
|
-
# so tests can drive it against an in-memory fixture without
|
|
24
|
-
# touching the network.
|
|
25
|
-
module PDF
|
|
26
|
-
# Render +bytes+ as plain text, one page per paragraph.
|
|
27
|
-
#
|
|
28
|
-
# +pdf-reader+ raises a handful of typed exceptions for documents
|
|
29
|
-
# it cannot parse — broken xrefs ({::PDF::Reader::MalformedPDFError}),
|
|
30
|
-
# invalid page references ({::PDF::Reader::InvalidPageError}),
|
|
31
|
-
# encrypted/XFA files ({::PDF::Reader::UnsupportedFeatureError}).
|
|
32
|
-
# All three describe a property of the PDF the LLM can react to
|
|
33
|
-
# ("try a different URL"), so we re-raise them as {FetchError} —
|
|
34
|
-
# same convention as the HTTP layer in {Simple.fetch}. Genuine
|
|
35
|
-
# bugs in +pdf-reader+ itself surface as their own classes and
|
|
36
|
-
# crash loud.
|
|
37
|
-
#
|
|
38
|
-
# @param bytes [String] raw PDF document (binary string)
|
|
39
|
-
# @return [String] concatenated page text; possibly empty when
|
|
40
|
-
# the PDF carries no extractable text (scanned image, empty
|
|
41
|
-
# document)
|
|
42
|
-
# @raise [FetchError] when +pdf-reader+ refuses the document
|
|
43
|
-
def self.extract(bytes)
|
|
44
|
-
reader = ::PDF::Reader.new(StringIO.new(bytes))
|
|
45
|
-
reader.pages.map { |p| p.text.strip }.reject(&:empty?).join("\n\n")
|
|
46
|
-
rescue ::PDF::Reader::MalformedPDFError,
|
|
47
|
-
::PDF::Reader::InvalidPageError,
|
|
48
|
-
::PDF::Reader::UnsupportedFeatureError => e
|
|
49
|
-
raise FetchError, "PDF rendering failed: #{e.class.name.split('::').last}: #{e.message}"
|
|
50
|
-
end
|
|
51
|
-
end
|
|
52
|
-
end
|
|
53
|
-
end
|
|
54
|
-
end
|
|
@@ -1,183 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'faraday'
|
|
4
|
-
require 'uri'
|
|
5
|
-
|
|
6
|
-
module Pikuri
|
|
7
|
-
class Tool
|
|
8
|
-
# Namespace for the URL-to-Markdown scraping stack used by
|
|
9
|
-
# {Tool::WEB_SCRAPE} and {Tool::FETCH}: a content-type-dispatching
|
|
10
|
-
# fetcher ({Simple}), pure content extractors ({HTML}, {PDF}), and a
|
|
11
|
-
# shared error type ({FetchError}). Nothing here knows about the LLM
|
|
12
|
-
# — the tools that wrap these layers turn rendered Markdown (or
|
|
13
|
-
# +FetchError+) into the next observation.
|
|
14
|
-
module Scraper
|
|
15
|
-
# Plain HTTP scraper: GET the URL with a real-browser User-Agent,
|
|
16
|
-
# follow redirects, and dispatch the response body to the parser
|
|
17
|
-
# matching its +Content-Type+. HTML and XHTML route to
|
|
18
|
-
# {HTML.extract}; +application/pdf+ routes to {PDF.extract}; any
|
|
19
|
-
# other +text/*+ type (plain text, Markdown, source files, …) is
|
|
20
|
-
# passed through verbatim since the LLM can already read it; the
|
|
21
|
-
# remaining types raise {FetchError} so the LLM observes the
|
|
22
|
-
# failure instead of receiving an empty rendering.
|
|
23
|
-
#
|
|
24
|
-
# Split into a thin HTTP fetch ({.fetch}) and a content-type
|
|
25
|
-
# dispatcher ({.visit}) so tests can drive each piece in isolation.
|
|
26
|
-
# "Simple" because everything happens in one Faraday GET — no
|
|
27
|
-
# headless browser, no JS execution.
|
|
28
|
-
module Simple
|
|
29
|
-
# @return [String] User-Agent sent with each request; many sites
|
|
30
|
-
# reject requests with no UA or an obvious bot UA
|
|
31
|
-
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
|
|
32
|
-
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
|
33
|
-
# @return [String] +Accept+ header sent with each request. Lists
|
|
34
|
-
# every content-type the dispatcher in {.visit} knows how to
|
|
35
|
-
# render, so servers that content-negotiate hand back something
|
|
36
|
-
# we can use. The trailing +text/*;q=0.8+ covers the verbatim
|
|
37
|
-
# pass-through arm (plain text, Markdown, source files, …) at a
|
|
38
|
-
# lower preference than rendered HTML/PDF.
|
|
39
|
-
ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
|
|
40
|
-
# @return [Integer] maximum number of HTTP redirects to follow
|
|
41
|
-
# before giving up
|
|
42
|
-
MAX_REDIRECTS = 5
|
|
43
|
-
# @return [Integer] connect timeout in seconds for the underlying
|
|
44
|
-
# Faraday request
|
|
45
|
-
OPEN_TIMEOUT = 10
|
|
46
|
-
# @return [Integer] read timeout in seconds for the underlying
|
|
47
|
-
# Faraday request
|
|
48
|
-
READ_TIMEOUT = 20
|
|
49
|
-
|
|
50
|
-
# @return [Integer] maximum number of characters of an error
|
|
51
|
-
# response body to include in a {FetchError} message. The body is
|
|
52
|
-
# often a multi-kilobyte HTML challenge page (Cloudflare, WAF
|
|
53
|
-
# interstitial, etc.); a short excerpt tells the LLM what kind of
|
|
54
|
-
# page came back without flooding the next observation.
|
|
55
|
-
ERROR_BODY_EXCERPT = 200
|
|
56
|
-
|
|
57
|
-
# Result of a successful {Simple.fetch}: the response body, the
|
|
58
|
-
# normalized content-type (lower-cased, with any +; charset=...+
|
|
59
|
-
# parameters stripped), and the final URL after redirects. The
|
|
60
|
-
# final URL is kept so future scrapers can resolve relative links
|
|
61
|
-
# against the actual landing page rather than the originally
|
|
62
|
-
# requested one.
|
|
63
|
-
Fetched = Data.define(:body, :content_type, :url)
|
|
64
|
-
|
|
65
|
-
# Fetch +url+ and render its main content as Markdown.
|
|
66
|
-
#
|
|
67
|
-
# No caching here — every call hits the network. Callers that want
|
|
68
|
-
# to memoize results should wrap this method themselves (see
|
|
69
|
-
# {Tool::WebScrape.visit}, which does exactly that).
|
|
70
|
-
#
|
|
71
|
-
# The dispatcher's output is +String#strip+'d so the LLM never
|
|
72
|
-
# sees a body that opens or closes with blank lines — common with
|
|
73
|
-
# +pdf-reader+'s page-feed whitespace and with text bodies that
|
|
74
|
-
# carry a trailing newline. Interior whitespace is preserved
|
|
75
|
-
# because Markdown paragraph breaks and source-code indentation
|
|
76
|
-
# are load-bearing.
|
|
77
|
-
#
|
|
78
|
-
# @param url [String] absolute HTTP(S) URL of the page to download
|
|
79
|
-
# @return [String] full Markdown representation of the page with
|
|
80
|
-
# leading/trailing whitespace trimmed, uncapped otherwise —
|
|
81
|
-
# caller is responsible for any size limiting before feeding
|
|
82
|
-
# the result back to the LLM
|
|
83
|
-
# @raise [FetchError] on HTTP non-2xx, network failure, redirect
|
|
84
|
-
# loop, a 3xx without a +Location+ header, or a response whose
|
|
85
|
-
# content-type the dispatcher does not recognize
|
|
86
|
-
def self.visit(url)
|
|
87
|
-
dispatch(fetch(url)).strip
|
|
88
|
-
end
|
|
89
|
-
|
|
90
|
-
# Download the body of +url+, manually following up to
|
|
91
|
-
# {MAX_REDIRECTS} redirects. Faraday is configured with no
|
|
92
|
-
# middleware so behavior here mirrors the rest of the codebase
|
|
93
|
-
# (see +Tool::Search::DuckDuckGo.search+).
|
|
94
|
-
#
|
|
95
|
-
# All recoverable failures — HTTP 4xx/5xx, +Faraday::Error+ network
|
|
96
|
-
# blips, exhausted redirect budget, 3xx without a +Location+ —
|
|
97
|
-
# surface as {FetchError} so the caller has a single exception type
|
|
98
|
-
# to rescue. Error bodies are trimmed to {ERROR_BODY_EXCERPT}
|
|
99
|
-
# characters with whitespace collapsed, so a Cloudflare-challenge
|
|
100
|
-
# response doesn't dump kilobytes of inline HTML into the next LLM
|
|
101
|
-
# observation.
|
|
102
|
-
#
|
|
103
|
-
# @param url [String] absolute HTTP(S) URL to fetch
|
|
104
|
-
# @param limit [Integer] redirects remaining; recurses with
|
|
105
|
-
# +limit - 1+ on each 3xx
|
|
106
|
-
# @return [Fetched] body, normalized content-type, and final URL
|
|
107
|
-
# after redirects
|
|
108
|
-
# @raise [FetchError] on non-2xx/3xx responses, network errors,
|
|
109
|
-
# redirect-loop exhaustion, or 3xx without a +Location+ header
|
|
110
|
-
def self.fetch(url, limit: MAX_REDIRECTS)
|
|
111
|
-
raise FetchError, "too many redirects fetching #{url}" if limit.zero?
|
|
112
|
-
|
|
113
|
-
response = begin
|
|
114
|
-
Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT }).get(url) do |req|
|
|
115
|
-
req.headers['User-Agent'] = USER_AGENT
|
|
116
|
-
req.headers['Accept'] = ACCEPT
|
|
117
|
-
end
|
|
118
|
-
rescue Faraday::Error => e
|
|
119
|
-
raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
|
|
120
|
-
end
|
|
121
|
-
|
|
122
|
-
case response.status
|
|
123
|
-
when 200..299
|
|
124
|
-
Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
|
|
125
|
-
when 300..399
|
|
126
|
-
location = response.headers['location']
|
|
127
|
-
raise FetchError, "HTTP #{response.status} from #{url} with no Location header" if location.nil? || location.empty?
|
|
128
|
-
|
|
129
|
-
fetch(URI.join(url, location).to_s, limit: limit - 1)
|
|
130
|
-
else
|
|
131
|
-
raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
|
|
132
|
-
end
|
|
133
|
-
end
|
|
134
|
-
|
|
135
|
-
# Route a {Fetched} response to the parser that matches its
|
|
136
|
-
# content-type. Unknown types raise {FetchError} so the LLM gets a
|
|
137
|
-
# legible observation instead of an empty string.
|
|
138
|
-
#
|
|
139
|
-
# @param fetched [Fetched]
|
|
140
|
-
# @return [String] Markdown representation produced by the matched
|
|
141
|
-
# parser
|
|
142
|
-
# @raise [FetchError] when no parser matches the response's
|
|
143
|
-
# content-type
|
|
144
|
-
def self.dispatch(fetched)
|
|
145
|
-
case fetched.content_type
|
|
146
|
-
when 'text/html', 'application/xhtml+xml'
|
|
147
|
-
HTML.extract(fetched.body)
|
|
148
|
-
when 'application/pdf'
|
|
149
|
-
PDF.extract(fetched.body)
|
|
150
|
-
when %r{\Atext/}
|
|
151
|
-
fetched.body
|
|
152
|
-
else
|
|
153
|
-
raise FetchError, "unsupported content-type #{fetched.content_type.inspect} for #{fetched.url}"
|
|
154
|
-
end
|
|
155
|
-
end
|
|
156
|
-
|
|
157
|
-
# Lower-case +raw+ and strip any +; charset=...+ parameters so the
|
|
158
|
-
# dispatcher can match on a canonical token.
|
|
159
|
-
#
|
|
160
|
-
# @param raw [String, nil] raw +Content-Type+ header value
|
|
161
|
-
# @return [String] normalized content-type, or +""+ when the
|
|
162
|
-
# header was missing
|
|
163
|
-
def self.normalize_content_type(raw)
|
|
164
|
-
raw.to_s.split(';').first.to_s.strip.downcase
|
|
165
|
-
end
|
|
166
|
-
private_class_method :normalize_content_type
|
|
167
|
-
|
|
168
|
-
# Whitespace-collapse +body+ and clip to {ERROR_BODY_EXCERPT}
|
|
169
|
-
# characters, so the {FetchError} message stays a single readable
|
|
170
|
-
# line even when the server returned a multi-KB HTML challenge
|
|
171
|
-
# page.
|
|
172
|
-
#
|
|
173
|
-
# @param body [String, nil]
|
|
174
|
-
# @return [String]
|
|
175
|
-
def self.excerpt(body)
|
|
176
|
-
text = body.to_s.gsub(/\s+/, ' ').strip
|
|
177
|
-
text.length > ERROR_BODY_EXCERPT ? "#{text[0, ERROR_BODY_EXCERPT]}..." : text
|
|
178
|
-
end
|
|
179
|
-
private_class_method :excerpt
|
|
180
|
-
end
|
|
181
|
-
end
|
|
182
|
-
end
|
|
183
|
-
end
|