pikuri-core 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pikuri-core
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Vysny
@@ -10,20 +10,6 @@ bindir: bin
10
10
  cert_chain: []
11
11
  date: 2026-06-04 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: dentaku
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - "~>"
18
- - !ruby/object:Gem::Version
19
- version: '3.5'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - "~>"
25
- - !ruby/object:Gem::Version
26
- version: '3.5'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: faraday
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -52,20 +38,6 @@ dependencies:
52
38
  - - "~>"
53
39
  - !ruby/object:Gem::Version
54
40
  version: '1.19'
55
- - !ruby/object:Gem::Dependency
56
- name: pdf-reader
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - "~>"
60
- - !ruby/object:Gem::Version
61
- version: '2.15'
62
- type: :runtime
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - "~>"
67
- - !ruby/object:Gem::Version
68
- version: '2.15'
69
41
  - !ruby/object:Gem::Dependency
70
42
  name: rainbow
71
43
  requirement: !ruby/object:Gem::Requirement
@@ -122,34 +94,6 @@ dependencies:
122
94
  - - "~>"
123
95
  - !ruby/object:Gem::Version
124
96
  version: '1.15'
125
- - !ruby/object:Gem::Dependency
126
- name: tsort
127
- requirement: !ruby/object:Gem::Requirement
128
- requirements:
129
- - - "~>"
130
- - !ruby/object:Gem::Version
131
- version: '0.2'
132
- type: :runtime
133
- prerelease: false
134
- version_requirements: !ruby/object:Gem::Requirement
135
- requirements:
136
- - - "~>"
137
- - !ruby/object:Gem::Version
138
- version: '0.2'
139
- - !ruby/object:Gem::Dependency
140
- name: tty-markdown
141
- requirement: !ruby/object:Gem::Requirement
142
- requirements:
143
- - - "~>"
144
- - !ruby/object:Gem::Version
145
- version: '0.7'
146
- type: :runtime
147
- prerelease: false
148
- version_requirements: !ruby/object:Gem::Requirement
149
- requirements:
150
- - - "~>"
151
- - !ruby/object:Gem::Version
152
- version: '0.7'
153
97
  - !ruby/object:Gem::Dependency
154
98
  name: zeitwerk
155
99
  requirement: !ruby/object:Gem::Requirement
@@ -199,6 +143,9 @@ files:
199
143
  - lib/pikuri/agent/listener/token_log.rb
200
144
  - lib/pikuri/agent/listener_list.rb
201
145
  - lib/pikuri/agent/synthesizer.rb
146
+ - lib/pikuri/extractor.rb
147
+ - lib/pikuri/extractor/html.rb
148
+ - lib/pikuri/extractor/passthrough.rb
202
149
  - lib/pikuri/file_type.rb
203
150
  - lib/pikuri/finalizers.rb
204
151
  - lib/pikuri/paths.rb
@@ -207,10 +154,7 @@ files:
207
154
  - lib/pikuri/tool/calculator.rb
208
155
  - lib/pikuri/tool/fetch.rb
209
156
  - lib/pikuri/tool/parameters.rb
210
- - lib/pikuri/tool/scraper/fetch_error.rb
211
- - lib/pikuri/tool/scraper/html.rb
212
- - lib/pikuri/tool/scraper/pdf.rb
213
- - lib/pikuri/tool/scraper/simple.rb
157
+ - lib/pikuri/tool/scraper.rb
214
158
  - lib/pikuri/tool/search/brave.rb
215
159
  - lib/pikuri/tool/search/duckduckgo.rb
216
160
  - lib/pikuri/tool/search/engines.rb
@@ -1,16 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Pikuri
4
- class Tool
5
- module Scraper
6
- # Raised by anything in the scraper stack when a URL cannot be
7
- # rendered into Markdown text — HTTP non-2xx, network failure,
8
- # redirect-loop, missing +Location+, unsupported content-type, or a
9
- # parse failure that reads as "try a different URL" to the LLM.
10
- # Catching this in {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the
11
- # failure into an +"Error: ..."+ observation; anything else bubbles
12
- # up so genuine bugs stay visible.
13
- class FetchError < StandardError; end
14
- end
15
- end
16
- end
@@ -1,285 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'json'
4
- require 'nokogiri'
5
- require 'readability'
6
- require 'reverse_markdown'
7
-
8
- module Pikuri
9
- class Tool
10
- module Scraper
11
- # HTML → Markdown extractor used by {Simple.visit} when the fetched
12
- # response carries an HTML content-type.
13
- #
14
- # Always renders both views of the page when available:
15
- #
16
- # 1. JSON-LD section. Any +<script type="application/ld+json">+ node
17
- # whose +@type+ matches a substantive schema.org content type
18
- # (Product, Article, Recipe, ...) is rendered as a header — title,
19
- # metadata bullets (brand, SKU, price, rating, author, published),
20
- # and the +articleBody+/+description+ copy when present.
21
- # 2. Readability section. The page is run through +Readability+ +
22
- # +reverse_markdown+, with a +<main>+/+<article>+ fallback for
23
- # pages whose content sits mostly outside +<p>+ tags.
24
- #
25
- # Concatenated with a horizontal rule, so the LLM gets both the
26
- # structured metadata and the rendered body and can pick whichever
27
- # is more useful for the task. Trades some duplication (when a
28
- # publisher embeds the article body in JSON-LD AND in HTML) for
29
- # fewer type-based heuristics on which branch should win — the
30
- # earlier "is this Article's +description+ a teaser or the real
31
- # body?" carve-out is no longer needed because both end up in
32
- # the output regardless.
33
- #
34
- # Pure parser — no I/O. {.extract} takes an HTML string and returns
35
- # Markdown, so tests can drive it against fixture HTML without a
36
- # network round-trip.
37
- module HTML
38
- # @return [Array<String>] schema.org +@type+ values that we treat
39
- # as "the primary entity of this page" when picking a JSON-LD
40
- # node to render. Order does not matter — the first matching
41
- # node wins. Skips noise nodes (Organization, BreadcrumbList,
42
- # WebSite, ...) that ship on most pages but carry no page
43
- # content.
44
- INTERESTING_TYPES = %w[
45
- Product Article NewsArticle BlogPosting Recipe Event Book Movie
46
- ].freeze
47
-
48
- # @return [Array<String>] HTML tags preserved by the readability
49
- # pass. Anything outside this list is stripped before Markdown
50
- # conversion.
51
- READABILITY_TAGS = %w[
52
- h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
53
- strong em b i br hr table thead tbody tr td th
54
- ].freeze
55
-
56
- # @return [Array<String>] HTML attributes preserved by the
57
- # readability pass; everything else (class, id, style, data-*)
58
- # is dropped before Markdown conversion
59
- READABILITY_ATTRS = %w[href src alt title].freeze
60
-
61
- # @return [Float] minimum +<main>+/+<article>+ to Readability
62
- # text-length ratio that triggers the semantic-container
63
- # fallback in {.readability_to_markdown}. Picked low enough to
64
- # catch the failure mode (Readability collapsing a page that
65
- # uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
66
- # ~5x) but high enough that pages where both produce
67
- # comparable output keep Readability's noise filtering.
68
- MAIN_FALLBACK_RATIO = 2.0
69
-
70
- # @return [Integer] minimum text length the
71
- # +<main>+/+<article>+ container must hold before the fallback
72
- # in {.readability_to_markdown} can fire. Below this, the
73
- # ratio comparison is dominated by noise and we'd swap on
74
- # tiny pages where Readability is doing the right thing.
75
- MAIN_FALLBACK_MIN_CHARS = 500
76
-
77
- # Render +html+ as Markdown by emitting both the JSON-LD section
78
- # (when an interesting node is present) and the readability /
79
- # +<main>+ section, joined by a horizontal rule. Either section
80
- # may be missing — pages with no JSON-LD return only the
81
- # readability output, and a malformed page with no extractable
82
- # body returns only the JSON-LD render.
83
- #
84
- # @param html [String] HTML document body
85
- # @return [String] Markdown representation
86
- def self.extract(html)
87
- sections = [jsonld_section(html), readability_to_markdown(html)]
88
- sections.reject! { |s| s.nil? || s.strip.empty? }
89
- sections.join("\n\n---\n\n")
90
- end
91
-
92
- # Pick the first JSON-LD node whose +@type+ matches one of
93
- # {INTERESTING_TYPES} and render it as Markdown. Returns +nil+
94
- # when no such node exists, in which case {.extract} emits only
95
- # the readability section.
96
- #
97
- # No content-field gating: a node carrying just +name+/+author+/
98
- # +datePublished+ still renders (as a metadata-only header),
99
- # because the readability pass independently produces the page
100
- # body. That is the trade-off that lets us drop the type-based
101
- # "is this teaser or article copy?" heuristics — duplication is
102
- # acceptable when both views are available, and the LLM can
103
- # pick whichever it needs.
104
- #
105
- # @param html [String] HTML document body
106
- # @return [String, nil] Markdown render of the picked JSON-LD
107
- # node, or +nil+ when nothing matched
108
- def self.jsonld_section(html)
109
- node = parse_jsonld(html).find do |n|
110
- Array(n['@type']).any? { |t| INTERESTING_TYPES.include?(t) }
111
- end
112
- node ? jsonld_to_markdown(node) : nil
113
- end
114
-
115
- # Collect every JSON-LD payload embedded in +html+, flattening
116
- # +@graph+ wrappers so callers see one flat array of schema.org
117
- # nodes. Malformed JSON blocks are silently skipped — sites
118
- # frequently ship broken JSON-LD and we only need at least one
119
- # parseable block.
120
- #
121
- # @param html [String] HTML document body
122
- # @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
123
- def self.parse_jsonld(html)
124
- doc = Nokogiri::HTML(html)
125
- blobs = doc.css('script[type="application/ld+json"]').map(&:text)
126
-
127
- blobs.flat_map do |raw|
128
- parsed = begin
129
- JSON.parse(raw)
130
- rescue JSON::ParserError
131
- nil
132
- end
133
- next [] unless parsed
134
-
135
- nodes = parsed.is_a?(Array) ? parsed : [parsed]
136
- nodes.flat_map { |n| n['@graph'].is_a?(Array) ? n['@graph'] : [n] }
137
- end
138
- end
139
-
140
- # Render a single JSON-LD +node+ as Markdown: a top-level title
141
- # from +name+/+headline+, a bullet list of common useful fields
142
- # (brand, SKU, price, rating, author, published date, ...), the
143
- # body copy, and the lead image.
144
- #
145
- # When the node carries +articleBody+ (the full publisher-supplied
146
- # article text), that wins over +description+ — the description
147
- # is typically a lede teaser and would just repeat the article's
148
- # opening lines.
149
- #
150
- # @param node [Hash] JSON-LD node, typically picked by
151
- # {.jsonld_section}
152
- # @return [String] Markdown representation
153
- def self.jsonld_to_markdown(node)
154
- out = +''
155
- name = node['name'] || node['headline']
156
- out << "# #{name}\n\n" if name
157
-
158
- offer = first_obj(node['offers'])
159
- rating = first_obj(node['aggregateRating'])
160
- brand = first_obj_or_string(node['brand'])
161
- author = first_obj_or_string(node['author'])
162
-
163
- brand_name = brand.is_a?(Hash) ? brand['name'] : brand
164
- author_name = author.is_a?(Hash) ? author['name'] : author
165
-
166
- fields = {
167
- 'Brand' => brand_name,
168
- 'SKU' => node['sku'],
169
- 'GTIN' => node['gtin13'] || node['gtin'],
170
- 'Price' => [offer['price'], offer['priceCurrency']].compact.join(' '),
171
- 'Availability' => offer['availability'],
172
- 'Rating' => rating['ratingValue'],
173
- 'Reviews' => rating['reviewCount'],
174
- 'Author' => author_name,
175
- 'Published' => node['datePublished']
176
- }.reject { |_, v| v.nil? || v.to_s.strip.empty? }
177
-
178
- unless fields.empty?
179
- fields.each { |k, v| out << "- **#{k}:** #{v}\n" }
180
- out << "\n"
181
- end
182
-
183
- if (body = node['articleBody'] || node['description'])
184
- out << "#{body}\n\n"
185
- end
186
-
187
- if (img = node['image'])
188
- img = img.first if img.is_a?(Array)
189
- img = img['url'] if img.is_a?(Hash)
190
- out << "![image](#{img})\n\n" if img
191
- end
192
-
193
- out
194
- end
195
-
196
- # Run +Readability+ over +html+ to isolate the main content node,
197
- # then convert that to Markdown via +reverse_markdown+. The page
198
- # +<title>+ is rendered as a top-level heading.
199
- #
200
- # When the page uses semantic HTML5 (+<main>+ or +<article>+) but
201
- # leaves most of its content outside +<p>+ tags — divs, lists,
202
- # spans — Readability's paragraph-density scoring collapses the
203
- # extraction to a sliver of the page. In that case we render the
204
- # +<main>+/+<article>+ container directly. The fallback only
205
- # fires when the container holds substantially more text than
206
- # Readability picked up (see {MAIN_FALLBACK_RATIO} /
207
- # {MAIN_FALLBACK_MIN_CHARS}); on pages where both agree we keep
208
- # Readability so its noise filtering still strips nav/ads/etc.
209
- #
210
- # @param html [String] HTML document body
211
- # @return [String] Markdown representation
212
- def self.readability_to_markdown(html)
213
- rdoc = Readability::Document.new(
214
- html,
215
- tags: READABILITY_TAGS,
216
- attributes: READABILITY_ATTRS,
217
- remove_empty_nodes: true
218
- )
219
- readability_html = rdoc.content
220
- title = rdoc.title
221
-
222
- body_html = main_fallback_html(html, readability_html) || readability_html
223
- body = ReverseMarkdown.convert(body_html, unknown_tags: :bypass, github_flavored: true)
224
-
225
- out = +''
226
- out << "# #{title.strip}\n\n" if title && !title.strip.empty?
227
- out << body
228
- out
229
- end
230
-
231
- # If +html+ has a +<main>+ or +<article>+ element holding
232
- # substantially more text than Readability extracted, return that
233
- # container's HTML so the caller can render it instead. Returns
234
- # +nil+ when the fallback should not fire — when there is no
235
- # semantic container, when it's too small to be meaningful, or
236
- # when Readability's output is already comparable.
237
- #
238
- # @param html [String] full HTML document body, used to locate
239
- # the +<main>+/+<article>+ container
240
- # @param readability_html [String] HTML produced by
241
- # +Readability::Document#content+, used as the comparison
242
- # baseline
243
- # @return [String, nil] container HTML when the fallback should
244
- # fire, +nil+ otherwise
245
- def self.main_fallback_html(html, readability_html)
246
- doc = Nokogiri::HTML(html)
247
- container = doc.at_css('main') || doc.at_css('article')
248
- return nil unless container
249
-
250
- container_text_len = container.text.gsub(/\s+/, ' ').strip.length
251
- return nil if container_text_len < MAIN_FALLBACK_MIN_CHARS
252
-
253
- readability_text_len = Nokogiri::HTML(readability_html).text.gsub(/\s+/, ' ').strip.length
254
- return nil if container_text_len < MAIN_FALLBACK_RATIO * readability_text_len
255
-
256
- container.to_html
257
- end
258
- private_class_method :main_fallback_html
259
-
260
- # JSON-LD fields can be a string, hash, or array of either.
261
- # Normalize to a single hash (the first one if it's a list) so
262
- # callers can +.dig+ safely.
263
- #
264
- # @param value [Object] raw JSON-LD field value
265
- # @return [Hash] empty hash when +value+ does not contain a hash
266
- def self.first_obj(value)
267
- value = value.first if value.is_a?(Array)
268
- value.is_a?(Hash) ? value : {}
269
- end
270
- private_class_method :first_obj
271
-
272
- # Same idea as {.first_obj} but preserves a bare string (e.g.
273
- # +brand: "Apple"+) instead of replacing it with +{}+.
274
- #
275
- # @param value [Object] raw JSON-LD field value
276
- # @return [String, Hash, nil]
277
- def self.first_obj_or_string(value)
278
- value = value.first if value.is_a?(Array)
279
- value
280
- end
281
- private_class_method :first_obj_or_string
282
- end
283
- end
284
- end
285
- end
@@ -1,54 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'pdf-reader'
4
- require 'stringio'
5
-
6
- module Pikuri
7
- class Tool
8
- module Scraper
9
- # PDF → text extractor used by {Simple.visit} when the fetched
10
- # response carries +application/pdf+. Wraps the +pdf-reader+ gem:
11
- # walk every page, concatenate the extracted text, hand the result
12
- # back as a single string the LLM can read.
13
- #
14
- # Best-effort by design. +pdf-reader+ produces clean text from PDFs
15
- # generated from a digital source (LaTeX, Word export, ...) but
16
- # returns nothing useful from scanned documents — there is no OCR
17
- # in this path. When extraction yields no text we still return an
18
- # empty string rather than raising, so the caller's cache stores a
19
- # consistent result and the LLM sees an empty observation it can
20
- # react to.
21
- #
22
- # Pure parser — no I/O. {.extract} takes PDF bytes and returns text,
23
- # so tests can drive it against an in-memory fixture without
24
- # touching the network.
25
- module PDF
26
- # Render +bytes+ as plain text, one page per paragraph.
27
- #
28
- # +pdf-reader+ raises a handful of typed exceptions for documents
29
- # it cannot parse — broken xrefs ({::PDF::Reader::MalformedPDFError}),
30
- # invalid page references ({::PDF::Reader::InvalidPageError}),
31
- # encrypted/XFA files ({::PDF::Reader::UnsupportedFeatureError}).
32
- # All three describe a property of the PDF the LLM can react to
33
- # ("try a different URL"), so we re-raise them as {FetchError} —
34
- # same convention as the HTTP layer in {Simple.fetch}. Genuine
35
- # bugs in +pdf-reader+ itself surface as their own classes and
36
- # crash loud.
37
- #
38
- # @param bytes [String] raw PDF document (binary string)
39
- # @return [String] concatenated page text; possibly empty when
40
- # the PDF carries no extractable text (scanned image, empty
41
- # document)
42
- # @raise [FetchError] when +pdf-reader+ refuses the document
43
- def self.extract(bytes)
44
- reader = ::PDF::Reader.new(StringIO.new(bytes))
45
- reader.pages.map { |p| p.text.strip }.reject(&:empty?).join("\n\n")
46
- rescue ::PDF::Reader::MalformedPDFError,
47
- ::PDF::Reader::InvalidPageError,
48
- ::PDF::Reader::UnsupportedFeatureError => e
49
- raise FetchError, "PDF rendering failed: #{e.class.name.split('::').last}: #{e.message}"
50
- end
51
- end
52
- end
53
- end
54
- end
@@ -1,183 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'faraday'
4
- require 'uri'
5
-
6
- module Pikuri
7
- class Tool
8
- # Namespace for the URL-to-Markdown scraping stack used by
9
- # {Tool::WEB_SCRAPE} and {Tool::FETCH}: a content-type-dispatching
10
- # fetcher ({Simple}), pure content extractors ({HTML}, {PDF}), and a
11
- # shared error type ({FetchError}). Nothing here knows about the LLM
12
- # — the tools that wrap these layers turn rendered Markdown (or
13
- # +FetchError+) into the next observation.
14
- module Scraper
15
- # Plain HTTP scraper: GET the URL with a real-browser User-Agent,
16
- # follow redirects, and dispatch the response body to the parser
17
- # matching its +Content-Type+. HTML and XHTML route to
18
- # {HTML.extract}; +application/pdf+ routes to {PDF.extract}; any
19
- # other +text/*+ type (plain text, Markdown, source files, …) is
20
- # passed through verbatim since the LLM can already read it; the
21
- # remaining types raise {FetchError} so the LLM observes the
22
- # failure instead of receiving an empty rendering.
23
- #
24
- # Split into a thin HTTP fetch ({.fetch}) and a content-type
25
- # dispatcher ({.visit}) so tests can drive each piece in isolation.
26
- # "Simple" because everything happens in one Faraday GET — no
27
- # headless browser, no JS execution.
28
- module Simple
29
- # @return [String] User-Agent sent with each request; many sites
30
- # reject requests with no UA or an obvious bot UA
31
- USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
32
- '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
33
- # @return [String] +Accept+ header sent with each request. Lists
34
- # every content-type the dispatcher in {.visit} knows how to
35
- # render, so servers that content-negotiate hand back something
36
- # we can use. The trailing +text/*;q=0.8+ covers the verbatim
37
- # pass-through arm (plain text, Markdown, source files, …) at a
38
- # lower preference than rendered HTML/PDF.
39
- ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
40
- # @return [Integer] maximum number of HTTP redirects to follow
41
- # before giving up
42
- MAX_REDIRECTS = 5
43
- # @return [Integer] connect timeout in seconds for the underlying
44
- # Faraday request
45
- OPEN_TIMEOUT = 10
46
- # @return [Integer] read timeout in seconds for the underlying
47
- # Faraday request
48
- READ_TIMEOUT = 20
49
-
50
- # @return [Integer] maximum number of characters of an error
51
- # response body to include in a {FetchError} message. The body is
52
- # often a multi-kilobyte HTML challenge page (Cloudflare, WAF
53
- # interstitial, etc.); a short excerpt tells the LLM what kind of
54
- # page came back without flooding the next observation.
55
- ERROR_BODY_EXCERPT = 200
56
-
57
- # Result of a successful {Simple.fetch}: the response body, the
58
- # normalized content-type (lower-cased, with any +; charset=...+
59
- # parameters stripped), and the final URL after redirects. The
60
- # final URL is kept so future scrapers can resolve relative links
61
- # against the actual landing page rather than the originally
62
- # requested one.
63
- Fetched = Data.define(:body, :content_type, :url)
64
-
65
- # Fetch +url+ and render its main content as Markdown.
66
- #
67
- # No caching here — every call hits the network. Callers that want
68
- # to memoize results should wrap this method themselves (see
69
- # {Tool::WebScrape.visit}, which does exactly that).
70
- #
71
- # The dispatcher's output is +String#strip+'d so the LLM never
72
- # sees a body that opens or closes with blank lines — common with
73
- # +pdf-reader+'s page-feed whitespace and with text bodies that
74
- # carry a trailing newline. Interior whitespace is preserved
75
- # because Markdown paragraph breaks and source-code indentation
76
- # are load-bearing.
77
- #
78
- # @param url [String] absolute HTTP(S) URL of the page to download
79
- # @return [String] full Markdown representation of the page with
80
- # leading/trailing whitespace trimmed, uncapped otherwise —
81
- # caller is responsible for any size limiting before feeding
82
- # the result back to the LLM
83
- # @raise [FetchError] on HTTP non-2xx, network failure, redirect
84
- # loop, a 3xx without a +Location+ header, or a response whose
85
- # content-type the dispatcher does not recognize
86
- def self.visit(url)
87
- dispatch(fetch(url)).strip
88
- end
89
-
90
- # Download the body of +url+, manually following up to
91
- # {MAX_REDIRECTS} redirects. Faraday is configured with no
92
- # middleware so behavior here mirrors the rest of the codebase
93
- # (see +Tool::Search::DuckDuckGo.search+).
94
- #
95
- # All recoverable failures — HTTP 4xx/5xx, +Faraday::Error+ network
96
- # blips, exhausted redirect budget, 3xx without a +Location+ —
97
- # surface as {FetchError} so the caller has a single exception type
98
- # to rescue. Error bodies are trimmed to {ERROR_BODY_EXCERPT}
99
- # characters with whitespace collapsed, so a Cloudflare-challenge
100
- # response doesn't dump kilobytes of inline HTML into the next LLM
101
- # observation.
102
- #
103
- # @param url [String] absolute HTTP(S) URL to fetch
104
- # @param limit [Integer] redirects remaining; recurses with
105
- # +limit - 1+ on each 3xx
106
- # @return [Fetched] body, normalized content-type, and final URL
107
- # after redirects
108
- # @raise [FetchError] on non-2xx/3xx responses, network errors,
109
- # redirect-loop exhaustion, or 3xx without a +Location+ header
110
- def self.fetch(url, limit: MAX_REDIRECTS)
111
- raise FetchError, "too many redirects fetching #{url}" if limit.zero?
112
-
113
- response = begin
114
- Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT }).get(url) do |req|
115
- req.headers['User-Agent'] = USER_AGENT
116
- req.headers['Accept'] = ACCEPT
117
- end
118
- rescue Faraday::Error => e
119
- raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
120
- end
121
-
122
- case response.status
123
- when 200..299
124
- Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
125
- when 300..399
126
- location = response.headers['location']
127
- raise FetchError, "HTTP #{response.status} from #{url} with no Location header" if location.nil? || location.empty?
128
-
129
- fetch(URI.join(url, location).to_s, limit: limit - 1)
130
- else
131
- raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
132
- end
133
- end
134
-
135
- # Route a {Fetched} response to the parser that matches its
136
- # content-type. Unknown types raise {FetchError} so the LLM gets a
137
- # legible observation instead of an empty string.
138
- #
139
- # @param fetched [Fetched]
140
- # @return [String] Markdown representation produced by the matched
141
- # parser
142
- # @raise [FetchError] when no parser matches the response's
143
- # content-type
144
- def self.dispatch(fetched)
145
- case fetched.content_type
146
- when 'text/html', 'application/xhtml+xml'
147
- HTML.extract(fetched.body)
148
- when 'application/pdf'
149
- PDF.extract(fetched.body)
150
- when %r{\Atext/}
151
- fetched.body
152
- else
153
- raise FetchError, "unsupported content-type #{fetched.content_type.inspect} for #{fetched.url}"
154
- end
155
- end
156
-
157
- # Lower-case +raw+ and strip any +; charset=...+ parameters so the
158
- # dispatcher can match on a canonical token.
159
- #
160
- # @param raw [String, nil] raw +Content-Type+ header value
161
- # @return [String] normalized content-type, or +""+ when the
162
- # header was missing
163
- def self.normalize_content_type(raw)
164
- raw.to_s.split(';').first.to_s.strip.downcase
165
- end
166
- private_class_method :normalize_content_type
167
-
168
- # Whitespace-collapse +body+ and clip to {ERROR_BODY_EXCERPT}
169
- # characters, so the {FetchError} message stays a single readable
170
- # line even when the server returned a multi-KB HTML challenge
171
- # page.
172
- #
173
- # @param body [String, nil]
174
- # @return [String]
175
- def self.excerpt(body)
176
- text = body.to_s.gsub(/\s+/, ' ').strip
177
- text.length > ERROR_BODY_EXCERPT ? "#{text[0, ERROR_BODY_EXCERPT]}..." : text
178
- end
179
- private_class_method :excerpt
180
- end
181
- end
182
- end
183
- end