pikuri-core 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pikuri/agent/configurator.rb +9 -2
- data/lib/pikuri/agent/context_window_detector.rb +70 -10
- data/lib/pikuri/agent/control/interloper.rb +10 -2
- data/lib/pikuri/agent/event.rb +15 -0
- data/lib/pikuri/agent/extension.rb +37 -9
- data/lib/pikuri/agent/listener/terminal.rb +22 -36
- data/lib/pikuri/agent.rb +174 -73
- data/lib/pikuri/extractor/html.rb +303 -0
- data/lib/pikuri/extractor/passthrough.rb +64 -0
- data/lib/pikuri/extractor.rb +314 -0
- data/lib/pikuri/file_type.rb +87 -59
- data/lib/pikuri/finalizers.rb +118 -0
- data/lib/pikuri/paths.rb +29 -0
- data/lib/pikuri/subprocess.rb +109 -12
- data/lib/pikuri/tool/calculator.rb +213 -41
- data/lib/pikuri/tool/fetch.rb +10 -9
- data/lib/pikuri/tool/scraper.rb +186 -0
- data/lib/pikuri/tool/web_scrape.rb +5 -5
- data/lib/pikuri/version.rb +1 -1
- data/lib/pikuri-core.rb +0 -1
- metadata +8 -62
- data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
- data/lib/pikuri/tool/scraper/html.rb +0 -285
- data/lib/pikuri/tool/scraper/pdf.rb +0 -54
- data/lib/pikuri/tool/scraper/simple.rb +0 -183
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'faraday'
|
|
4
|
+
require 'stringio'
|
|
5
|
+
require 'uri'
|
|
6
|
+
|
|
7
|
+
module Pikuri
|
|
8
|
+
class Tool
|
|
9
|
+
# HTTP side of the web tools ({Tool::WEB_SCRAPE} and {Tool::FETCH}):
|
|
10
|
+
# GET the URL with a real-browser User-Agent, follow redirects, and
|
|
11
|
+
# hand the response body to {Pikuri::Extractor.extract} with the
|
|
12
|
+
# response's +Content-Type+ as the hint. HTML/XHTML render via
|
|
13
|
+
# {Extractor::HTML}, any other +text/*+ type passes through
|
|
14
|
+
# verbatim, and plug-in extractors extend the set (with pikuri-pdf
|
|
15
|
+
# registered, +application/pdf+ extracts — by header or by +%PDF-+
|
|
16
|
+
# magic, so a PDF served under a lying header still works); the
|
|
17
|
+
# remaining types raise {FetchError} so the LLM observes the
|
|
18
|
+
# failure instead of receiving an empty rendering.
|
|
19
|
+
#
|
|
20
|
+
# Split into a thin HTTP fetch ({.fetch}) and the extraction
|
|
21
|
+
# wrapper ({.visit}) so tests can drive each piece in isolation and
|
|
22
|
+
# {Tool::Fetch} can reuse the HTTP half without the extraction
|
|
23
|
+
# pass. Nothing here knows about the LLM; the tools that wrap this
|
|
24
|
+
# module own caching and truncation and turn rendered Markdown (or
|
|
25
|
+
# {FetchError}) into the next observation.
|
|
26
|
+
module Scraper
|
|
27
|
+
# Raised when a URL cannot be rendered into Markdown text — HTTP
|
|
28
|
+
# non-2xx, network failure, redirect-loop, missing +Location+,
|
|
29
|
+
# unsupported content-type, or a parse failure that reads as "try
|
|
30
|
+
# a different URL" to the LLM. Catching this in
|
|
31
|
+
# {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the failure into an
|
|
32
|
+
# +"Error: ..."+ observation; anything else bubbles up so genuine
|
|
33
|
+
# bugs stay visible.
|
|
34
|
+
class FetchError < StandardError; end
|
|
35
|
+
|
|
36
|
+
# @return [String] User-Agent sent with each request; many sites
|
|
37
|
+
# reject requests with no UA or an obvious bot UA
|
|
38
|
+
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
|
|
39
|
+
'(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
|
40
|
+
# @return [String] +Accept+ header sent with each request, so
|
|
41
|
+
# servers that content-negotiate hand back something we can use:
|
|
42
|
+
# rendered HTML first, +application/pdf+ for hosts with a PDF
|
|
43
|
+
# extractor registered, then any +text/*+ for the verbatim
|
|
44
|
+
# pass-through arm.
|
|
45
|
+
ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
|
|
46
|
+
# @return [Integer] maximum number of HTTP redirects to follow
|
|
47
|
+
# before giving up
|
|
48
|
+
MAX_REDIRECTS = 5
|
|
49
|
+
# @return [Integer] connect timeout in seconds for the underlying
|
|
50
|
+
# Faraday request
|
|
51
|
+
OPEN_TIMEOUT = 10
|
|
52
|
+
# @return [Integer] read timeout in seconds for the underlying
|
|
53
|
+
# Faraday request
|
|
54
|
+
READ_TIMEOUT = 20
|
|
55
|
+
|
|
56
|
+
# @return [Integer] maximum number of characters of an error
|
|
57
|
+
# response body to include in a {FetchError} message. The body is
|
|
58
|
+
# often a multi-kilobyte HTML challenge page (Cloudflare, WAF
|
|
59
|
+
# interstitial, etc.); a short excerpt tells the LLM what kind of
|
|
60
|
+
# page came back without flooding the next observation.
|
|
61
|
+
ERROR_BODY_EXCERPT = 200
|
|
62
|
+
|
|
63
|
+
# Result of a successful {Scraper.fetch}: the response body, the
|
|
64
|
+
# normalized content-type (lower-cased, with any +; charset=...+
|
|
65
|
+
# parameters stripped), and the final URL after redirects.
|
|
66
|
+
Fetched = Data.define(:body, :content_type, :url)
|
|
67
|
+
|
|
68
|
+
# Fetch +url+ and render its main content as Markdown.
|
|
69
|
+
#
|
|
70
|
+
# No caching here — every call hits the network. Callers that want
|
|
71
|
+
# to memoize results should wrap this method themselves (see
|
|
72
|
+
# {Tool::WebScrape.visit}, which does exactly that).
|
|
73
|
+
#
|
|
74
|
+
# The extracted output is +String#strip+'d so the LLM never sees
|
|
75
|
+
# a body that opens or closes with blank lines — common with
|
|
76
|
+
# extracted PDFs' page-feed whitespace and with text bodies that
|
|
77
|
+
# carry a trailing newline. Interior whitespace is preserved
|
|
78
|
+
# because Markdown paragraph breaks and source-code indentation
|
|
79
|
+
# are load-bearing.
|
|
80
|
+
#
|
|
81
|
+
# @param url [String] absolute HTTP(S) URL of the page to download
|
|
82
|
+
# @return [String] full Markdown representation of the page with
|
|
83
|
+
# leading/trailing whitespace trimmed, uncapped otherwise —
|
|
84
|
+
# caller is responsible for any size limiting before feeding
|
|
85
|
+
# the result back to the LLM
|
|
86
|
+
# @raise [FetchError] on HTTP non-2xx, network failure, redirect
|
|
87
|
+
# loop, a 3xx without a +Location+ header, a response no
|
|
88
|
+
# extractor recognizes, or an extraction failure (malformed
|
|
89
|
+
# PDF, ...)
|
|
90
|
+
def self.visit(url)
|
|
91
|
+
extract(fetch(url)).strip
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
# Render a {Fetched} response as Markdown via
|
|
95
|
+
# {Pikuri::Extractor.extract}, re-raising both extraction failure
|
|
96
|
+
# modes as {FetchError} — the single exception type the web tools
|
|
97
|
+
# rescue. The content-type is passed verbatim (including the +""+
|
|
98
|
+
# of a missing header, which matches no text arm — a body without
|
|
99
|
+
# transport metadata is refused, not sniffed; only a strong magic
|
|
100
|
+
# sniff like pikuri-pdf's +%PDF-+ overrides a wrong or missing
|
|
101
|
+
# header, because such a sniff never misfires on text).
|
|
102
|
+
#
|
|
103
|
+
# @param fetched [Fetched]
|
|
104
|
+
# @return [String] Markdown representation produced by the
|
|
105
|
+
# matched extractor
|
|
106
|
+
# @raise [FetchError] when no extractor matches the response's
|
|
107
|
+
# content-type, or when extraction fails
|
|
108
|
+
def self.extract(fetched)
|
|
109
|
+
Pikuri::Extractor.extract(StringIO.new(fetched.body), content_type: fetched.content_type)
|
|
110
|
+
rescue Pikuri::Extractor::Unsupported
|
|
111
|
+
raise FetchError, "unsupported content-type #{fetched.content_type.inspect} for #{fetched.url}"
|
|
112
|
+
rescue Pikuri::Extractor::Error => e
|
|
113
|
+
raise FetchError, e.message
|
|
114
|
+
end
|
|
115
|
+
|
|
116
|
+
# Download the body of +url+, manually following up to
|
|
117
|
+
# {MAX_REDIRECTS} redirects. Faraday is configured with no
|
|
118
|
+
# middleware so behavior here mirrors the rest of the codebase
|
|
119
|
+
# (see +Tool::Search::DuckDuckGo.search+).
|
|
120
|
+
#
|
|
121
|
+
# All recoverable failures — HTTP 4xx/5xx, +Faraday::Error+ network
|
|
122
|
+
# blips, exhausted redirect budget, 3xx without a +Location+ —
|
|
123
|
+
# surface as {FetchError} so the caller has a single exception type
|
|
124
|
+
# to rescue. Error bodies are trimmed to {ERROR_BODY_EXCERPT}
|
|
125
|
+
# characters with whitespace collapsed, so a Cloudflare-challenge
|
|
126
|
+
# response doesn't dump kilobytes of inline HTML into the next LLM
|
|
127
|
+
# observation.
|
|
128
|
+
#
|
|
129
|
+
# @param url [String] absolute HTTP(S) URL to fetch
|
|
130
|
+
# @param limit [Integer] redirects remaining; recurses with
|
|
131
|
+
# +limit - 1+ on each 3xx
|
|
132
|
+
# @return [Fetched] body, normalized content-type, and final URL
|
|
133
|
+
# after redirects
|
|
134
|
+
# @raise [FetchError] on non-2xx/3xx responses, network errors,
|
|
135
|
+
# redirect-loop exhaustion, or 3xx without a +Location+ header
|
|
136
|
+
def self.fetch(url, limit: MAX_REDIRECTS)
|
|
137
|
+
raise FetchError, "too many redirects fetching #{url}" if limit.zero?
|
|
138
|
+
|
|
139
|
+
response = begin
|
|
140
|
+
Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT }).get(url) do |req|
|
|
141
|
+
req.headers['User-Agent'] = USER_AGENT
|
|
142
|
+
req.headers['Accept'] = ACCEPT
|
|
143
|
+
end
|
|
144
|
+
rescue Faraday::Error => e
|
|
145
|
+
raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
case response.status
|
|
149
|
+
when 200..299
|
|
150
|
+
Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
|
|
151
|
+
when 300..399
|
|
152
|
+
location = response.headers['location']
|
|
153
|
+
raise FetchError, "HTTP #{response.status} from #{url} with no Location header" if location.nil? || location.empty?
|
|
154
|
+
|
|
155
|
+
fetch(URI.join(url, location).to_s, limit: limit - 1)
|
|
156
|
+
else
|
|
157
|
+
raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
|
|
158
|
+
end
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
# Lower-case +raw+ and strip any +; charset=...+ parameters so the
|
|
162
|
+
# extractors can match on a canonical token.
|
|
163
|
+
#
|
|
164
|
+
# @param raw [String, nil] raw +Content-Type+ header value
|
|
165
|
+
# @return [String] normalized content-type, or +""+ when the
|
|
166
|
+
# header was missing
|
|
167
|
+
def self.normalize_content_type(raw)
|
|
168
|
+
raw.to_s.split(';').first.to_s.strip.downcase
|
|
169
|
+
end
|
|
170
|
+
private_class_method :normalize_content_type
|
|
171
|
+
|
|
172
|
+
# Whitespace-collapse +body+ and clip to {ERROR_BODY_EXCERPT}
|
|
173
|
+
# characters, so the {FetchError} message stays a single readable
|
|
174
|
+
# line even when the server returned a multi-KB HTML challenge
|
|
175
|
+
# page.
|
|
176
|
+
#
|
|
177
|
+
# @param body [String, nil]
|
|
178
|
+
# @return [String]
|
|
179
|
+
def self.excerpt(body)
|
|
180
|
+
text = body.to_s.gsub(/\s+/, ' ').strip
|
|
181
|
+
text.length > ERROR_BODY_EXCERPT ? "#{text[0, ERROR_BODY_EXCERPT]}..." : text
|
|
182
|
+
end
|
|
183
|
+
private_class_method :excerpt
|
|
184
|
+
end
|
|
185
|
+
end
|
|
186
|
+
end
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
module Pikuri
|
|
4
4
|
class Tool
|
|
5
5
|
# Truncation policy and Tool spec for the +web_scrape+ tool. The actual
|
|
6
|
-
# scraping lives in {Tool::Scraper::Simple}; this module is a thin
|
|
6
|
+
# scraping lives in {Tool::Scraper}; this module is a thin
|
|
7
7
|
# wrapper that picks the scraper, applies a character cap so the LLM
|
|
8
8
|
# doesn't drown in long-form content, and exposes the result to the
|
|
9
9
|
# agent loop in OpenAI tool-call shape.
|
|
@@ -37,7 +37,7 @@ module Pikuri
|
|
|
37
37
|
CACHE
|
|
38
38
|
end
|
|
39
39
|
|
|
40
|
-
# Fetch +url+ via {Tool::Scraper::Simple} and truncate the rendered
|
|
40
|
+
# Fetch +url+ via {Tool::Scraper} and truncate the rendered
|
|
41
41
|
# Markdown to +max_chars+ characters.
|
|
42
42
|
#
|
|
43
43
|
# The full extracted Markdown is cached on disk via {.cache}, keyed
|
|
@@ -65,7 +65,7 @@ module Pikuri
|
|
|
65
65
|
# truncated, or +"Error: ..."+ on a recoverable fetch failure
|
|
66
66
|
def self.visit(url, max_chars: DEFAULT_MAX_CHARS)
|
|
67
67
|
max_chars = max_chars.clamp(1, MAX_MAX_CHARS)
|
|
68
|
-
markdown = cache.fetch(url) { Scraper::Simple.visit(url) }
|
|
68
|
+
markdown = cache.fetch(url) { Scraper.visit(url) }
|
|
69
69
|
truncate(markdown, max_chars)
|
|
70
70
|
rescue Scraper::FetchError => e
|
|
71
71
|
"Error: #{e.message}"
|
|
@@ -95,10 +95,10 @@ module Pikuri
|
|
|
95
95
|
WEB_SCRAPE = new(
|
|
96
96
|
name: 'web_scrape',
|
|
97
97
|
description: <<~DESC,
|
|
98
|
-
Scrapes the rendered webpage
|
|
98
|
+
Scrapes the rendered webpage or text file at the given URL and returns its main content as Markdown.
|
|
99
99
|
|
|
100
100
|
Usage:
|
|
101
|
-
- Use for HTML pages
|
|
101
|
+
- Use for HTML pages where you want readable content — readability extraction strips nav, sidebars, and boilerplate.
|
|
102
102
|
- For raw textual payloads (JSON, CSV, robots.txt, source files), use fetch instead — it returns bytes verbatim, while web_scrape would corrupt them with a Markdown pass.
|
|
103
103
|
- A Single Page App may return very little or no content. Do NOT retry with a larger max_chars; try a different URL instead.
|
|
104
104
|
DESC
|
data/lib/pikuri/version.rb
CHANGED
data/lib/pikuri-core.rb
CHANGED
metadata
CHANGED
|
@@ -1,29 +1,15 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: pikuri-core
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.4
|
|
4
|
+
version: 0.0.6
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Martin Vysny
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-06-04 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
|
-
- !ruby/object:Gem::Dependency
|
|
14
|
-
name: dentaku
|
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
|
16
|
-
requirements:
|
|
17
|
-
- - "~>"
|
|
18
|
-
- !ruby/object:Gem::Version
|
|
19
|
-
version: '3.5'
|
|
20
|
-
type: :runtime
|
|
21
|
-
prerelease: false
|
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
-
requirements:
|
|
24
|
-
- - "~>"
|
|
25
|
-
- !ruby/object:Gem::Version
|
|
26
|
-
version: '3.5'
|
|
27
13
|
- !ruby/object:Gem::Dependency
|
|
28
14
|
name: faraday
|
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -52,20 +38,6 @@ dependencies:
|
|
|
52
38
|
- - "~>"
|
|
53
39
|
- !ruby/object:Gem::Version
|
|
54
40
|
version: '1.19'
|
|
55
|
-
- !ruby/object:Gem::Dependency
|
|
56
|
-
name: pdf-reader
|
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
|
58
|
-
requirements:
|
|
59
|
-
- - "~>"
|
|
60
|
-
- !ruby/object:Gem::Version
|
|
61
|
-
version: '2.15'
|
|
62
|
-
type: :runtime
|
|
63
|
-
prerelease: false
|
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
-
requirements:
|
|
66
|
-
- - "~>"
|
|
67
|
-
- !ruby/object:Gem::Version
|
|
68
|
-
version: '2.15'
|
|
69
41
|
- !ruby/object:Gem::Dependency
|
|
70
42
|
name: rainbow
|
|
71
43
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -122,34 +94,6 @@ dependencies:
|
|
|
122
94
|
- - "~>"
|
|
123
95
|
- !ruby/object:Gem::Version
|
|
124
96
|
version: '1.15'
|
|
125
|
-
- !ruby/object:Gem::Dependency
|
|
126
|
-
name: tsort
|
|
127
|
-
requirement: !ruby/object:Gem::Requirement
|
|
128
|
-
requirements:
|
|
129
|
-
- - "~>"
|
|
130
|
-
- !ruby/object:Gem::Version
|
|
131
|
-
version: '0.2'
|
|
132
|
-
type: :runtime
|
|
133
|
-
prerelease: false
|
|
134
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
135
|
-
requirements:
|
|
136
|
-
- - "~>"
|
|
137
|
-
- !ruby/object:Gem::Version
|
|
138
|
-
version: '0.2'
|
|
139
|
-
- !ruby/object:Gem::Dependency
|
|
140
|
-
name: tty-markdown
|
|
141
|
-
requirement: !ruby/object:Gem::Requirement
|
|
142
|
-
requirements:
|
|
143
|
-
- - "~>"
|
|
144
|
-
- !ruby/object:Gem::Version
|
|
145
|
-
version: '0.7'
|
|
146
|
-
type: :runtime
|
|
147
|
-
prerelease: false
|
|
148
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
149
|
-
requirements:
|
|
150
|
-
- - "~>"
|
|
151
|
-
- !ruby/object:Gem::Version
|
|
152
|
-
version: '0.7'
|
|
153
97
|
- !ruby/object:Gem::Dependency
|
|
154
98
|
name: zeitwerk
|
|
155
99
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -199,16 +143,18 @@ files:
|
|
|
199
143
|
- lib/pikuri/agent/listener/token_log.rb
|
|
200
144
|
- lib/pikuri/agent/listener_list.rb
|
|
201
145
|
- lib/pikuri/agent/synthesizer.rb
|
|
146
|
+
- lib/pikuri/extractor.rb
|
|
147
|
+
- lib/pikuri/extractor/html.rb
|
|
148
|
+
- lib/pikuri/extractor/passthrough.rb
|
|
202
149
|
- lib/pikuri/file_type.rb
|
|
150
|
+
- lib/pikuri/finalizers.rb
|
|
151
|
+
- lib/pikuri/paths.rb
|
|
203
152
|
- lib/pikuri/subprocess.rb
|
|
204
153
|
- lib/pikuri/tool.rb
|
|
205
154
|
- lib/pikuri/tool/calculator.rb
|
|
206
155
|
- lib/pikuri/tool/fetch.rb
|
|
207
156
|
- lib/pikuri/tool/parameters.rb
|
|
208
|
-
- lib/pikuri/tool/scraper
|
|
209
|
-
- lib/pikuri/tool/scraper/html.rb
|
|
210
|
-
- lib/pikuri/tool/scraper/pdf.rb
|
|
211
|
-
- lib/pikuri/tool/scraper/simple.rb
|
|
157
|
+
- lib/pikuri/tool/scraper.rb
|
|
212
158
|
- lib/pikuri/tool/search/brave.rb
|
|
213
159
|
- lib/pikuri/tool/search/duckduckgo.rb
|
|
214
160
|
- lib/pikuri/tool/search/engines.rb
|
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Pikuri
|
|
4
|
-
class Tool
|
|
5
|
-
module Scraper
|
|
6
|
-
# Raised by anything in the scraper stack when a URL cannot be
|
|
7
|
-
# rendered into Markdown text — HTTP non-2xx, network failure,
|
|
8
|
-
# redirect-loop, missing +Location+, unsupported content-type, or a
|
|
9
|
-
# parse failure that reads as "try a different URL" to the LLM.
|
|
10
|
-
# Catching this in {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the
|
|
11
|
-
# failure into an +"Error: ..."+ observation; anything else bubbles
|
|
12
|
-
# up so genuine bugs stay visible.
|
|
13
|
-
class FetchError < StandardError; end
|
|
14
|
-
end
|
|
15
|
-
end
|
|
16
|
-
end
|
|
@@ -1,285 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'json'
|
|
4
|
-
require 'nokogiri'
|
|
5
|
-
require 'readability'
|
|
6
|
-
require 'reverse_markdown'
|
|
7
|
-
|
|
8
|
-
module Pikuri
|
|
9
|
-
class Tool
|
|
10
|
-
module Scraper
|
|
11
|
-
# HTML → Markdown extractor used by {Simple.visit} when the fetched
|
|
12
|
-
# response carries an HTML content-type.
|
|
13
|
-
#
|
|
14
|
-
# Always renders both views of the page when available:
|
|
15
|
-
#
|
|
16
|
-
# 1. JSON-LD section. Any +<script type="application/ld+json">+ node
|
|
17
|
-
# whose +@type+ matches a substantive schema.org content type
|
|
18
|
-
# (Product, Article, Recipe, ...) is rendered as a header — title,
|
|
19
|
-
# metadata bullets (brand, SKU, price, rating, author, published),
|
|
20
|
-
# and the +articleBody+/+description+ copy when present.
|
|
21
|
-
# 2. Readability section. The page is run through +Readability+ +
|
|
22
|
-
# +reverse_markdown+, with a +<main>+/+<article>+ fallback for
|
|
23
|
-
# pages whose content sits mostly outside +<p>+ tags.
|
|
24
|
-
#
|
|
25
|
-
# Concatenated with a horizontal rule, so the LLM gets both the
|
|
26
|
-
# structured metadata and the rendered body and can pick whichever
|
|
27
|
-
# is more useful for the task. Trades some duplication (when a
|
|
28
|
-
# publisher embeds the article body in JSON-LD AND in HTML) for
|
|
29
|
-
# fewer type-based heuristics on which branch should win — the
|
|
30
|
-
# earlier "is this Article's +description+ a teaser or the real
|
|
31
|
-
# body?" carve-out is no longer needed because both end up in
|
|
32
|
-
# the output regardless.
|
|
33
|
-
#
|
|
34
|
-
# Pure parser — no I/O. {.extract} takes an HTML string and returns
|
|
35
|
-
# Markdown, so tests can drive it against fixture HTML without a
|
|
36
|
-
# network round-trip.
|
|
37
|
-
module HTML
|
|
38
|
-
# @return [Array<String>] schema.org +@type+ values that we treat
|
|
39
|
-
# as "the primary entity of this page" when picking a JSON-LD
|
|
40
|
-
# node to render. Order does not matter — the first matching
|
|
41
|
-
# node wins. Skips noise nodes (Organization, BreadcrumbList,
|
|
42
|
-
# WebSite, ...) that ship on most pages but carry no page
|
|
43
|
-
# content.
|
|
44
|
-
INTERESTING_TYPES = %w[
|
|
45
|
-
Product Article NewsArticle BlogPosting Recipe Event Book Movie
|
|
46
|
-
].freeze
|
|
47
|
-
|
|
48
|
-
# @return [Array<String>] HTML tags preserved by the readability
|
|
49
|
-
# pass. Anything outside this list is stripped before Markdown
|
|
50
|
-
# conversion.
|
|
51
|
-
READABILITY_TAGS = %w[
|
|
52
|
-
h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
|
|
53
|
-
strong em b i br hr table thead tbody tr td th
|
|
54
|
-
].freeze
|
|
55
|
-
|
|
56
|
-
# @return [Array<String>] HTML attributes preserved by the
|
|
57
|
-
# readability pass; everything else (class, id, style, data-*)
|
|
58
|
-
# is dropped before Markdown conversion
|
|
59
|
-
READABILITY_ATTRS = %w[href src alt title].freeze
|
|
60
|
-
|
|
61
|
-
# @return [Float] minimum +<main>+/+<article>+ to Readability
|
|
62
|
-
# text-length ratio that triggers the semantic-container
|
|
63
|
-
# fallback in {.readability_to_markdown}. Picked low enough to
|
|
64
|
-
# catch the failure mode (Readability collapsing a page that
|
|
65
|
-
# uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
|
|
66
|
-
# ~5x) but high enough that pages where both produce
|
|
67
|
-
# comparable output keep Readability's noise filtering.
|
|
68
|
-
MAIN_FALLBACK_RATIO = 2.0
|
|
69
|
-
|
|
70
|
-
# @return [Integer] minimum text length the
|
|
71
|
-
# +<main>+/+<article>+ container must hold before the fallback
|
|
72
|
-
# in {.readability_to_markdown} can fire. Below this, the
|
|
73
|
-
# ratio comparison is dominated by noise and we'd swap on
|
|
74
|
-
# tiny pages where Readability is doing the right thing.
|
|
75
|
-
MAIN_FALLBACK_MIN_CHARS = 500
|
|
76
|
-
|
|
77
|
-
# Render +html+ as Markdown by emitting both the JSON-LD section
|
|
78
|
-
# (when an interesting node is present) and the readability /
|
|
79
|
-
# +<main>+ section, joined by a horizontal rule. Either section
|
|
80
|
-
# may be missing — pages with no JSON-LD return only the
|
|
81
|
-
# readability output, and a malformed page with no extractable
|
|
82
|
-
# body returns only the JSON-LD render.
|
|
83
|
-
#
|
|
84
|
-
# @param html [String] HTML document body
|
|
85
|
-
# @return [String] Markdown representation
|
|
86
|
-
def self.extract(html)
|
|
87
|
-
sections = [jsonld_section(html), readability_to_markdown(html)]
|
|
88
|
-
sections.reject! { |s| s.nil? || s.strip.empty? }
|
|
89
|
-
sections.join("\n\n---\n\n")
|
|
90
|
-
end
|
|
91
|
-
|
|
92
|
-
# Pick the first JSON-LD node whose +@type+ matches one of
|
|
93
|
-
# {INTERESTING_TYPES} and render it as Markdown. Returns +nil+
|
|
94
|
-
# when no such node exists, in which case {.extract} emits only
|
|
95
|
-
# the readability section.
|
|
96
|
-
#
|
|
97
|
-
# No content-field gating: a node carrying just +name+/+author+/
|
|
98
|
-
# +datePublished+ still renders (as a metadata-only header),
|
|
99
|
-
# because the readability pass independently produces the page
|
|
100
|
-
# body. That is the trade-off that lets us drop the type-based
|
|
101
|
-
# "is this teaser or article copy?" heuristics — duplication is
|
|
102
|
-
# acceptable when both views are available, and the LLM can
|
|
103
|
-
# pick whichever it needs.
|
|
104
|
-
#
|
|
105
|
-
# @param html [String] HTML document body
|
|
106
|
-
# @return [String, nil] Markdown render of the picked JSON-LD
|
|
107
|
-
# node, or +nil+ when nothing matched
|
|
108
|
-
def self.jsonld_section(html)
|
|
109
|
-
node = parse_jsonld(html).find do |n|
|
|
110
|
-
Array(n['@type']).any? { |t| INTERESTING_TYPES.include?(t) }
|
|
111
|
-
end
|
|
112
|
-
node ? jsonld_to_markdown(node) : nil
|
|
113
|
-
end
|
|
114
|
-
|
|
115
|
-
# Collect every JSON-LD payload embedded in +html+, flattening
|
|
116
|
-
# +@graph+ wrappers so callers see one flat array of schema.org
|
|
117
|
-
# nodes. Malformed JSON blocks are silently skipped — sites
|
|
118
|
-
# frequently ship broken JSON-LD and we only need at least one
|
|
119
|
-
# parseable block.
|
|
120
|
-
#
|
|
121
|
-
# @param html [String] HTML document body
|
|
122
|
-
# @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
|
|
123
|
-
def self.parse_jsonld(html)
|
|
124
|
-
doc = Nokogiri::HTML(html)
|
|
125
|
-
blobs = doc.css('script[type="application/ld+json"]').map(&:text)
|
|
126
|
-
|
|
127
|
-
blobs.flat_map do |raw|
|
|
128
|
-
parsed = begin
|
|
129
|
-
JSON.parse(raw)
|
|
130
|
-
rescue JSON::ParserError
|
|
131
|
-
nil
|
|
132
|
-
end
|
|
133
|
-
next [] unless parsed
|
|
134
|
-
|
|
135
|
-
nodes = parsed.is_a?(Array) ? parsed : [parsed]
|
|
136
|
-
nodes.flat_map { |n| n['@graph'].is_a?(Array) ? n['@graph'] : [n] }
|
|
137
|
-
end
|
|
138
|
-
end
|
|
139
|
-
|
|
140
|
-
# Render a single JSON-LD +node+ as Markdown: a top-level title
|
|
141
|
-
# from +name+/+headline+, a bullet list of common useful fields
|
|
142
|
-
# (brand, SKU, price, rating, author, published date, ...), the
|
|
143
|
-
# body copy, and the lead image.
|
|
144
|
-
#
|
|
145
|
-
# When the node carries +articleBody+ (the full publisher-supplied
|
|
146
|
-
# article text), that wins over +description+ — the description
|
|
147
|
-
# is typically a lede teaser and would just repeat the article's
|
|
148
|
-
# opening lines.
|
|
149
|
-
#
|
|
150
|
-
# @param node [Hash] JSON-LD node, typically picked by
|
|
151
|
-
# {.jsonld_section}
|
|
152
|
-
# @return [String] Markdown representation
|
|
153
|
-
def self.jsonld_to_markdown(node)
|
|
154
|
-
out = +''
|
|
155
|
-
name = node['name'] || node['headline']
|
|
156
|
-
out << "# #{name}\n\n" if name
|
|
157
|
-
|
|
158
|
-
offer = first_obj(node['offers'])
|
|
159
|
-
rating = first_obj(node['aggregateRating'])
|
|
160
|
-
brand = first_obj_or_string(node['brand'])
|
|
161
|
-
author = first_obj_or_string(node['author'])
|
|
162
|
-
|
|
163
|
-
brand_name = brand.is_a?(Hash) ? brand['name'] : brand
|
|
164
|
-
author_name = author.is_a?(Hash) ? author['name'] : author
|
|
165
|
-
|
|
166
|
-
fields = {
|
|
167
|
-
'Brand' => brand_name,
|
|
168
|
-
'SKU' => node['sku'],
|
|
169
|
-
'GTIN' => node['gtin13'] || node['gtin'],
|
|
170
|
-
'Price' => [offer['price'], offer['priceCurrency']].compact.join(' '),
|
|
171
|
-
'Availability' => offer['availability'],
|
|
172
|
-
'Rating' => rating['ratingValue'],
|
|
173
|
-
'Reviews' => rating['reviewCount'],
|
|
174
|
-
'Author' => author_name,
|
|
175
|
-
'Published' => node['datePublished']
|
|
176
|
-
}.reject { |_, v| v.nil? || v.to_s.strip.empty? }
|
|
177
|
-
|
|
178
|
-
unless fields.empty?
|
|
179
|
-
fields.each { |k, v| out << "- **#{k}:** #{v}\n" }
|
|
180
|
-
out << "\n"
|
|
181
|
-
end
|
|
182
|
-
|
|
183
|
-
if (body = node['articleBody'] || node['description'])
|
|
184
|
-
out << "#{body}\n\n"
|
|
185
|
-
end
|
|
186
|
-
|
|
187
|
-
if (img = node['image'])
|
|
188
|
-
img = img.first if img.is_a?(Array)
|
|
189
|
-
img = img['url'] if img.is_a?(Hash)
|
|
190
|
-
out << "\n\n" if img
|
|
191
|
-
end
|
|
192
|
-
|
|
193
|
-
out
|
|
194
|
-
end
|
|
195
|
-
|
|
196
|
-
# Run +Readability+ over +html+ to isolate the main content node,
|
|
197
|
-
# then convert that to Markdown via +reverse_markdown+. The page
|
|
198
|
-
# +<title>+ is rendered as a top-level heading.
|
|
199
|
-
#
|
|
200
|
-
# When the page uses semantic HTML5 (+<main>+ or +<article>+) but
|
|
201
|
-
# leaves most of its content outside +<p>+ tags — divs, lists,
|
|
202
|
-
# spans — Readability's paragraph-density scoring collapses the
|
|
203
|
-
# extraction to a sliver of the page. In that case we render the
|
|
204
|
-
# +<main>+/+<article>+ container directly. The fallback only
|
|
205
|
-
# fires when the container holds substantially more text than
|
|
206
|
-
# Readability picked up (see {MAIN_FALLBACK_RATIO} /
|
|
207
|
-
# {MAIN_FALLBACK_MIN_CHARS}); on pages where both agree we keep
|
|
208
|
-
# Readability so its noise filtering still strips nav/ads/etc.
|
|
209
|
-
#
|
|
210
|
-
# @param html [String] HTML document body
|
|
211
|
-
# @return [String] Markdown representation
|
|
212
|
-
def self.readability_to_markdown(html)
|
|
213
|
-
rdoc = Readability::Document.new(
|
|
214
|
-
html,
|
|
215
|
-
tags: READABILITY_TAGS,
|
|
216
|
-
attributes: READABILITY_ATTRS,
|
|
217
|
-
remove_empty_nodes: true
|
|
218
|
-
)
|
|
219
|
-
readability_html = rdoc.content
|
|
220
|
-
title = rdoc.title
|
|
221
|
-
|
|
222
|
-
body_html = main_fallback_html(html, readability_html) || readability_html
|
|
223
|
-
body = ReverseMarkdown.convert(body_html, unknown_tags: :bypass, github_flavored: true)
|
|
224
|
-
|
|
225
|
-
out = +''
|
|
226
|
-
out << "# #{title.strip}\n\n" if title && !title.strip.empty?
|
|
227
|
-
out << body
|
|
228
|
-
out
|
|
229
|
-
end
|
|
230
|
-
|
|
231
|
-
# If +html+ has a +<main>+ or +<article>+ element holding
|
|
232
|
-
# substantially more text than Readability extracted, return that
|
|
233
|
-
# container's HTML so the caller can render it instead. Returns
|
|
234
|
-
# +nil+ when the fallback should not fire — when there is no
|
|
235
|
-
# semantic container, when it's too small to be meaningful, or
|
|
236
|
-
# when Readability's output is already comparable.
|
|
237
|
-
#
|
|
238
|
-
# @param html [String] full HTML document body, used to locate
|
|
239
|
-
# the +<main>+/+<article>+ container
|
|
240
|
-
# @param readability_html [String] HTML produced by
|
|
241
|
-
# +Readability::Document#content+, used as the comparison
|
|
242
|
-
# baseline
|
|
243
|
-
# @return [String, nil] container HTML when the fallback should
|
|
244
|
-
# fire, +nil+ otherwise
|
|
245
|
-
def self.main_fallback_html(html, readability_html)
|
|
246
|
-
doc = Nokogiri::HTML(html)
|
|
247
|
-
container = doc.at_css('main') || doc.at_css('article')
|
|
248
|
-
return nil unless container
|
|
249
|
-
|
|
250
|
-
container_text_len = container.text.gsub(/\s+/, ' ').strip.length
|
|
251
|
-
return nil if container_text_len < MAIN_FALLBACK_MIN_CHARS
|
|
252
|
-
|
|
253
|
-
readability_text_len = Nokogiri::HTML(readability_html).text.gsub(/\s+/, ' ').strip.length
|
|
254
|
-
return nil if container_text_len < MAIN_FALLBACK_RATIO * readability_text_len
|
|
255
|
-
|
|
256
|
-
container.to_html
|
|
257
|
-
end
|
|
258
|
-
private_class_method :main_fallback_html
|
|
259
|
-
|
|
260
|
-
# JSON-LD fields can be a string, hash, or array of either.
|
|
261
|
-
# Normalize to a single hash (the first one if it's a list) so
|
|
262
|
-
# callers can +.dig+ safely.
|
|
263
|
-
#
|
|
264
|
-
# @param value [Object] raw JSON-LD field value
|
|
265
|
-
# @return [Hash] empty hash when +value+ does not contain a hash
|
|
266
|
-
def self.first_obj(value)
|
|
267
|
-
value = value.first if value.is_a?(Array)
|
|
268
|
-
value.is_a?(Hash) ? value : {}
|
|
269
|
-
end
|
|
270
|
-
private_class_method :first_obj
|
|
271
|
-
|
|
272
|
-
# Same idea as {.first_obj} but preserves a bare string (e.g.
|
|
273
|
-
# +brand: "Apple"+) instead of replacing it with +{}+.
|
|
274
|
-
#
|
|
275
|
-
# @param value [Object] raw JSON-LD field value
|
|
276
|
-
# @return [String, Hash, nil]
|
|
277
|
-
def self.first_obj_or_string(value)
|
|
278
|
-
value = value.first if value.is_a?(Array)
|
|
279
|
-
value
|
|
280
|
-
end
|
|
281
|
-
private_class_method :first_obj_or_string
|
|
282
|
-
end
|
|
283
|
-
end
|
|
284
|
-
end
|
|
285
|
-
end
|