pikuri-core 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,186 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'faraday'
4
+ require 'stringio'
5
+ require 'uri'
6
+
7
+ module Pikuri
8
+ class Tool
9
+ # HTTP side of the web tools ({Tool::WEB_SCRAPE} and {Tool::FETCH}):
10
+ # GET the URL with a real-browser User-Agent, follow redirects, and
11
+ # hand the response body to {Pikuri::Extractor.extract} with the
12
+ # response's +Content-Type+ as the hint. HTML/XHTML render via
13
+ # {Extractor::HTML}, any other +text/*+ type passes through
14
+ # verbatim, and plug-in extractors extend the set (with pikuri-pdf
15
+ # registered, +application/pdf+ extracts — by header or by +%PDF-+
16
+ # magic, so a PDF served under a lying header still works); the
17
+ # remaining types raise {FetchError} so the LLM observes the
18
+ # failure instead of receiving an empty rendering.
19
+ #
20
+ # Split into a thin HTTP fetch ({.fetch}) and the extraction
21
+ # wrapper ({.visit}) so tests can drive each piece in isolation and
22
+ # {Tool::Fetch} can reuse the HTTP half without the extraction
23
+ # pass. Nothing here knows about the LLM; the tools that wrap this
24
+ # module own caching and truncation and turn rendered Markdown (or
25
+ # {FetchError}) into the next observation.
26
+ module Scraper
27
+ # Raised when a URL cannot be rendered into Markdown text — HTTP
28
+ # non-2xx, network failure, redirect-loop, missing +Location+,
29
+ # unsupported content-type, or a parse failure that reads as "try
30
+ # a different URL" to the LLM. Catching this in
31
+ # {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the failure into an
32
+ # +"Error: ..."+ observation; anything else bubbles up so genuine
33
+ # bugs stay visible.
34
+ class FetchError < StandardError; end
35
+
36
+ # @return [String] User-Agent sent with each request; many sites
37
+ # reject requests with no UA or an obvious bot UA
38
+ USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
39
+ '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
40
+ # @return [String] +Accept+ header sent with each request, so
41
+ # servers that content-negotiate hand back something we can use:
42
+ # rendered HTML first, +application/pdf+ for hosts with a PDF
43
+ # extractor registered, then any +text/*+ for the verbatim
44
+ # pass-through arm.
45
+ ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
46
+ # @return [Integer] request budget for one {.fetch} call, initial
47
+ # request included, so at most +MAX_REDIRECTS - 1+ redirects are followed
48
+ MAX_REDIRECTS = 5
49
+ # @return [Integer] connect timeout in seconds for the underlying
50
+ # Faraday request
51
+ OPEN_TIMEOUT = 10
52
+ # @return [Integer] read timeout in seconds for the underlying
53
+ # Faraday request
54
+ READ_TIMEOUT = 20
55
+
56
+ # @return [Integer] maximum number of characters of an error
57
+ # response body to include in a {FetchError} message. The body is
58
+ # often a multi-kilobyte HTML challenge page (Cloudflare, WAF
59
+ # interstitial, etc.); a short excerpt tells the LLM what kind of
60
+ # page came back without flooding the next observation.
61
+ ERROR_BODY_EXCERPT = 200
62
+
63
+ # Result of a successful {Scraper.fetch}: the response body, the
64
+ # normalized content-type (lower-cased, with any +; charset=...+
65
+ # parameters stripped), and the final URL after redirects.
66
+ Fetched = Data.define(:body, :content_type, :url)
67
+
68
+ # Fetch +url+ and render its main content as Markdown.
69
+ #
70
+ # No caching here — every call hits the network. Callers that want
71
+ # to memoize results should wrap this method themselves (see
72
+ # {Tool::WebScrape.visit}, which does exactly that).
73
+ #
74
+ # The extracted output is +String#strip+'d so the LLM never sees
75
+ # a body that opens or closes with blank lines — common with
76
+ # extracted PDFs' page-feed whitespace and with text bodies that
77
+ # carry a trailing newline. Interior whitespace is preserved
78
+ # because Markdown paragraph breaks and source-code indentation
79
+ # are load-bearing.
80
+ #
81
+ # @param url [String] absolute HTTP(S) URL of the page to download
82
+ # @return [String] full Markdown representation of the page with
83
+ # leading/trailing whitespace trimmed, uncapped otherwise —
84
+ # caller is responsible for any size limiting before feeding
85
+ # the result back to the LLM
86
+ # @raise [FetchError] on HTTP non-2xx, network failure, redirect
87
+ # loop, a 3xx without a +Location+ header, a response no
88
+ # extractor recognizes, or an extraction failure (malformed
89
+ # PDF, ...)
90
+ def self.visit(url)
91
+ extract(fetch(url)).strip
92
+ end
93
+
94
+ # Render a {Fetched} response as Markdown via
95
+ # {Pikuri::Extractor.extract}, re-raising both extraction failure
96
+ # modes as {FetchError} — the single exception type the web tools
97
+ # rescue. The content-type is passed verbatim (including the +""+
98
+ # of a missing header, which matches no text arm — a body without
99
+ # transport metadata is refused, not sniffed; only a strong magic
100
+ # sniff like pikuri-pdf's +%PDF-+ overrides a wrong or missing
101
+ # header, because such a sniff never misfires on text).
102
+ #
103
+ # @param fetched [Fetched]
104
+ # @return [String] Markdown representation produced by the
105
+ # matched extractor
106
+ # @raise [FetchError] when no extractor matches the response's
107
+ # content-type, or when extraction fails
108
+ def self.extract(fetched)
109
+ Pikuri::Extractor.extract(StringIO.new(fetched.body), content_type: fetched.content_type)
110
+ rescue Pikuri::Extractor::Unsupported
111
+ raise FetchError, "unsupported content-type #{fetched.content_type.inspect} for #{fetched.url}"
112
+ rescue Pikuri::Extractor::Error => e
113
+ raise FetchError, e.message
114
+ end
115
+
116
+ # Download the body of +url+, manually following redirects within
116
+ # a budget of +limit+ requests. Faraday is configured with no
118
+ # middleware so behavior here mirrors the rest of the codebase
119
+ # (see +Tool::Search::DuckDuckGo.search+).
120
+ #
121
+ # All handled failures — HTTP 4xx/5xx, +Faraday::Error+ network
121
+ # blips, exhausted redirect budget, 3xx without a +Location+ —
122
+ # surface as {FetchError}. NOTE(review): +URI.join+ on a malformed
123
+ # +Location+ is not rescued here. Error bodies are trimmed to {ERROR_BODY_EXCERPT}
125
+ # characters with whitespace collapsed, so a Cloudflare-challenge
126
+ # response doesn't dump kilobytes of inline HTML into the next LLM
127
+ # observation.
128
+ #
129
+ # @param url [String] absolute HTTP(S) URL to fetch
130
+ # @param limit [Integer] requests remaining, counting this one;
130
+ # raises at zero and recurses with +limit - 1+ on each 3xx
132
+ # @return [Fetched] body, normalized content-type, and final URL
133
+ # after redirects
134
+ # @raise [FetchError] on non-2xx/3xx responses, network errors,
135
+ # redirect-loop exhaustion, or 3xx without a +Location+ header
136
+ def self.fetch(url, limit: MAX_REDIRECTS)
137
+ raise FetchError, "too many redirects fetching #{url}" if limit.zero?
138
+
139
+ response = begin
140
+ Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT }).get(url) do |req|
141
+ req.headers['User-Agent'] = USER_AGENT
142
+ req.headers['Accept'] = ACCEPT
143
+ end
144
+ rescue Faraday::Error => e
145
+ raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
146
+ end
147
+
148
+ case response.status
149
+ when 200..299
150
+ Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
151
+ when 300..399
152
+ location = response.headers['location']
153
+ raise FetchError, "HTTP #{response.status} from #{url} with no Location header" if location.nil? || location.empty?
154
+
155
+ fetch(URI.join(url, location).to_s, limit: limit - 1)
156
+ else
157
+ raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
158
+ end
159
+ end
160
+
161
+ # Lower-case +raw+ and strip any +; charset=...+ parameters so the
162
+ # extractors can match on a canonical token.
163
+ #
164
+ # @param raw [String, nil] raw +Content-Type+ header value
165
+ # @return [String] normalized content-type, or +""+ when the
166
+ # header was missing
167
+ def self.normalize_content_type(raw)
168
+ raw.to_s.split(';').first.to_s.strip.downcase
169
+ end
170
+ private_class_method :normalize_content_type
171
+
172
+ # Whitespace-collapse +body+ and clip to {ERROR_BODY_EXCERPT}
173
+ # characters, so the {FetchError} message stays a single readable
174
+ # line even when the server returned a multi-KB HTML challenge
175
+ # page.
176
+ #
177
+ # @param body [String, nil]
178
+ # @return [String]
179
+ def self.excerpt(body)
180
+ text = body.to_s.gsub(/\s+/, ' ').strip
181
+ text.length > ERROR_BODY_EXCERPT ? "#{text[0, ERROR_BODY_EXCERPT]}..." : text
182
+ end
183
+ private_class_method :excerpt
184
+ end
185
+ end
186
+ end
@@ -3,7 +3,7 @@
3
3
  module Pikuri
4
4
  class Tool
5
5
  # Truncation policy and Tool spec for the +web_scrape+ tool. The actual
6
- # scraping lives in {Tool::Scraper::Simple}; this module is a thin
6
+ # scraping lives in {Tool::Scraper}; this module is a thin
7
7
  # wrapper that picks the scraper, applies a character cap so the LLM
8
8
  # doesn't drown in long-form content, and exposes the result to the
9
9
  # agent loop in OpenAI tool-call shape.
@@ -37,7 +37,7 @@ module Pikuri
37
37
  CACHE
38
38
  end
39
39
 
40
- # Fetch +url+ via {Tool::Scraper::Simple} and truncate the rendered
40
+ # Fetch +url+ via {Tool::Scraper} and truncate the rendered
41
41
  # Markdown to +max_chars+ characters.
42
42
  #
43
43
  # The full extracted Markdown is cached on disk via {.cache}, keyed
@@ -65,7 +65,7 @@ module Pikuri
65
65
  # truncated, or +"Error: ..."+ on a recoverable fetch failure
66
66
  def self.visit(url, max_chars: DEFAULT_MAX_CHARS)
67
67
  max_chars = max_chars.clamp(1, MAX_MAX_CHARS)
68
- markdown = cache.fetch(url) { Scraper::Simple.visit(url) }
68
+ markdown = cache.fetch(url) { Scraper.visit(url) }
69
69
  truncate(markdown, max_chars)
70
70
  rescue Scraper::FetchError => e
71
71
  "Error: #{e.message}"
@@ -95,10 +95,10 @@ module Pikuri
95
95
  WEB_SCRAPE = new(
96
96
  name: 'web_scrape',
97
97
  description: <<~DESC,
98
- Scrapes the rendered webpage, PDF, or text file at the given URL and returns its main content as Markdown.
98
+ Scrapes the rendered webpage or text file at the given URL and returns its main content as Markdown.
99
99
 
100
100
  Usage:
101
- - Use for HTML pages or PDFs where you want readable content — readability extraction strips nav, sidebars, and boilerplate.
101
+ - Use for HTML pages where you want readable content — readability extraction strips nav, sidebars, and boilerplate.
102
102
  - For raw textual payloads (JSON, CSV, robots.txt, source files), use fetch instead — it returns bytes verbatim, while web_scrape would corrupt them with a Markdown pass.
103
103
  - A Single Page App may return very little or no content. Do NOT retry with a larger max_chars; try a different URL instead.
104
104
  DESC
@@ -6,5 +6,5 @@ module Pikuri
6
6
  # additions to the public surface (+Pikuri::Tool+ / +Pikuri::Agent+ /
7
7
  # listeners / bundled tools), major for breaking changes to that
8
8
  # surface or to the +bin/pikuri-*+ CLIs.
9
- VERSION = '0.0.4'
9
+ VERSION = '0.0.6'
10
10
  end
data/lib/pikuri-core.rb CHANGED
@@ -169,7 +169,6 @@ module Pikuri
169
169
  Loader.ignore(File.expand_path('pikuri/version.rb', __dir__))
170
170
  Loader.inflector.inflect(
171
171
  'html' => 'HTML',
172
- 'pdf' => 'PDF',
173
172
  'duckduckgo' => 'DuckDuckGo'
174
173
  )
175
174
  Loader.setup
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pikuri-core
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Martin Vysny
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-05-29 00:00:00.000000000 Z
11
+ date: 2026-06-04 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: dentaku
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - "~>"
18
- - !ruby/object:Gem::Version
19
- version: '3.5'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - "~>"
25
- - !ruby/object:Gem::Version
26
- version: '3.5'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: faraday
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -52,20 +38,6 @@ dependencies:
52
38
  - - "~>"
53
39
  - !ruby/object:Gem::Version
54
40
  version: '1.19'
55
- - !ruby/object:Gem::Dependency
56
- name: pdf-reader
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - "~>"
60
- - !ruby/object:Gem::Version
61
- version: '2.15'
62
- type: :runtime
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - "~>"
67
- - !ruby/object:Gem::Version
68
- version: '2.15'
69
41
  - !ruby/object:Gem::Dependency
70
42
  name: rainbow
71
43
  requirement: !ruby/object:Gem::Requirement
@@ -122,34 +94,6 @@ dependencies:
122
94
  - - "~>"
123
95
  - !ruby/object:Gem::Version
124
96
  version: '1.15'
125
- - !ruby/object:Gem::Dependency
126
- name: tsort
127
- requirement: !ruby/object:Gem::Requirement
128
- requirements:
129
- - - "~>"
130
- - !ruby/object:Gem::Version
131
- version: '0.2'
132
- type: :runtime
133
- prerelease: false
134
- version_requirements: !ruby/object:Gem::Requirement
135
- requirements:
136
- - - "~>"
137
- - !ruby/object:Gem::Version
138
- version: '0.2'
139
- - !ruby/object:Gem::Dependency
140
- name: tty-markdown
141
- requirement: !ruby/object:Gem::Requirement
142
- requirements:
143
- - - "~>"
144
- - !ruby/object:Gem::Version
145
- version: '0.7'
146
- type: :runtime
147
- prerelease: false
148
- version_requirements: !ruby/object:Gem::Requirement
149
- requirements:
150
- - - "~>"
151
- - !ruby/object:Gem::Version
152
- version: '0.7'
153
97
  - !ruby/object:Gem::Dependency
154
98
  name: zeitwerk
155
99
  requirement: !ruby/object:Gem::Requirement
@@ -199,16 +143,18 @@ files:
199
143
  - lib/pikuri/agent/listener/token_log.rb
200
144
  - lib/pikuri/agent/listener_list.rb
201
145
  - lib/pikuri/agent/synthesizer.rb
146
+ - lib/pikuri/extractor.rb
147
+ - lib/pikuri/extractor/html.rb
148
+ - lib/pikuri/extractor/passthrough.rb
202
149
  - lib/pikuri/file_type.rb
150
+ - lib/pikuri/finalizers.rb
151
+ - lib/pikuri/paths.rb
203
152
  - lib/pikuri/subprocess.rb
204
153
  - lib/pikuri/tool.rb
205
154
  - lib/pikuri/tool/calculator.rb
206
155
  - lib/pikuri/tool/fetch.rb
207
156
  - lib/pikuri/tool/parameters.rb
208
- - lib/pikuri/tool/scraper/fetch_error.rb
209
- - lib/pikuri/tool/scraper/html.rb
210
- - lib/pikuri/tool/scraper/pdf.rb
211
- - lib/pikuri/tool/scraper/simple.rb
157
+ - lib/pikuri/tool/scraper.rb
212
158
  - lib/pikuri/tool/search/brave.rb
213
159
  - lib/pikuri/tool/search/duckduckgo.rb
214
160
  - lib/pikuri/tool/search/engines.rb
@@ -1,16 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Pikuri
4
- class Tool
5
- module Scraper
6
- # Raised by anything in the scraper stack when a URL cannot be
7
- # rendered into Markdown text — HTTP non-2xx, network failure,
8
- # redirect-loop, missing +Location+, unsupported content-type, or a
9
- # parse failure that reads as "try a different URL" to the LLM.
10
- # Catching this in {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the
11
- # failure into an +"Error: ..."+ observation; anything else bubbles
12
- # up so genuine bugs stay visible.
13
- class FetchError < StandardError; end
14
- end
15
- end
16
- end
@@ -1,285 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'json'
4
- require 'nokogiri'
5
- require 'readability'
6
- require 'reverse_markdown'
7
-
8
- module Pikuri
9
- class Tool
10
- module Scraper
11
- # HTML → Markdown extractor used by {Simple.visit} when the fetched
12
- # response carries an HTML content-type.
13
- #
14
- # Always renders both views of the page when available:
15
- #
16
- # 1. JSON-LD section. Any +<script type="application/ld+json">+ node
17
- # whose +@type+ matches a substantive schema.org content type
18
- # (Product, Article, Recipe, ...) is rendered as a header — title,
19
- # metadata bullets (brand, SKU, price, rating, author, published),
20
- # and the +articleBody+/+description+ copy when present.
21
- # 2. Readability section. The page is run through +Readability+ +
22
- # +reverse_markdown+, with a +<main>+/+<article>+ fallback for
23
- # pages whose content sits mostly outside +<p>+ tags.
24
- #
25
- # Concatenated with a horizontal rule, so the LLM gets both the
26
- # structured metadata and the rendered body and can pick whichever
27
- # is more useful for the task. Trades some duplication (when a
28
- # publisher embeds the article body in JSON-LD AND in HTML) for
29
- # fewer type-based heuristics on which branch should win — the
30
- # earlier "is this Article's +description+ a teaser or the real
31
- # body?" carve-out is no longer needed because both end up in
32
- # the output regardless.
33
- #
34
- # Pure parser — no I/O. {.extract} takes an HTML string and returns
35
- # Markdown, so tests can drive it against fixture HTML without a
36
- # network round-trip.
37
- module HTML
38
- # @return [Array<String>] schema.org +@type+ values that we treat
39
- # as "the primary entity of this page" when picking a JSON-LD
40
- # node to render. Order does not matter — the first matching
41
- # node wins. Skips noise nodes (Organization, BreadcrumbList,
42
- # WebSite, ...) that ship on most pages but carry no page
43
- # content.
44
- INTERESTING_TYPES = %w[
45
- Product Article NewsArticle BlogPosting Recipe Event Book Movie
46
- ].freeze
47
-
48
- # @return [Array<String>] HTML tags preserved by the readability
49
- # pass. Anything outside this list is stripped before Markdown
50
- # conversion.
51
- READABILITY_TAGS = %w[
52
- h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
53
- strong em b i br hr table thead tbody tr td th
54
- ].freeze
55
-
56
- # @return [Array<String>] HTML attributes preserved by the
57
- # readability pass; everything else (class, id, style, data-*)
58
- # is dropped before Markdown conversion
59
- READABILITY_ATTRS = %w[href src alt title].freeze
60
-
61
- # @return [Float] minimum +<main>+/+<article>+ to Readability
62
- # text-length ratio that triggers the semantic-container
63
- # fallback in {.readability_to_markdown}. Picked low enough to
64
- # catch the failure mode (Readability collapsing a page that
65
- # uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
66
- # ~5x) but high enough that pages where both produce
67
- # comparable output keep Readability's noise filtering.
68
- MAIN_FALLBACK_RATIO = 2.0
69
-
70
- # @return [Integer] minimum text length the
71
- # +<main>+/+<article>+ container must hold before the fallback
72
- # in {.readability_to_markdown} can fire. Below this, the
73
- # ratio comparison is dominated by noise and we'd swap on
74
- # tiny pages where Readability is doing the right thing.
75
- MAIN_FALLBACK_MIN_CHARS = 500
76
-
77
- # Render +html+ as Markdown by emitting both the JSON-LD section
78
- # (when an interesting node is present) and the readability /
79
- # +<main>+ section, joined by a horizontal rule. Either section
80
- # may be missing — pages with no JSON-LD return only the
81
- # readability output, and a malformed page with no extractable
82
- # body returns only the JSON-LD render.
83
- #
84
- # @param html [String] HTML document body
85
- # @return [String] Markdown representation
86
- def self.extract(html)
87
- sections = [jsonld_section(html), readability_to_markdown(html)]
88
- sections.reject! { |s| s.nil? || s.strip.empty? }
89
- sections.join("\n\n---\n\n")
90
- end
91
-
92
- # Pick the first JSON-LD node whose +@type+ matches one of
93
- # {INTERESTING_TYPES} and render it as Markdown. Returns +nil+
94
- # when no such node exists, in which case {.extract} emits only
95
- # the readability section.
96
- #
97
- # No content-field gating: a node carrying just +name+/+author+/
98
- # +datePublished+ still renders (as a metadata-only header),
99
- # because the readability pass independently produces the page
100
- # body. That is the trade-off that lets us drop the type-based
101
- # "is this teaser or article copy?" heuristics — duplication is
102
- # acceptable when both views are available, and the LLM can
103
- # pick whichever it needs.
104
- #
105
- # @param html [String] HTML document body
106
- # @return [String, nil] Markdown render of the picked JSON-LD
107
- # node, or +nil+ when nothing matched
108
- def self.jsonld_section(html)
109
- node = parse_jsonld(html).find do |n|
110
- Array(n['@type']).any? { |t| INTERESTING_TYPES.include?(t) }
111
- end
112
- node ? jsonld_to_markdown(node) : nil
113
- end
114
-
115
- # Collect every JSON-LD payload embedded in +html+, flattening
116
- # +@graph+ wrappers so callers see one flat array of schema.org
117
- # nodes. Malformed JSON blocks are silently skipped — sites
118
- # frequently ship broken JSON-LD and we only need at least one
119
- # parseable block.
120
- #
121
- # @param html [String] HTML document body
122
- # @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
123
- def self.parse_jsonld(html)
124
- doc = Nokogiri::HTML(html)
125
- blobs = doc.css('script[type="application/ld+json"]').map(&:text)
126
-
127
- blobs.flat_map do |raw|
128
- parsed = begin
129
- JSON.parse(raw)
130
- rescue JSON::ParserError
131
- nil
132
- end
133
- next [] unless parsed
134
-
135
- nodes = parsed.is_a?(Array) ? parsed : [parsed]
136
- nodes.flat_map { |n| n['@graph'].is_a?(Array) ? n['@graph'] : [n] }
137
- end
138
- end
139
-
140
- # Render a single JSON-LD +node+ as Markdown: a top-level title
141
- # from +name+/+headline+, a bullet list of common useful fields
142
- # (brand, SKU, price, rating, author, published date, ...), the
143
- # body copy, and the lead image.
144
- #
145
- # When the node carries +articleBody+ (the full publisher-supplied
146
- # article text), that wins over +description+ — the description
147
- # is typically a lede teaser and would just repeat the article's
148
- # opening lines.
149
- #
150
- # @param node [Hash] JSON-LD node, typically picked by
151
- # {.jsonld_section}
152
- # @return [String] Markdown representation
153
- def self.jsonld_to_markdown(node)
154
- out = +''
155
- name = node['name'] || node['headline']
156
- out << "# #{name}\n\n" if name
157
-
158
- offer = first_obj(node['offers'])
159
- rating = first_obj(node['aggregateRating'])
160
- brand = first_obj_or_string(node['brand'])
161
- author = first_obj_or_string(node['author'])
162
-
163
- brand_name = brand.is_a?(Hash) ? brand['name'] : brand
164
- author_name = author.is_a?(Hash) ? author['name'] : author
165
-
166
- fields = {
167
- 'Brand' => brand_name,
168
- 'SKU' => node['sku'],
169
- 'GTIN' => node['gtin13'] || node['gtin'],
170
- 'Price' => [offer['price'], offer['priceCurrency']].compact.join(' '),
171
- 'Availability' => offer['availability'],
172
- 'Rating' => rating['ratingValue'],
173
- 'Reviews' => rating['reviewCount'],
174
- 'Author' => author_name,
175
- 'Published' => node['datePublished']
176
- }.reject { |_, v| v.nil? || v.to_s.strip.empty? }
177
-
178
- unless fields.empty?
179
- fields.each { |k, v| out << "- **#{k}:** #{v}\n" }
180
- out << "\n"
181
- end
182
-
183
- if (body = node['articleBody'] || node['description'])
184
- out << "#{body}\n\n"
185
- end
186
-
187
- if (img = node['image'])
188
- img = img.first if img.is_a?(Array)
189
- img = img['url'] if img.is_a?(Hash)
190
- out << "![image](#{img})\n\n" if img
191
- end
192
-
193
- out
194
- end
195
-
196
- # Run +Readability+ over +html+ to isolate the main content node,
197
- # then convert that to Markdown via +reverse_markdown+. The page
198
- # +<title>+ is rendered as a top-level heading.
199
- #
200
- # When the page uses semantic HTML5 (+<main>+ or +<article>+) but
201
- # leaves most of its content outside +<p>+ tags — divs, lists,
202
- # spans — Readability's paragraph-density scoring collapses the
203
- # extraction to a sliver of the page. In that case we render the
204
- # +<main>+/+<article>+ container directly. The fallback only
205
- # fires when the container holds substantially more text than
206
- # Readability picked up (see {MAIN_FALLBACK_RATIO} /
207
- # {MAIN_FALLBACK_MIN_CHARS}); on pages where both agree we keep
208
- # Readability so its noise filtering still strips nav/ads/etc.
209
- #
210
- # @param html [String] HTML document body
211
- # @return [String] Markdown representation
212
- def self.readability_to_markdown(html)
213
- rdoc = Readability::Document.new(
214
- html,
215
- tags: READABILITY_TAGS,
216
- attributes: READABILITY_ATTRS,
217
- remove_empty_nodes: true
218
- )
219
- readability_html = rdoc.content
220
- title = rdoc.title
221
-
222
- body_html = main_fallback_html(html, readability_html) || readability_html
223
- body = ReverseMarkdown.convert(body_html, unknown_tags: :bypass, github_flavored: true)
224
-
225
- out = +''
226
- out << "# #{title.strip}\n\n" if title && !title.strip.empty?
227
- out << body
228
- out
229
- end
230
-
231
- # If +html+ has a +<main>+ or +<article>+ element holding
232
- # substantially more text than Readability extracted, return that
233
- # container's HTML so the caller can render it instead. Returns
234
- # +nil+ when the fallback should not fire — when there is no
235
- # semantic container, when it's too small to be meaningful, or
236
- # when Readability's output is already comparable.
237
- #
238
- # @param html [String] full HTML document body, used to locate
239
- # the +<main>+/+<article>+ container
240
- # @param readability_html [String] HTML produced by
241
- # +Readability::Document#content+, used as the comparison
242
- # baseline
243
- # @return [String, nil] container HTML when the fallback should
244
- # fire, +nil+ otherwise
245
- def self.main_fallback_html(html, readability_html)
246
- doc = Nokogiri::HTML(html)
247
- container = doc.at_css('main') || doc.at_css('article')
248
- return nil unless container
249
-
250
- container_text_len = container.text.gsub(/\s+/, ' ').strip.length
251
- return nil if container_text_len < MAIN_FALLBACK_MIN_CHARS
252
-
253
- readability_text_len = Nokogiri::HTML(readability_html).text.gsub(/\s+/, ' ').strip.length
254
- return nil if container_text_len < MAIN_FALLBACK_RATIO * readability_text_len
255
-
256
- container.to_html
257
- end
258
- private_class_method :main_fallback_html
259
-
260
- # JSON-LD fields can be a string, hash, or array of either.
261
- # Normalize to a single hash (the first one if it's a list) so
262
- # callers can +.dig+ safely.
263
- #
264
- # @param value [Object] raw JSON-LD field value
265
- # @return [Hash] empty hash when +value+ does not contain a hash
266
- def self.first_obj(value)
267
- value = value.first if value.is_a?(Array)
268
- value.is_a?(Hash) ? value : {}
269
- end
270
- private_class_method :first_obj
271
-
272
- # Same idea as {.first_obj} but preserves a bare string (e.g.
273
- # +brand: "Apple"+) instead of replacing it with +{}+.
274
- #
275
- # @param value [Object] raw JSON-LD field value
276
- # @return [String, Hash, nil]
277
- def self.first_obj_or_string(value)
278
- value = value.first if value.is_a?(Array)
279
- value
280
- end
281
- private_class_method :first_obj_or_string
282
- end
283
- end
284
- end
285
- end