RubyGems - pikuri-core - Versions diffs - 0.0.4 → 0.0.6 - Mend

pikuri-core 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

checksums.yaml +4 -4
data/lib/pikuri/agent/configurator.rb +9 -2
data/lib/pikuri/agent/context_window_detector.rb +70 -10
data/lib/pikuri/agent/control/interloper.rb +10 -2
data/lib/pikuri/agent/event.rb +15 -0
data/lib/pikuri/agent/extension.rb +37 -9
data/lib/pikuri/agent/listener/terminal.rb +22 -36
data/lib/pikuri/agent.rb +174 -73
data/lib/pikuri/extractor/html.rb +303 -0
data/lib/pikuri/extractor/passthrough.rb +64 -0
data/lib/pikuri/extractor.rb +314 -0
data/lib/pikuri/file_type.rb +87 -59
data/lib/pikuri/finalizers.rb +118 -0
data/lib/pikuri/paths.rb +29 -0
data/lib/pikuri/subprocess.rb +109 -12
data/lib/pikuri/tool/calculator.rb +213 -41
data/lib/pikuri/tool/fetch.rb +10 -9
data/lib/pikuri/tool/scraper.rb +186 -0
data/lib/pikuri/tool/web_scrape.rb +5 -5
data/lib/pikuri/version.rb +1 -1
data/lib/pikuri-core.rb +0 -1
metadata +8 -62
data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
data/lib/pikuri/tool/scraper/html.rb +0 -285
data/lib/pikuri/tool/scraper/pdf.rb +0 -54
data/lib/pikuri/tool/scraper/simple.rb +0 -183

data/lib/pikuri/tool/scraper.rb ADDED Viewed

@@ -0,0 +1,186 @@
+# frozen_string_literal: true
+require 'faraday'
+require 'stringio'
+require 'uri'
+module Pikuri
+  class Tool
+    # HTTP side of the web tools ({Tool::WEB_SCRAPE} and {Tool::FETCH}):
+    # GET the URL with a real-browser User-Agent, follow redirects, and
+    # hand the response body to {Pikuri::Extractor.extract} with the
+    # response's +Content-Type+ as the hint. HTML/XHTML render via
+    # {Extractor::HTML}, any other +text/*+ type passes through
+    # verbatim, and plug-in extractors extend the set (with pikuri-pdf
+    # registered, +application/pdf+ extracts — by header or by +%PDF-+
+    # magic, so a PDF served under a lying header still works); the
+    # remaining types raise {FetchError} so the LLM observes the
+    # failure instead of receiving an empty rendering.
+    #
+    # Split into a thin HTTP fetch ({.fetch}) and the extraction
+    # wrapper ({.visit}) so tests can drive each piece in isolation and
+    # {Tool::Fetch} can reuse the HTTP half without the extraction
+    # pass. Nothing here knows about the LLM; the tools that wrap this
+    # module own caching and truncation and turn rendered Markdown (or
+    # {FetchError}) into the next observation.
+    module Scraper
+      # Raised when a URL cannot be rendered into Markdown text — HTTP
+      # non-2xx, network failure, redirect-loop, missing +Location+,
+      # unsupported content-type, or a parse failure that reads as "try
+      # a different URL" to the LLM. Catching this in
+      # {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the failure into an
+      # +"Error: ..."+ observation; anything else bubbles up so genuine
+      # bugs stay visible.
+      class FetchError < StandardError; end
+      # @return [String] User-Agent sent with each request; many sites
+      #   reject requests with no UA or an obvious bot UA
+      USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
+                   '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+      # @return [String] +Accept+ header sent with each request, so
+      #   servers that content-negotiate hand back something we can use:
+      #   rendered HTML first, +application/pdf+ for hosts with a PDF
+      #   extractor registered, then any +text/*+ for the verbatim
+      #   pass-through arm.
+      ACCEPT = 'text/html,application/xhtml+xml,application/pdf,text/*;q=0.8'
+      # @return [Integer] maximum number of HTTP redirects to follow
+      #   before giving up
+      MAX_REDIRECTS = 5
+      # @return [Integer] connect timeout in seconds for the underlying
+      #   Faraday request
+      OPEN_TIMEOUT = 10
+      # @return [Integer] read timeout in seconds for the underlying
+      #   Faraday request
+      READ_TIMEOUT = 20
+      # @return [Integer] maximum number of characters of an error
+      #   response body to include in a {FetchError} message. The body is
+      #   often a multi-kilobyte HTML challenge page (Cloudflare, WAF
+      #   interstitial, etc.); a short excerpt tells the LLM what kind of
+      #   page came back without flooding the next observation.
+      ERROR_BODY_EXCERPT = 200
+      # Result of a successful {Scraper.fetch}: the response body, the
+      # normalized content-type (lower-cased, with any +; charset=...+
+      # parameters stripped), and the final URL after redirects.
+      Fetched = Data.define(:body, :content_type, :url)
+      # Fetch +url+ and render its main content as Markdown.
+      #
+      # No caching here — every call hits the network. Callers that want
+      # to memoize results should wrap this method themselves (see
+      # {Tool::WebScrape.visit}, which does exactly that).
+      #
+      # The extracted output is +String#strip+'d so the LLM never sees
+      # a body that opens or closes with blank lines — common with
+      # extracted PDFs' page-feed whitespace and with text bodies that
+      # carry a trailing newline. Interior whitespace is preserved
+      # because Markdown paragraph breaks and source-code indentation
+      # are load-bearing.
+      #
+      # @param url [String] absolute HTTP(S) URL of the page to download
+      # @return [String] full Markdown representation of the page with
+      #   leading/trailing whitespace trimmed, uncapped otherwise —
+      #   caller is responsible for any size limiting before feeding
+      #   the result back to the LLM
+      # @raise [FetchError] on HTTP non-2xx, network failure, redirect
+      #   loop, a 3xx without a +Location+ header, a response no
+      #   extractor recognizes, or an extraction failure (malformed
+      #   PDF, ...)
+      def self.visit(url)
+        extract(fetch(url)).strip
+      end
+      # Render a {Fetched} response as Markdown via
+      # {Pikuri::Extractor.extract}, re-raising both extraction failure
+      # modes as {FetchError} — the single exception type the web tools
+      # rescue. The content-type is passed verbatim (including the +""+
+      # of a missing header, which matches no text arm — a body without
+      # transport metadata is refused, not sniffed; only a strong magic
+      # sniff like pikuri-pdf's +%PDF-+ overrides a wrong or missing
+      # header, because such a sniff never misfires on text).
+      #
+      # @param fetched [Fetched]
+      # @return [String] Markdown representation produced by the
+      #   matched extractor
+      # @raise [FetchError] when no extractor matches the response's
+      #   content-type, or when extraction fails
+      def self.extract(fetched)
+        Pikuri::Extractor.extract(StringIO.new(fetched.body), content_type: fetched.content_type)
+      rescue Pikuri::Extractor::Unsupported
+        raise FetchError, "unsupported content-type #{fetched.content_type.inspect} for #{fetched.url}"
+      rescue Pikuri::Extractor::Error => e
+        raise FetchError, e.message
+      end
+      # Download the body of +url+, manually following up to
+      # {MAX_REDIRECTS} redirects. Faraday is configured with no
+      # middleware so behavior here mirrors the rest of the codebase
+      # (see +Tool::Search::DuckDuckGo.search+).
+      #
+      # All recoverable failures — HTTP 4xx/5xx, +Faraday::Error+ network
+      # blips, exhausted redirect budget, 3xx without a +Location+ —
+      # surface as {FetchError} so the caller has a single exception type
+      # to rescue. Error bodies are trimmed to {ERROR_BODY_EXCERPT}
+      # characters with whitespace collapsed, so a Cloudflare-challenge
+      # response doesn't dump kilobytes of inline HTML into the next LLM
+      # observation.
+      #
+      # @param url [String] absolute HTTP(S) URL to fetch
+      # @param limit [Integer] redirects remaining; recurses with
+      #   +limit - 1+ on each 3xx
+      # @return [Fetched] body, normalized content-type, and final URL
+      #   after redirects
+      # @raise [FetchError] on non-2xx/3xx responses, network errors,
+      #   redirect-loop exhaustion, or 3xx without a +Location+ header
+      def self.fetch(url, limit: MAX_REDIRECTS)
+        raise FetchError, "too many redirects fetching #{url}" if limit.zero?
+        response = begin
+          Faraday.new(request: { open_timeout: OPEN_TIMEOUT, timeout: READ_TIMEOUT }).get(url) do |req|
+            req.headers['User-Agent'] = USER_AGENT
+            req.headers['Accept']     = ACCEPT
+          end
+        rescue Faraday::Error => e
+          raise FetchError, "#{e.class.name.split('::').last} fetching #{url}: #{e.message}"
+        end
+        case response.status
+        when 200..299
+          Fetched.new(body: response.body, content_type: normalize_content_type(response.headers['content-type']), url: url)
+        when 300..399
+          location = response.headers['location']
+          raise FetchError, "HTTP #{response.status} from #{url} with no Location header" if location.nil? || location.empty?
+          fetch(URI.join(url, location).to_s, limit: limit - 1)
+        else
+          raise FetchError, "HTTP #{response.status} fetching #{url}: #{excerpt(response.body)}"
+        end
+      end
+      # Lower-case +raw+ and strip any +; charset=...+ parameters so the
+      # extractors can match on a canonical token.
+      #
+      # @param raw [String, nil] raw +Content-Type+ header value
+      # @return [String] normalized content-type, or +""+ when the
+      #   header was missing
+      def self.normalize_content_type(raw)
+        raw.to_s.split(';').first.to_s.strip.downcase
+      end
+      private_class_method :normalize_content_type
+      # Whitespace-collapse +body+ and clip to {ERROR_BODY_EXCERPT}
+      # characters, so the {FetchError} message stays a single readable
+      # line even when the server returned a multi-KB HTML challenge
+      # page.
+      #
+      # @param body [String, nil]
+      # @return [String]
+      def self.excerpt(body)
+        text = body.to_s.gsub(/\s+/, ' ').strip
+        text.length > ERROR_BODY_EXCERPT ? "#{text[0, ERROR_BODY_EXCERPT]}..." : text
+      end
+      private_class_method :excerpt
+    end
+  end
+end

data/lib/pikuri/tool/web_scrape.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 module Pikuri
   class Tool
     # Truncation policy and Tool spec for the +web_scrape+ tool. The actual
-    # scraping lives in {Tool::Scraper::Simple}; this module is a thin
+    # scraping lives in {Tool::Scraper}; this module is a thin
     # wrapper that picks the scraper, applies a character cap so the LLM
     # doesn't drown in long-form content, and exposes the result to the
     # agent loop in OpenAI tool-call shape.
@@ -37,7 +37,7 @@ module Pikuri
         CACHE
       end
-      # Fetch +url+ via {Tool::Scraper::Simple} and truncate the rendered
+      # Fetch +url+ via {Tool::Scraper} and truncate the rendered
       # Markdown to +max_chars+ characters.
       #
       # The full extracted Markdown is cached on disk via {.cache}, keyed
@@ -65,7 +65,7 @@ module Pikuri
       #   truncated, or +"Error: ..."+ on a recoverable fetch failure
       def self.visit(url, max_chars: DEFAULT_MAX_CHARS)
         max_chars = max_chars.clamp(1, MAX_MAX_CHARS)
-        markdown = cache.fetch(url) { Scraper::Simple.visit(url) }
+        markdown = cache.fetch(url) { Scraper.visit(url) }
         truncate(markdown, max_chars)
       rescue Scraper::FetchError => e
         "Error: #{e.message}"
@@ -95,10 +95,10 @@ module Pikuri
     WEB_SCRAPE = new(
       name: 'web_scrape',
       description: <<~DESC,
-        Scrapes the rendered webpage, PDF, or text file at the given URL and returns its main content as Markdown.
+        Scrapes the rendered webpage or text file at the given URL and returns its main content as Markdown.
         Usage:
-        - Use for HTML pages or PDFs where you want readable content — readability extraction strips nav, sidebars, and boilerplate.
+        - Use for HTML pages where you want readable content — readability extraction strips nav, sidebars, and boilerplate.
         - For raw textual payloads (JSON, CSV, robots.txt, source files), use fetch instead — it returns bytes verbatim, while web_scrape would corrupt them with a Markdown pass.
         - A Single Page App may return very little or no content. Do NOT retry with a larger max_chars; try a different URL instead.
       DESC

data/lib/pikuri/version.rb CHANGED Viewed

@@ -6,5 +6,5 @@ module Pikuri
   # additions to the public surface (+Pikuri::Tool+ / +Pikuri::Agent+ /
   # listeners / bundled tools), major for breaking changes to that
   # surface or to the +bin/pikuri-*+ CLIs.
-  VERSION = '0.0.4'
+  VERSION = '0.0.6'
 end

data/lib/pikuri-core.rb CHANGED Viewed

@@ -169,7 +169,6 @@ module Pikuri
   Loader.ignore(File.expand_path('pikuri/version.rb', __dir__))
   Loader.inflector.inflect(
     'html'       => 'HTML',
-    'pdf'        => 'PDF',
     'duckduckgo' => 'DuckDuckGo'
   )
   Loader.setup

metadata CHANGED Viewed

@@ -1,29 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: pikuri-core
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.6
 platform: ruby
 authors:
 - Martin Vysny
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2026-05-29 00:00:00.000000000 Z
+date: 2026-06-04 00:00:00.000000000 Z
 dependencies:
-- !ruby/object:Gem::Dependency
-  name: dentaku
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '3.5'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '3.5'
 - !ruby/object:Gem::Dependency
   name: faraday
   requirement: !ruby/object:Gem::Requirement
@@ -52,20 +38,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.19'
-- !ruby/object:Gem::Dependency
-  name: pdf-reader
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '2.15'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '2.15'
 - !ruby/object:Gem::Dependency
   name: rainbow
   requirement: !ruby/object:Gem::Requirement
@@ -122,34 +94,6 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '1.15'
-- !ruby/object:Gem::Dependency
-  name: tsort
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '0.2'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '0.2'
-- !ruby/object:Gem::Dependency
-  name: tty-markdown
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '0.7'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - "~>"
-      - !ruby/object:Gem::Version
-        version: '0.7'
 - !ruby/object:Gem::Dependency
   name: zeitwerk
   requirement: !ruby/object:Gem::Requirement
@@ -199,16 +143,18 @@ files:
 - lib/pikuri/agent/listener/token_log.rb
 - lib/pikuri/agent/listener_list.rb
 - lib/pikuri/agent/synthesizer.rb
+- lib/pikuri/extractor.rb
+- lib/pikuri/extractor/html.rb
+- lib/pikuri/extractor/passthrough.rb
 - lib/pikuri/file_type.rb
+- lib/pikuri/finalizers.rb
+- lib/pikuri/paths.rb
 - lib/pikuri/subprocess.rb
 - lib/pikuri/tool.rb
 - lib/pikuri/tool/calculator.rb
 - lib/pikuri/tool/fetch.rb
 - lib/pikuri/tool/parameters.rb
-- lib/pikuri/tool/scraper/fetch_error.rb
-- lib/pikuri/tool/scraper/html.rb
-- lib/pikuri/tool/scraper/pdf.rb
-- lib/pikuri/tool/scraper/simple.rb
+- lib/pikuri/tool/scraper.rb
 - lib/pikuri/tool/search/brave.rb
 - lib/pikuri/tool/search/duckduckgo.rb
 - lib/pikuri/tool/search/engines.rb

data/lib/pikuri/tool/scraper/fetch_error.rb DELETED Viewed

@@ -1,16 +0,0 @@
-# frozen_string_literal: true
-module Pikuri
-  class Tool
-    module Scraper
-      # Raised by anything in the scraper stack when a URL cannot be
-      # rendered into Markdown text — HTTP non-2xx, network failure,
-      # redirect-loop, missing +Location+, unsupported content-type, or a
-      # parse failure that reads as "try a different URL" to the LLM.
-      # Catching this in {Tool::WEB_SCRAPE} / {Tool::FETCH} turns the
-      # failure into an +"Error: ..."+ observation; anything else bubbles
-      # up so genuine bugs stay visible.
-      class FetchError < StandardError; end
-    end
-  end
-end

data/lib/pikuri/tool/scraper/html.rb DELETED Viewed

@@ -1,285 +0,0 @@
-# frozen_string_literal: true
-require 'json'
-require 'nokogiri'
-require 'readability'
-require 'reverse_markdown'
-module Pikuri
-  class Tool
-    module Scraper
-      # HTML → Markdown extractor used by {Simple.visit} when the fetched
-      # response carries an HTML content-type.
-      #
-      # Always renders both views of the page when available:
-      #
-      # 1. JSON-LD section. Any +<script type="application/ld+json">+ node
-      #    whose +@type+ matches a substantive schema.org content type
-      #    (Product, Article, Recipe, ...) is rendered as a header — title,
-      #    metadata bullets (brand, SKU, price, rating, author, published),
-      #    and the +articleBody+/+description+ copy when present.
-      # 2. Readability section. The page is run through +Readability+ +
-      #    +reverse_markdown+, with a +<main>+/+<article>+ fallback for
-      #    pages whose content sits mostly outside +<p>+ tags.
-      #
-      # Concatenated with a horizontal rule, so the LLM gets both the
-      # structured metadata and the rendered body and can pick whichever
-      # is more useful for the task. Trades some duplication (when a
-      # publisher embeds the article body in JSON-LD AND in HTML) for
-      # fewer type-based heuristics on which branch should win — the
-      # earlier "is this Article's +description+ a teaser or the real
-      # body?" carve-out is no longer needed because both end up in
-      # the output regardless.
-      #
-      # Pure parser — no I/O. {.extract} takes an HTML string and returns
-      # Markdown, so tests can drive it against fixture HTML without a
-      # network round-trip.
-      module HTML
-        # @return [Array<String>] schema.org +@type+ values that we treat
-        #   as "the primary entity of this page" when picking a JSON-LD
-        #   node to render. Order does not matter — the first matching
-        #   node wins. Skips noise nodes (Organization, BreadcrumbList,
-        #   WebSite, ...) that ship on most pages but carry no page
-        #   content.
-        INTERESTING_TYPES = %w[
-          Product Article NewsArticle BlogPosting Recipe Event Book Movie
-        ].freeze
-        # @return [Array<String>] HTML tags preserved by the readability
-        #   pass. Anything outside this list is stripped before Markdown
-        #   conversion.
-        READABILITY_TAGS = %w[
-          h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
-          strong em b i br hr table thead tbody tr td th
-        ].freeze
-        # @return [Array<String>] HTML attributes preserved by the
-        #   readability pass; everything else (class, id, style, data-*)
-        #   is dropped before Markdown conversion
-        READABILITY_ATTRS = %w[href src alt title].freeze
-        # @return [Float] minimum +<main>+/+<article>+ to Readability
-        #   text-length ratio that triggers the semantic-container
-        #   fallback in {.readability_to_markdown}. Picked low enough to
-        #   catch the failure mode (Readability collapsing a page that
-        #   uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
-        #   ~5x) but high enough that pages where both produce
-        #   comparable output keep Readability's noise filtering.
-        MAIN_FALLBACK_RATIO = 2.0
-        # @return [Integer] minimum text length the
-        #   +<main>+/+<article>+ container must hold before the fallback
-        #   in {.readability_to_markdown} can fire. Below this, the
-        #   ratio comparison is dominated by noise and we'd swap on
-        #   tiny pages where Readability is doing the right thing.
-        MAIN_FALLBACK_MIN_CHARS = 500
-        # Render +html+ as Markdown by emitting both the JSON-LD section
-        # (when an interesting node is present) and the readability /
-        # +<main>+ section, joined by a horizontal rule. Either section
-        # may be missing — pages with no JSON-LD return only the
-        # readability output, and a malformed page with no extractable
-        # body returns only the JSON-LD render.
-        #
-        # @param html [String] HTML document body
-        # @return [String] Markdown representation
-        def self.extract(html)
-          sections = [jsonld_section(html), readability_to_markdown(html)]
-          sections.reject! { |s| s.nil? || s.strip.empty? }
-          sections.join("\n\n---\n\n")
-        end
-        # Pick the first JSON-LD node whose +@type+ matches one of
-        # {INTERESTING_TYPES} and render it as Markdown. Returns +nil+
-        # when no such node exists, in which case {.extract} emits only
-        # the readability section.
-        #
-        # No content-field gating: a node carrying just +name+/+author+/
-        # +datePublished+ still renders (as a metadata-only header),
-        # because the readability pass independently produces the page
-        # body. That is the trade-off that lets us drop the type-based
-        # "is this teaser or article copy?" heuristics — duplication is
-        # acceptable when both views are available, and the LLM can
-        # pick whichever it needs.
-        #
-        # @param html [String] HTML document body
-        # @return [String, nil] Markdown render of the picked JSON-LD
-        #   node, or +nil+ when nothing matched
-        def self.jsonld_section(html)
-          node = parse_jsonld(html).find do |n|
-            Array(n['@type']).any? { |t| INTERESTING_TYPES.include?(t) }
-          end
-          node ? jsonld_to_markdown(node) : nil
-        end
-        # Collect every JSON-LD payload embedded in +html+, flattening
-        # +@graph+ wrappers so callers see one flat array of schema.org
-        # nodes. Malformed JSON blocks are silently skipped — sites
-        # frequently ship broken JSON-LD and we only need at least one
-        # parseable block.
-        #
-        # @param html [String] HTML document body
-        # @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
-        def self.parse_jsonld(html)
-          doc = Nokogiri::HTML(html)
-          blobs = doc.css('script[type="application/ld+json"]').map(&:text)
-          blobs.flat_map do |raw|
-            parsed = begin
-              JSON.parse(raw)
-            rescue JSON::ParserError
-              nil
-            end
-            next [] unless parsed
-            nodes = parsed.is_a?(Array) ? parsed : [parsed]
-            nodes.flat_map { |n| n['@graph'].is_a?(Array) ? n['@graph'] : [n] }
-          end
-        end
-        # Render a single JSON-LD +node+ as Markdown: a top-level title
-        # from +name+/+headline+, a bullet list of common useful fields
-        # (brand, SKU, price, rating, author, published date, ...), the
-        # body copy, and the lead image.
-        #
-        # When the node carries +articleBody+ (the full publisher-supplied
-        # article text), that wins over +description+ — the description
-        # is typically a lede teaser and would just repeat the article's
-        # opening lines.
-        #
-        # @param node [Hash] JSON-LD node, typically picked by
-        #   {.jsonld_section}
-        # @return [String] Markdown representation
-        def self.jsonld_to_markdown(node)
-          out = +''
-          name = node['name'] || node['headline']
-          out << "# #{name}\n\n" if name
-          offer  = first_obj(node['offers'])
-          rating = first_obj(node['aggregateRating'])
-          brand  = first_obj_or_string(node['brand'])
-          author = first_obj_or_string(node['author'])
-          brand_name  = brand.is_a?(Hash)  ? brand['name']  : brand
-          author_name = author.is_a?(Hash) ? author['name'] : author
-          fields = {
-            'Brand'        => brand_name,
-            'SKU'          => node['sku'],
-            'GTIN'         => node['gtin13'] || node['gtin'],
-            'Price'        => [offer['price'], offer['priceCurrency']].compact.join(' '),
-            'Availability' => offer['availability'],
-            'Rating'       => rating['ratingValue'],
-            'Reviews'      => rating['reviewCount'],
-            'Author'       => author_name,
-            'Published'    => node['datePublished']
-          }.reject { |_, v| v.nil? || v.to_s.strip.empty? }
-          unless fields.empty?
-            fields.each { |k, v| out << "- **#{k}:** #{v}\n" }
-            out << "\n"
-          end
-          if (body = node['articleBody'] || node['description'])
-            out << "#{body}\n\n"
-          end
-          if (img = node['image'])
-            img = img.first if img.is_a?(Array)
-            img = img['url'] if img.is_a?(Hash)
-            out << "![image](#{img})\n\n" if img
-          end
-          out
-        end
-        # Run +Readability+ over +html+ to isolate the main content node,
-        # then convert that to Markdown via +reverse_markdown+. The page
-        # +<title>+ is rendered as a top-level heading.
-        #
-        # When the page uses semantic HTML5 (+<main>+ or +<article>+) but
-        # leaves most of its content outside +<p>+ tags — divs, lists,
-        # spans — Readability's paragraph-density scoring collapses the
-        # extraction to a sliver of the page. In that case we render the
-        # +<main>+/+<article>+ container directly. The fallback only
-        # fires when the container holds substantially more text than
-        # Readability picked up (see {MAIN_FALLBACK_RATIO} /
-        # {MAIN_FALLBACK_MIN_CHARS}); on pages where both agree we keep
-        # Readability so its noise filtering still strips nav/ads/etc.
-        #
-        # @param html [String] HTML document body
-        # @return [String] Markdown representation
-        def self.readability_to_markdown(html)
-          rdoc = Readability::Document.new(
-            html,
-            tags: READABILITY_TAGS,
-            attributes: READABILITY_ATTRS,
-            remove_empty_nodes: true
-          )
-          readability_html = rdoc.content
-          title = rdoc.title
-          body_html = main_fallback_html(html, readability_html) || readability_html
-          body = ReverseMarkdown.convert(body_html, unknown_tags: :bypass, github_flavored: true)
-          out = +''
-          out << "# #{title.strip}\n\n" if title && !title.strip.empty?
-          out << body
-          out
-        end
-        # If +html+ has a +<main>+ or +<article>+ element holding
-        # substantially more text than Readability extracted, return that
-        # container's HTML so the caller can render it instead. Returns
-        # +nil+ when the fallback should not fire — when there is no
-        # semantic container, when it's too small to be meaningful, or
-        # when Readability's output is already comparable.
-        #
-        # @param html [String] full HTML document body, used to locate
-        #   the +<main>+/+<article>+ container
-        # @param readability_html [String] HTML produced by
-        #   +Readability::Document#content+, used as the comparison
-        #   baseline
-        # @return [String, nil] container HTML when the fallback should
-        #   fire, +nil+ otherwise
-        def self.main_fallback_html(html, readability_html)
-          doc = Nokogiri::HTML(html)
-          container = doc.at_css('main') || doc.at_css('article')
-          return nil unless container
-          container_text_len = container.text.gsub(/\s+/, ' ').strip.length
-          return nil if container_text_len < MAIN_FALLBACK_MIN_CHARS
-          readability_text_len = Nokogiri::HTML(readability_html).text.gsub(/\s+/, ' ').strip.length
-          return nil if container_text_len < MAIN_FALLBACK_RATIO * readability_text_len
-          container.to_html
-        end
-        private_class_method :main_fallback_html
-        # JSON-LD fields can be a string, hash, or array of either.
-        # Normalize to a single hash (the first one if it's a list) so
-        # callers can +.dig+ safely.
-        #
-        # @param value [Object] raw JSON-LD field value
-        # @return [Hash] empty hash when +value+ does not contain a hash
-        def self.first_obj(value)
-          value = value.first if value.is_a?(Array)
-          value.is_a?(Hash) ? value : {}
-        end
-        private_class_method :first_obj
-        # Same idea as {.first_obj} but preserves a bare string (e.g.
-        # +brand: "Apple"+) instead of replacing it with +{}+.
-        #
-        # @param value [Object] raw JSON-LD field value
-        # @return [String, Hash, nil]
-        def self.first_obj_or_string(value)
-          value = value.first if value.is_a?(Array)
-          value
-        end
-        private_class_method :first_obj_or_string
-      end
-    end
-  end
-end