RubyGems - html2rss - Versions diffs - 0.16.0 → 0.18.0 - Mend

html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

checksums.yaml +4 -4
data/README.md +48 -657
data/exe/html2rss +1 -1
data/html2rss.gemspec +7 -4
data/lib/html2rss/articles/deduplicator.rb +49 -0
data/lib/html2rss/auto_source/cleanup.rb +33 -5
data/lib/html2rss/auto_source/scraper/html.rb +118 -43
data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
data/lib/html2rss/auto_source/scraper.rb +142 -8
data/lib/html2rss/auto_source.rb +119 -47
data/lib/html2rss/blocked_surface.rb +64 -0
data/lib/html2rss/category_extractor.rb +82 -0
data/lib/html2rss/cli.rb +170 -23
data/lib/html2rss/config/class_methods.rb +189 -0
data/lib/html2rss/config/dynamic_params.rb +68 -0
data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
data/lib/html2rss/config/request_headers.rb +130 -0
data/lib/html2rss/config/schema.rb +208 -0
data/lib/html2rss/config/validator.rb +108 -0
data/lib/html2rss/config.rb +112 -61
data/lib/html2rss/error.rb +6 -0
data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
data/lib/html2rss/html_extractor.rb +136 -0
data/lib/html2rss/html_navigator.rb +46 -0
data/lib/html2rss/json_feed_builder/item.rb +94 -0
data/lib/html2rss/json_feed_builder.rb +58 -0
data/lib/html2rss/rendering/audio_renderer.rb +31 -0
data/lib/html2rss/rendering/description_builder.rb +88 -0
data/lib/html2rss/rendering/image_renderer.rb +31 -0
data/lib/html2rss/rendering/media_renderer.rb +33 -0
data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
data/lib/html2rss/rendering/video_renderer.rb +31 -0
data/lib/html2rss/rendering.rb +14 -0
data/lib/html2rss/request_controls.rb +128 -0
data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
data/lib/html2rss/request_service/budget.rb +39 -0
data/lib/html2rss/request_service/context.rb +64 -20
data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
data/lib/html2rss/request_service/policy.rb +248 -0
data/lib/html2rss/request_service/puppet_commander.rb +212 -13
data/lib/html2rss/request_service/response.rb +42 -2
data/lib/html2rss/request_service/response_guard.rb +62 -0
data/lib/html2rss/request_service.rb +31 -15
data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
data/lib/html2rss/request_session/runtime_input.rb +57 -0
data/lib/html2rss/request_session/runtime_policy.rb +76 -0
data/lib/html2rss/request_session.rb +118 -0
data/lib/html2rss/rss_builder/article.rb +166 -0
data/lib/html2rss/rss_builder/channel.rb +96 -11
data/lib/html2rss/rss_builder/enclosure.rb +48 -0
data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
data/lib/html2rss/rss_builder.rb +72 -71
data/lib/html2rss/selectors/config.rb +122 -0
data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
data/lib/html2rss/selectors/extractors/href.rb +53 -0
data/lib/html2rss/selectors/extractors/html.rb +48 -0
data/lib/html2rss/selectors/extractors/static.rb +41 -0
data/lib/html2rss/selectors/extractors/text.rb +46 -0
data/lib/html2rss/selectors/extractors.rb +52 -0
data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
data/lib/html2rss/selectors/post_processors/base.rb +74 -0
data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
data/lib/html2rss/selectors/post_processors/template.rb +73 -0
data/lib/html2rss/selectors/post_processors.rb +43 -0
data/lib/html2rss/selectors.rb +294 -0
data/lib/html2rss/url.rb +262 -0
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +129 -70
data/lib/tasks/config_schema.rake +17 -0
data/schema/html2rss-config.schema.json +469 -0
metadata +120 -46
data/lib/html2rss/attribute_post_processors/base.rb +0 -74
data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
data/lib/html2rss/attribute_post_processors/template.rb +0 -101
data/lib/html2rss/attribute_post_processors.rb +0 -44
data/lib/html2rss/auto_source/article.rb +0 -127
data/lib/html2rss/auto_source/channel.rb +0 -78
data/lib/html2rss/auto_source/reducer.rb +0 -48
data/lib/html2rss/auto_source/rss_builder.rb +0 -70
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
data/lib/html2rss/config/channel.rb +0 -125
data/lib/html2rss/config/selectors.rb +0 -103
data/lib/html2rss/item.rb +0 -186
data/lib/html2rss/item_extractors/attribute.rb +0 -50
data/lib/html2rss/item_extractors/href.rb +0 -52
data/lib/html2rss/item_extractors/html.rb +0 -46
data/lib/html2rss/item_extractors/static.rb +0 -39
data/lib/html2rss/item_extractors/text.rb +0 -44
data/lib/html2rss/item_extractors.rb +0 -88
data/lib/html2rss/object_to_xml_converter.rb +0 -56
data/lib/html2rss/rss_builder/item.rb +0 -83
data/lib/html2rss/utils.rb +0 -113

data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb ADDED Viewed

@@ -0,0 +1,199 @@
+# frozen_string_literal: true
+module Html2rss
+  class AutoSource
+    module Scraper
+      class SemanticHtml
+        ##
+        # Selects the best content-like anchor from a semantic container.
+        #
+        # The selector turns raw DOM anchors into ranked facts so semantic
+        # scraping can reason about link intent instead of DOM order. It favors
+        # heading-aligned article links and suppresses utility links, duplicate
+        # destinations, and weak textless affordances.
+        class AnchorSelector # rubocop:disable Metrics/ClassLength
+          AnchorFacts = Data.define( # immutable value object holding the ranked facts for one anchor
+            :anchor,
+            :text,
+            :url,
+            :destination,
+            :segments,
+            :meaningful_text,
+            :content_like_destination,
+            :heading_anchor,
+            :heading_text_match,
+            :score
+          )
+          HEADING_SELECTOR = HtmlExtractor::HEADING_TAGS.join(',').freeze # comma-joined for use with at_css
+          UTILITY_PATH_SEGMENTS = %w[
+            about account author category comment comments contact feedback help
+            login newsletter profile register search settings share signup subscribe
+            topic topics view-all archive archives
+            feed feeds
+            recommended
+            for-you
+            preference preferences
+            notification notifications
+            privacy terms
+            cookie cookies
+            logout
+            user users
+          ].to_set.freeze # a destination containing any of these path segments is rejected
+          CONTENT_PATH_SEGMENTS = %w[
+            article articles news post posts story stories update updates
+          ].to_set.freeze # a destination containing any of these path segments counts as content-like
+          UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze # anchors nested inside these elements are rejected
+          def initialize(base_url) # base_url: base against which anchor hrefs are resolved
+            @base_url = base_url
+          end
+          ##
+          # Chooses the single highest-scoring anchor that best represents the
+          # story contained in a semantic block.
+          #
+          # Ranking is scoped to one container at a time. That keeps the logic
+          # local, makes duplicate links to the same destination collapse into
+          # one candidate, and avoids page-wide heuristics leaking across cards.
+          #
+          # @param container [Nokogiri::XML::Element] semantic container being evaluated
+          # @return [Nokogiri::XML::Element, nil] selected primary anchor or nil when none qualify
+          def primary_anchor_for(container)
+            facts_for(container).max_by(&:score)&.anchor # max_by is nil for an empty list, hence &.
+          end
+          private
+          attr_reader :base_url
+          def facts_for(container) # returns one AnchorFacts per distinct destination in container
+            heading = heading_for(container)
+            heading_text = visible_text(heading) # computed once and shared by every anchor below
+            container.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).each_with_object({}) do |anchor, best_by_destination|
+              next if anchor.path.match?(Html::TAGS_TO_IGNORE) # skip anchors whose node path hits an ignored tag
+              facts = build_facts(anchor, heading, heading_text)
+              next unless facts # build_facts returns nil for rejected anchors
+              keep_stronger_fact(best_by_destination, facts)
+            end.values
+          end
+          def build_facts(anchor, heading, heading_text) # rubocop:disable Metrics/MethodLength
+            text = visible_text(anchor)
+            meaningful_text = meaningful_text?(text)
+            ancestors = anchor.ancestors.to_a
+            url = normalized_destination(anchor)
+            return unless url # href was empty or could not be turned into a Url
+            segments = url.path_segments
+            content_like_destination = content_like_destination?(segments)
+            return if ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments) # hard rejects
+            heading_anchor = heading_anchor?(ancestors, heading)
+            heading_text_match = heading_text_match?(heading_text, text, meaningful_text)
+            return unless heading_anchor || content_like_anchor?(meaningful_text, content_like_destination) # no signal
+            AnchorFacts.new(
+              anchor:,
+              text:,
+              url:,
+              destination: url.to_s, # string form; used as the dedup key in keep_stronger_fact
+              segments:,
+              meaningful_text:,
+              content_like_destination:,
+              heading_anchor:,
+              heading_text_match:,
+              score: score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
+            )
+          end
+          def ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments) # any single rule rejects
+            utility_destination?(segments) ||
+              utility_text?(text) ||
+              icon_only_anchor?(anchor, meaningful_text) ||
+              utility_landmark_anchor?(ancestors)
+          end
+          def keep_stronger_fact(best_by_destination, facts) # keeps one fact per destination string
+            current = best_by_destination[facts.destination]
+            return best_by_destination[facts.destination] = facts unless current # first sighting of this destination
+            return if current.score >= facts.score # ties keep the anchor that was seen first
+            best_by_destination[facts.destination] = facts
+          end
+          def content_like_anchor?(meaningful_text, content_like_destination) # either signal alone is enough
+            meaningful_text || content_like_destination
+          end
+          def score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
+            score = 0 # additive rank; the highest total wins in primary_anchor_for
+            score += 100 if heading_anchor # outweighs the other three signals combined (max 40)
+            score += 20 if heading_text_match
+            score += 10 if meaningful_text
+            score += 10 if content_like_destination
+            score
+          end
+          def heading_anchor?(ancestors, heading) # truthy when the anchor is nested inside the heading
+            heading && ancestors.include?(heading) # yields nil (falsy) when the container has no heading
+          end
+          def heading_text_match?(heading_text, text, meaningful_text) # exact match of two non-trivial texts
+            meaningful_text && meaningful_text?(heading_text) && heading_text == text
+          end
+          def heading_for(container) # first element matching HEADING_SELECTOR, or nil
+            container.at_css(HEADING_SELECTOR)
+          end
+          def icon_only_anchor?(anchor, meaningful_text) # wraps an img/svg and has no alphanumeric text
+            !meaningful_text && anchor.at_css('img, svg')
+          end
+          def utility_destination?(segments) # a destination with no path segments also counts as utility
+            segments.empty? || segments.any? { |segment| UTILITY_PATH_SEGMENTS.include?(segment) }
+          end
+          def content_like_destination?(segments) # content keyword, or a segment that starts with a digit
+            segments.any? do |segment|
+              CONTENT_PATH_SEGMENTS.include?(segment) || segment.match?(/\A\d[\w-]*\z/)
+            end
+          end
+          def normalized_destination(anchor) # href without fragment, resolved against base_url; nil if unusable
+            href = anchor['href'].to_s.split('#').first.to_s.strip
+            return if href.empty? # missing href or fragment-only link
+            Html2rss::Url.from_relative(href, base_url)
+          rescue ArgumentError # NOTE(review): presumably raised by Url.from_relative for malformed hrefs — confirm
+            nil
+          end
+          def meaningful_text?(text) # true when text holds at least one letter or digit
+            text.scan(/\p{Alnum}+/).any?
+          end
+          def utility_text?(text) # text starts with a known utility phrase (case-insensitive)
+            text.match?(
+              /\A(about|contact|log in|login|sign up|signup|share|comments?|view all|recommended for you|subscribe)\b/i
+            )
+          end
+          def utility_landmark_anchor?(ancestors) # any ancestor is one of UTILITY_LANDMARK_TAGS
+            ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
+          end
+          def visible_text(node) # stripped visible text of node
+            return '' unless node # callers pass nil when the container has no heading
+            HtmlExtractor.extract_visible_text(node).to_s.strip
+          end
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/auto_source/scraper/semantic_html.rb CHANGED Viewed

@@ -1,114 +1,120 @@
 # frozen_string_literal: true
-require 'addressable'
-require 'parallel'
+require_relative 'semantic_html/anchor_selector'
 module Html2rss
   class AutoSource
     module Scraper
       ##
-      # Scrapes articles by looking for common markup tags (article, section, li)
-      # containing an <a href> tag.
+      # Scrapes semantic containers by choosing one primary content link per
+      # block before extraction.
       #
-      # See:
-      # 1. https://developer.mozilla.org/en-US/docs/Web/HTML/Element/article
+      # This scraper is intentionally container-first:
+      # 1. collect candidate semantic containers once
+      # 2. select the strongest content-like anchor within each container
+      # 3. extract fields from the container while honoring that anchor choice
+      #
+      # The result is lower recall on weak-signal blocks, but much better link
+      # quality on modern teaser cards that mix headlines, utility links, and
+      # duplicate image overlays.
       class SemanticHtml
         include Enumerable
+        Entry = Data.define(:container, :selected_anchor)
+        CONTAINER_SELECTORS = [
+          'article:not(:has(article))',
+          'section:not(:has(section))',
+          'li:not(:has(li))',
+          'tr:not(:has(tr))',
+          'div:not(:has(div))'
+        ].freeze
         ##
-        # Map of parent element names to CSS selectors for finding <a href> tags.
-        ANCHOR_TAG_SELECTORS = {
-          'section' => ['section :not(section) a[href]'],
-          'tr' => ['table tr :not(tr) a[href]'],
-          'article' => [
-            'article :not(article) a[href]',
-            'article a[href]'
-          ],
-          'li' => [
-            'ul > li :not(li) a[href]',
-            'ol > li :not(li) a[href]'
-          ]
-        }.freeze
-        # Check if the parsed_body contains articles
-        # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document
-        # @return [Boolean] True if articles are found, otherwise false.
+        # @return [Symbol] config key used to enable or configure this scraper
+        def self.options_key = :semantic_html
+        # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
+        # @return [Boolean] true when at least one semantic container has an eligible anchor
         def self.articles?(parsed_body)
           return false unless parsed_body
-          ANCHOR_TAG_SELECTORS.each_value do |selectors|
-            return true if selectors.any? { |selector| parsed_body.at_css(selector) }
-          end
-          false
+          new(parsed_body, url: 'https://example.com').extractable?
+        end
+        # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
+        # @param url [String, Html2rss::Url] base url
+        # @param extractor [Class] extractor class used for article extraction
+        def initialize(parsed_body, url:, extractor: HtmlExtractor, **_opts)
+          @parsed_body = parsed_body
+          @url = url
+          @extractor = extractor
+          @anchor_selector = AnchorSelector.new(url)
         end
-        # Finds the closest ancestor tag matching the specified tag name
-        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
-        # @param tag_name [String] The tag name to search for
-        # @param stop_tag [String] The tag name to stop searching at
-        # @return [Nokogiri::XML::Node] The found ancestor tag or the current tag if matched
-        def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
-          return current_tag if current_tag.name == tag_name
+        attr_reader :parsed_body
-          stop_tags = Set[tag_name, stop_tag]
+        ##
+        # Yields extracted article hashes for each semantic container that
+        # survives anchor selection.
+        #
+        # Detection and extraction share the same memoized entry list so this
+        # scraper does not rerun anchor ranking once a page has already been
+        # accepted as extractable.
+        #
+        # @yieldparam article_hash [Hash] extracted article hash
+        # @return [Enumerator<Hash>]
+        def each
+          return enum_for(:each) unless block_given?
-          while current_tag.respond_to?(:parent) && !stop_tags.member?(current_tag.name)
-            current_tag = current_tag.parent
+          extractable_entries.each do |entry|
+            article_hash = @extractor.new(
+              entry.container,
+              base_url: @url,
+              selected_anchor: entry.selected_anchor
+            ).call
+            yield article_hash if article_hash
           end
-          current_tag
         end
-        # Finds the closest matching selector upwards in the DOM tree
-        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
-        # @param selector [String] The CSS selector to search for
-        # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
-        def self.find_closest_selector(current_tag, selector: 'a[href]:not([href=""])')
-          current_tag.at_css(selector) || find_closest_selector_upwards(current_tag, selector:)
+        ##
+        # Reports whether the page contains at least one semantic container with
+        # a selectable primary anchor.
+        #
+        # @return [Boolean] true when at least one candidate container yields a primary anchor
+        def extractable?
+          extractable_entries.any? # reuses the memoized entry list that #each iterates
         end
-        # Helper method to find a matching selector upwards
-        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
-        # @param selector [String] The CSS selector to search for
-        # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
-        def self.find_closest_selector_upwards(current_tag, selector:)
-          while current_tag
-            found = current_tag.at_css(selector)
-            return found if found
-            return nil unless current_tag.respond_to?(:parent)
+        protected
-            current_tag = current_tag.parent
-          end
+        def candidate_containers # memoized result of collect_candidate_containers
+          @candidate_containers ||= collect_candidate_containers
         end
-        # Returns an array of [tag_name, selector] pairs
-        # @return [Array<[String, String]>] Array of tag name and selector pairs
-        def self.anchor_tag_selector_pairs
-          ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
-            selectors.map { |selector| [tag_name, selector] }
-          end
+        def primary_anchor_for(container) # delegates to the AnchorSelector built in initialize
+          @anchor_selector.primary_anchor_for(container)
         end
-        def initialize(parsed_body, url:)
-          @parsed_body = parsed_body
-          @url = url
-        end
+        def extractable_entries
+          @extractable_entries ||= candidate_containers.filter_map do |container|
+            selected_anchor = primary_anchor_for(container)
+            next unless selected_anchor
-        attr_reader :parsed_body
+            Entry.new(container:, selected_anchor:)
+          end
+        end
-        ##
-        # @yieldparam [Hash] The scraped article hash
-        # @return [Enumerator] Enumerator for the scraped articles
-        def each
-          return enum_for(:each) unless block_given?
+        def collect_candidate_containers
+          seen = {}.compare_by_identity
-          SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
-            parsed_body.css(selector).each do |selected_tag|
-              article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
-              article_hash = Extractor.new(article_tag, url: @url).call
+          CONTAINER_SELECTORS.each_with_object([]) do |selector, containers|
+            parsed_body.css(selector).each do |container|
+              next if container.path.match?(Html::TAGS_TO_IGNORE)
+              next if seen[container]
-              yield article_hash if article_hash
+              seen[container] = true
+              containers << container
             end
           end
         end

data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb ADDED Viewed

@@ -0,0 +1,261 @@
+# frozen_string_literal: true
+module Html2rss
+  class AutoSource
+    module Scraper
+      class WordpressApi
+        ##
+        # Determines whether a WordPress page can safely be mapped to a posts query.
+        class PageScope
+          CATEGORY_SEGMENT = 'category' # leading URL path segment that marks a category archive
+          TAG_SEGMENT = 'tag' # leading URL path segment that marks a tag archive
+          AUTHOR_SEGMENT = 'author' # leading URL path segment that marks an author archive
+          ##
+          # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
+          # @param url [Html2rss::Url] canonical page URL
+          # @return [PageScope] derived page scope
+          def self.from(parsed_body:, url:)
+            Resolver.new(parsed_body:, url:).call # all classification logic lives in Resolver
+          end
+          ##
+          # @param query [Hash<String, String>] scoped query params for the posts endpoint
+          # @param fetchable [Boolean] whether a posts follow-up is safe for this page
+          # @param reason [Symbol] classification of the resolved page scope
+          def initialize(query:, fetchable:, reason:)
+            @query = query.freeze # freezes the hash the caller passed in, not a copy
+            @fetchable = fetchable
+            @reason = reason
+            freeze # the scope itself cannot be modified once built
+          end
+          ##
+          # @return [Hash<String, String>] query params to apply to the posts request
+          attr_reader :query
+          ##
+          # @return [Boolean] whether the page may safely use the posts API follow-up
+          def fetchable?
+            @fetchable # assigned once in initialize; the instance is frozen afterwards
+          end
+          ##
+          # @return [Symbol] classification of the resolved page scope
+          attr_reader :reason
+          ##
+          # Resolves the page scope from page markup and canonical URL signals.
+          class Resolver # rubocop:disable Metrics/ClassLength
+            ##
+            # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
+            # @param url [Html2rss::Url] canonical page URL
+            def initialize(parsed_body:, url:)
+              @parsed_body = parsed_body
+              @url = Html2rss::Url.from_absolute(url) # NOTE(review): presumably normalizes String or Url input — confirm
+            end
+            ##
+            # @return [PageScope] derived page scope
+            def call # resolvers are tried in this order; the first non-nil scope wins
+              category_scope ||
+                tag_scope ||
+                author_scope ||
+                date_scope ||
+                fallback_scope
+            end
+            private
+            attr_reader :parsed_body, :url
+            def category_scope # nil unless the page looks like a category archive
+              return unless category_archive?
+              scoped_scope('categories' => archive_id('category')) # id read from a category-<digits> body class
+            end
+            def tag_scope # nil unless the page looks like a tag archive
+              return unless tag_archive?
+              scoped_scope('tags' => archive_id('tag')) # id read from a tag-<digits> body class
+            end
+            def author_scope # nil unless the page looks like an author archive
+              return unless author_archive?
+              scoped_scope('author' => archive_id('author')) # id read from an author-<digits> body class
+            end
+            def date_scope # nil unless the page looks like a date archive
+              return unless date_archive?
+              range = date_archive_range
+              return unknown_archive_scope unless range # date archive whose range could not be derived
+              PageScope.new(query: range, fetchable: true, reason: :archive)
+            end
+            def fallback_scope # last resort; always returns a PageScope
+              return unknown_archive_scope if archive_like? # via #call only the generic 'archive' class can still match
+              return non_archive_scope if singular_like?
+              PageScope.new(query: {}, fetchable: true, reason: :unscoped) # neither archive nor singular: empty query
+            end
+            def scoped_scope(query) # fetchable archive scope, or unsupported when an id is missing
+              return unknown_archive_scope if query.values.any?(&:nil?) # archive_id found no numeric id
+              PageScope.new(query:, fetchable: true, reason: :archive)
+            end
+            def unknown_archive_scope # archive page that cannot be mapped to a posts query
+              PageScope.new(query: {}, fetchable: false, reason: :unsupported_archive)
+            end
+            def non_archive_scope # singular page; not fetchable
+              PageScope.new(query: {}, fetchable: false, reason: :non_archive)
+            end
+            def category_archive? # signalled by a 'category' body class or a leading /category path segment
+              body_classes.include?('category') || leading_path_segment == CATEGORY_SEGMENT
+            end
+            def tag_archive? # signalled by a 'tag' body class or a leading /tag path segment
+              body_classes.include?('tag') || leading_path_segment == TAG_SEGMENT
+            end
+            def author_archive? # signalled by an 'author' body class or a leading /author path segment
+              body_classes.include?('author') || leading_path_segment == AUTHOR_SEGMENT
+            end
+            def date_archive? # signalled by a 'date' body class or a year[/month[/day]] path tail
+              body_classes.include?('date') || date_archive_path?
+            end
+            def archive_like? # any specific archive kind, or the generic 'archive' body class
+              category_archive? || tag_archive? || author_archive? || date_archive? || body_classes.include?('archive')
+            end
+            def singular_like? # body classes that mark a single page, post or attachment
+              body_classes.intersect?(%w[page single singular attachment]) ||
+                body_classes.any? { _1.match?(/\A(?:page-id|postid)-\d+\z/) }
+            end
+            def body_classes # memoized; [] when there is no body element or no class attribute
+              @body_classes ||= parsed_body.at_css('body')&.[]('class').to_s.split
+            end
+            def archive_id(prefix) # digits of the first '<prefix>-<digits>' body class, as a String; else nil
+              body_classes.filter_map do |klass|
+                klass[Regexp.new("^#{Regexp.escape(prefix)}-(\\d+)$"), 1]
+              end.first
+            end
+            def canonical_or_current_url # same-origin canonical link target when present, else the page url
+              href = parsed_body.at_css(WordpressApi::CANONICAL_LINK_SELECTOR)&.[]('href').to_s.strip
+              return url if href.empty? # no canonical link, or it has no href
+              canonical_url = Html2rss::Url.from_relative(href, url)
+              same_origin_url?(canonical_url, url) ? canonical_url : url # cross-origin canonicals are ignored
+            rescue ArgumentError # NOTE(review): presumably raised by Url.from_relative for malformed hrefs — confirm
+              url
+            end
+            def path_segments # memoized path segments of the canonical (or current) url
+              @path_segments ||= canonical_or_current_url.path_segments
+            end
+            def leading_path_segment # nil when the path has no segments
+              path_segments.first
+            end
+            def date_archive_path? # true when the path ends in a year[/month[/day]] shape
+              !date_archive_segments.nil?
+            end
+            def date_archive_range # 'after'/'before' query params covering the archive period, or nil
+              components = date_archive_components
+              return unless components
+              start_date = Date.new(*components.fetch(:start_date_parts))
+              {
+                'after' => iso8601_start(start_date),
+                'before' => iso8601_start(next_archive_boundary(start_date, components.fetch(:precision)))
+              }
+            rescue Date::Error # Date.new rejects impossible calendar dates such as April 31
+              nil
+            end
+            def date_archive_components # start date parts plus :year/:month/:day precision, or nil
+              segments = date_archive_segments
+              return unless segments
+              year = segments.fetch(0).to_i
+              month = parse_archive_segment(segments[1], 1, 12) # nil when absent or outside 1..12
+              day = parse_archive_segment(segments[2], 1, 31) # nil when absent or outside 1..31
+              {
+                start_date_parts: [year, month || 1, day || 1], # NOTE(review): out-of-range parts act as missing — confirm
+                precision: archive_precision(month:, day:)
+              }
+            end
+            def date_archive_segments # path tail starting at the first four-digit segment, or nil
+              year_index = path_segments.find_index { _1.match?(/\A\d{4}\z/) }
+              return unless year_index
+              segments = path_segments.drop(year_index)
+              return unless segments.length.between?(1, 3) # year[/month[/day]] only; longer tails are rejected
+              return unless archive_segment_shape?(segments)
+              segments
+            end
+            def archive_segment_shape?(segments) # month and day, when present, must be all digits
+              month = segments[1]
+              day = segments[2]
+              return false if day && month.nil? # defensive; appears unreachable for a contiguous segment list
+              return false unless month.nil? || month.match?(/\A\d+\z/)
+              return false unless day.nil? || day.match?(/\A\d+\z/)
+              true
+            end
+            def same_origin_url?(left, right) # compares scheme, host and port only
+              [left.scheme, left.host, left.port] == [right.scheme, right.host, right.port]
+            end
+            def archive_precision(month:, day:) # finest date part that was parsed successfully
+              return :day if day
+              return :month if month
+              :year
+            end
+            def next_archive_boundary(start_date, precision) # start of the following year, month or day
+              {
+                year: start_date.next_year,
+                month: start_date.next_month,
+                day: start_date.next_day
+              }.fetch(precision) # raises KeyError for any other precision
+            end
+            def iso8601_start(date) # the date at 00:00:00 with a literal Z suffix
+              date.strftime('%Y-%m-%dT00:00:00Z')
+            end
+            def parse_archive_segment(value, minimum, maximum) # Integer within minimum..maximum, else nil
+              return nil unless value&.match?(/\A\d+\z/) # nil or non-numeric segment
+              number = value.to_i
+              return nil if number < minimum || number > maximum
+              number
+            end
+          end
+        end
+      end
+    end
+  end
+end