RubyGems - html2rss - Versions diffs - 0.17.0 → 0.19.0 - Mend

html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127)

checksums.yaml +4 -4
data/README.md +90 -639
data/exe/html2rss +1 -1
data/html2rss.gemspec +5 -2
data/lib/html2rss/articles/deduplicator.rb +50 -0
data/lib/html2rss/auto_source/cleanup.rb +44 -5
data/lib/html2rss/auto_source/scraper/html.rb +123 -43
data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
data/lib/html2rss/auto_source/scraper.rb +160 -8
data/lib/html2rss/auto_source.rb +123 -47
data/lib/html2rss/blocked_surface.rb +65 -0
data/lib/html2rss/category_extractor.rb +82 -0
data/lib/html2rss/cli.rb +194 -23
data/lib/html2rss/config/class_methods.rb +178 -0
data/lib/html2rss/config/dynamic_params.rb +70 -0
data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
data/lib/html2rss/config/request_headers.rb +136 -0
data/lib/html2rss/config/schema.rb +240 -0
data/lib/html2rss/config/validator.rb +146 -0
data/lib/html2rss/config.rb +118 -61
data/lib/html2rss/error.rb +31 -0
data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
data/lib/html2rss/feed_pipeline.rb +127 -0
data/lib/html2rss/hash_util.rb +101 -0
data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
data/lib/html2rss/html_extractor.rb +141 -0
data/lib/html2rss/html_navigator.rb +54 -0
data/lib/html2rss/json_feed_builder/item.rb +94 -0
data/lib/html2rss/json_feed_builder.rb +59 -0
data/lib/html2rss/rendering/audio_renderer.rb +36 -0
data/lib/html2rss/rendering/description_builder.rb +87 -0
data/lib/html2rss/rendering/image_renderer.rb +41 -0
data/lib/html2rss/rendering/media_renderer.rb +37 -0
data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
data/lib/html2rss/rendering/video_renderer.rb +36 -0
data/lib/html2rss/rendering.rb +23 -0
data/lib/html2rss/request_controls.rb +123 -0
data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
data/lib/html2rss/request_service/budget.rb +39 -0
data/lib/html2rss/request_service/context.rb +77 -21
data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
data/lib/html2rss/request_service/policy.rb +252 -0
data/lib/html2rss/request_service/puppet_commander.rb +212 -13
data/lib/html2rss/request_service/response.rb +51 -3
data/lib/html2rss/request_service/response_guard.rb +62 -0
data/lib/html2rss/request_service.rb +50 -15
data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
data/lib/html2rss/request_session/runtime_input.rb +71 -0
data/lib/html2rss/request_session/runtime_policy.rb +83 -0
data/lib/html2rss/request_session.rb +122 -0
data/lib/html2rss/rss_builder/article.rb +187 -0
data/lib/html2rss/rss_builder/channel.rb +105 -11
data/lib/html2rss/rss_builder/enclosure.rb +62 -0
data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
data/lib/html2rss/rss_builder.rb +76 -71
data/lib/html2rss/selectors/config.rb +123 -0
data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
data/lib/html2rss/selectors/extractors/href.rb +55 -0
data/lib/html2rss/selectors/extractors/html.rb +49 -0
data/lib/html2rss/selectors/extractors/static.rb +42 -0
data/lib/html2rss/selectors/extractors/text.rb +47 -0
data/lib/html2rss/selectors/extractors.rb +53 -0
data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
data/lib/html2rss/selectors/post_processors/base.rb +80 -0
data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
data/lib/html2rss/selectors/post_processors/template.rb +76 -0
data/lib/html2rss/selectors/post_processors.rb +48 -0
data/lib/html2rss/selectors.rb +301 -0
data/lib/html2rss/url.rb +266 -0
data/lib/html2rss/version.rb +2 -1
data/lib/html2rss.rb +67 -71
data/lib/tasks/config_schema.rake +17 -0
data/schema/html2rss-config.schema.json +551 -0
metadata +120 -38
data/lib/html2rss/attribute_post_processors/base.rb +0 -74
data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
data/lib/html2rss/attribute_post_processors/template.rb +0 -101
data/lib/html2rss/attribute_post_processors.rb +0 -44
data/lib/html2rss/auto_source/article.rb +0 -127
data/lib/html2rss/auto_source/channel.rb +0 -78
data/lib/html2rss/auto_source/reducer.rb +0 -48
data/lib/html2rss/auto_source/rss_builder.rb +0 -70
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
data/lib/html2rss/config/channel.rb +0 -125
data/lib/html2rss/config/selectors.rb +0 -103
data/lib/html2rss/item.rb +0 -186
data/lib/html2rss/item_extractors/attribute.rb +0 -50
data/lib/html2rss/item_extractors/href.rb +0 -52
data/lib/html2rss/item_extractors/html.rb +0 -46
data/lib/html2rss/item_extractors/static.rb +0 -39
data/lib/html2rss/item_extractors/text.rb +0 -44
data/lib/html2rss/item_extractors.rb +0 -88
data/lib/html2rss/object_to_xml_converter.rb +0 -56
data/lib/html2rss/rss_builder/item.rb +0 -83
data/lib/html2rss/utils.rb +0 -113

data/lib/html2rss/auto_source/scraper/schema/thing.rb CHANGED

@@ -11,6 +11,7 @@ module Html2rss
         #
         # @see https://schema.org/Thing
         class Thing
+          # Supported Schema.org `@type` values mapped to article extraction.
           SUPPORTED_TYPES = %w[
             AdvertiserContentArticle
             AnalysisNewsArticle
@@ -32,11 +33,14 @@ module Html2rss
             TechArticle
           ].to_set.freeze
-          DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
+          # Attributes exposed by `#call` in generated article hashes.
+          DEFAULT_ATTRIBUTES = %i[id title description url image published_at categories].freeze
+          # @param schema_object [Hash{Symbol => Object}] parsed schema.org object
+          # @param url [String, Html2rss::Url, nil] base URL used for relative normalization
           def initialize(schema_object, url:)
             @schema_object = schema_object
-            @url = url
+            @base_url = normalized_base_url(url)
           end
           # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
@@ -46,24 +50,27 @@ module Html2rss
             end
           end
+          # @return [String, nil] stable schema object identifier
           def id
             return @id if defined?(@id)
-            id = (schema_object[:@id] || url&.path).to_s
+            id = normalized_id(schema_object[:@id], reference_url: url || base_url) || url&.path.to_s
             return if id.empty?
             @id = id
           end
+          # @return [String, nil] article title
           def title = schema_object[:title]
+          # @return [String, nil] longest available description field
           def description
             schema_object.values_at(:description, :schema_object_body, :abstract)
                          .max_by { |string| string.to_s.size }
           end
-          # @return [Addressable::URI, nil] the URL of the schema object
+          # @return [Html2rss::Url, nil] the URL of the schema object
           def url
             url = schema_object[:url]
             if url.to_s.empty?
@@ -71,21 +78,29 @@ module Html2rss
               return
             end
-            Utils.build_absolute_url_from_relative(url, @url)
+            Url.from_relative(url, base_url || url)
           end
+          # @return [Html2rss::Url, nil] normalized article image URL
           def image
             if (image_url = image_urls.first)
-              Utils.build_absolute_url_from_relative(image_url, @url)
+              Url.from_relative(image_url, base_url || image_url)
             end
           end
+          # @return [String, nil] published-at timestamp string
           def published_at = schema_object[:datePublished]
-          private
+          # @return [Array<String>, nil] extracted category labels
+          def categories
+            return @categories if defined?(@categories)
-          attr_reader :schema_object
+            @categories = CategoryExtractor.call(schema_object)
+          end
+          attr_reader :schema_object, :base_url
+          # @return [Array<String>] normalized image URL candidates
           def image_urls
             schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
               next unless object
@@ -97,6 +112,52 @@ module Html2rss
               end
             end
           end
+          # @param value [String, Symbol, nil] candidate schema identifier
+          # @param reference_url [Html2rss::Url, nil] URL used for same-origin normalization
+          # @return [String, nil] normalized identifier value
+          def normalized_id(value, reference_url:)
+            text = value.to_s
+            return if text.empty?
+            normalized_url = normalized_id_url(text, reference_url:)
+            return text unless reference_url && normalized_url.host == reference_url.host
+            normalized_id_value(normalized_url)
+          rescue ArgumentError
+            text
+          end
+          # @param text [String] raw identifier text
+          # @param reference_url [Html2rss::Url, nil] URL used to resolve relative IDs
+          # @return [Html2rss::Url] normalized identifier URL
+          def normalized_id_url(text, reference_url:)
+            if text.start_with?('/')
+              Url.from_relative(text, reference_url || text)
+            else
+              Url.from_absolute(text)
+            end
+          end
+          # @param url [Html2rss::Url] normalized identifier URL
+          # @return [String, nil] path/query portion used as stable ID
+          def normalized_id_value(url)
+            path = url.path.to_s
+            return "#{path}?#{url.query}" if (path.empty? || path == '/') && !url.query.to_s.empty?
+            return path unless path.empty?
+            url.query
+          end
+          # @param url [String, Html2rss::Url, nil] candidate page URL
+          # @return [Html2rss::Url, nil] normalized absolute URL for schema resolution
+          def normalized_base_url(url)
+            return if url.to_s.strip.empty?
+            Url.from_absolute(url)
+          rescue ArgumentError
+            nil
+          end
         end
       end
     end

data/lib/html2rss/auto_source/scraper/schema.rb CHANGED

@@ -8,24 +8,31 @@ module Html2rss
     module Scraper
       ##
       # Scrapes articles from Schema.org objects, by looking for the objects in:
       # <script type="application/ld+json"> "schema" tags.
       #
-      # See:
-      # 1. https://schema.org/docs/full.html
-      # 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
+      # @see https://schema.org/docs/full.html
+      # @see https://developers.google.com/search/docs/appearance/structured-data/article#microdata
       class Schema
         include Enumerable
+        # Selector for JSON-LD script tags containing Schema.org objects.
         TAG_SELECTOR = 'script[type="application/ld+json"]'
+        # @return [Symbol] scraper config key
+        def self.options_key = :schema
         class << self
+          # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
+          # @return [Boolean] whether the page includes supported schema types
           def articles?(parsed_body)
-            parsed_body.css(TAG_SELECTOR).any? do |script|
-              (Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES).any? do |type|
-                script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/)
-              end
-            end
+            parsed_body.css(TAG_SELECTOR).any? { |script| supported_schema_type?(script) }
+          end
+          # @param script [Nokogiri::XML::Element] schema JSON-LD script tag
+          # @return [Boolean] whether the tag references a supported schema type
+          def supported_schema_type?(script)
+            supported_types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES
+            supported_types.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
           end
           ##
@@ -49,11 +56,14 @@ module Html2rss
             end
           end
+          # @param object [Hash{Symbol => Object}] schema candidate object
+          # @return [Boolean] whether an extractor exists for the candidate object
           def supported_schema_object?(object)
             scraper_for_schema_object(object) ? true : false
           end
           ##
+          # @param schema_object [Hash{Symbol => Object}] schema object with an @type key
           # @return [Scraper::Schema::Thing, Scraper::Schema::ItemList, nil] a class responding to `#call`
           def scraper_for_schema_object(schema_object)
             type = schema_object[:@type]
@@ -63,7 +73,7 @@ module Html2rss
             elsif ItemList::SUPPORTED_TYPES.member?(type)
               ItemList
             else
-              Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{type}")
+              Log.debug("#{name}: unsupported schema object @type=#{type.inspect}")
               nil
             end
           end
@@ -73,14 +83,19 @@ module Html2rss
           def parse_script_tag(script_tag)
             JSON.parse(script_tag.text, symbolize_names: true)
           rescue JSON::ParserError => error
-            Log.warn('Schema#schema_objects: Failed to parse JSON', error: error.message)
+            Log.warn("#{name}: failed to parse JSON", error: error.message)
             []
           end
         end
-        def initialize(parsed_body, url:)
+        # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
+        # @param url [String, Html2rss::Url] base page URL
+        # @param opts [Hash] scraper-specific options
+        # @option opts [Object] :_reserved reserved for future scraper-specific options
+        def initialize(parsed_body, url:, **opts)
           @parsed_body = parsed_body
           @url = url
+          @opts = opts
         end
         ##

data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb ADDED

@@ -0,0 +1,204 @@
+# frozen_string_literal: true
+module Html2rss
+  class AutoSource
+    module Scraper
+      class SemanticHtml
+        ##
+        # Selects the best content-like anchor from a semantic container.
+        #
+        # The selector turns raw DOM anchors into ranked facts so semantic
+        # scraping can reason about link intent instead of DOM order. It favors
+        # heading-aligned article links and suppresses utility links, duplicate
+        # destinations, and weak textless affordances.
+        class AnchorSelector # rubocop:disable Metrics/ClassLength
+          AnchorFacts = Data.define(
+            :anchor,
+            :text,
+            :url,
+            :destination,
+            :segments,
+            :meaningful_text,
+            :content_like_destination,
+            :heading_anchor,
+            :heading_text_match,
+            :score
+          )
+          # Comma-separated heading selector used for heading/anchor matching.
+          HEADING_SELECTOR = HtmlExtractor::HEADING_TAGS.join(',').freeze
+          # Path segments that usually represent utility navigation rather than article content.
+          UTILITY_PATH_SEGMENTS = %w[
+            about account author category comment comments contact feedback help
+            login newsletter profile register search settings share signup subscribe
+            topic topics view-all archive archives
+            feed feeds
+            recommended
+            for-you
+            preference preferences
+            notification notifications
+            privacy terms
+            cookie cookies
+            logout
+            user users
+          ].to_set.freeze
+          # Path segments that signal content-like destinations.
+          CONTENT_PATH_SEGMENTS = %w[
+            article articles news post posts story stories update updates
+          ].to_set.freeze
+          # Ancestor tags that usually indicate navigation/utility regions.
+          UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze
+          # @param base_url [String, Html2rss::Url] page URL used to normalize href destinations
+          def initialize(base_url)
+            @base_url = base_url
+          end
+          ##
+          # Chooses the single anchor that best represents the story contained
+          # in a semantic block.
+          #
+          # Ranking is scoped to one container at a time. That keeps the logic
+          # local, makes duplicate links to the same destination collapse into
+          # one candidate, and avoids page-wide heuristics leaking across cards.
+          #
+          # @param container [Nokogiri::XML::Element] semantic container being evaluated
+          # @return [Nokogiri::XML::Element, nil] selected primary anchor or nil when none qualify
+          def primary_anchor_for(container)
+            facts_for(container).max_by(&:score)&.anchor
+          end
+          private
+          attr_reader :base_url
+          def facts_for(container)
+            heading = heading_for(container)
+            heading_text = visible_text(heading)
+            container.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).each_with_object({}) do |anchor, best_by_destination|
+              next if anchor.path.match?(Html::TAGS_TO_IGNORE)
+              facts = build_facts(anchor, heading, heading_text)
+              next unless facts
+              keep_stronger_fact(best_by_destination, facts)
+            end.values
+          end
+          def build_facts(anchor, heading, heading_text) # rubocop:disable Metrics/MethodLength
+            text = visible_text(anchor)
+            meaningful_text = meaningful_text?(text)
+            ancestors = anchor.ancestors.to_a
+            url = normalized_destination(anchor)
+            return unless url
+            segments = url.path_segments
+            content_like_destination = content_like_destination?(segments)
+            return if ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
+            heading_anchor = heading_anchor?(ancestors, heading)
+            heading_text_match = heading_text_match?(heading_text, text, meaningful_text)
+            return unless heading_anchor || content_like_anchor?(meaningful_text, content_like_destination)
+            AnchorFacts.new(
+              anchor:,
+              text:,
+              url:,
+              destination: url.to_s,
+              segments:,
+              meaningful_text:,
+              content_like_destination:,
+              heading_anchor:,
+              heading_text_match:,
+              score: score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
+            )
+          end
+          def ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
+            utility_destination?(segments) ||
+              utility_text?(text) ||
+              icon_only_anchor?(anchor, meaningful_text) ||
+              utility_landmark_anchor?(ancestors)
+          end
+          def keep_stronger_fact(best_by_destination, facts)
+            current = best_by_destination[facts.destination]
+            return best_by_destination[facts.destination] = facts unless current
+            return if current.score >= facts.score
+            best_by_destination[facts.destination] = facts
+          end
+          def content_like_anchor?(meaningful_text, content_like_destination)
+            meaningful_text || content_like_destination
+          end
+          def score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
+            score = 0
+            score += 100 if heading_anchor
+            score += 20 if heading_text_match
+            score += 10 if meaningful_text
+            score += 10 if content_like_destination
+            score
+          end
+          def heading_anchor?(ancestors, heading)
+            heading && ancestors.include?(heading)
+          end
+          def heading_text_match?(heading_text, text, meaningful_text)
+            meaningful_text && meaningful_text?(heading_text) && heading_text == text
+          end
+          def heading_for(container)
+            container.at_css(HEADING_SELECTOR)
+          end
+          def icon_only_anchor?(anchor, meaningful_text)
+            !meaningful_text && anchor.at_css('img, svg')
+          end
+          def utility_destination?(segments)
+            segments.empty? || segments.any? { |segment| UTILITY_PATH_SEGMENTS.include?(segment) }
+          end
+          def content_like_destination?(segments)
+            segments.any? do |segment|
+              CONTENT_PATH_SEGMENTS.include?(segment) || segment.match?(/\A\d[\w-]*\z/)
+            end
+          end
+          def normalized_destination(anchor)
+            href = anchor['href'].to_s.split('#').first.to_s.strip
+            return if href.empty?
+            Html2rss::Url.from_relative(href, base_url)
+          rescue ArgumentError
+            nil
+          end
+          def meaningful_text?(text)
+            text.scan(/\p{Alnum}+/).any?
+          end
+          def utility_text?(text)
+            text.match?(
+              /\A(about|contact|log in|login|sign up|signup|share|comments?|view all|recommended for you|subscribe)\b/i
+            )
+          end
+          def utility_landmark_anchor?(ancestors)
+            ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
+          end
+          def visible_text(node)
+            return '' unless node
+            HtmlExtractor.extract_visible_text(node).to_s.strip
+          end
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/auto_source/scraper/semantic_html.rb CHANGED

@@ -1,115 +1,124 @@
 # frozen_string_literal: true
-require 'addressable'
-require 'parallel'
+require_relative 'semantic_html/anchor_selector'
 module Html2rss
   class AutoSource
     module Scraper
       ##
-      # Scrapes articles by looking for common markup tags (article, section, li)
-      # containing an <a href> tag.
+      # Scrapes semantic containers by choosing one primary content link per
+      # block before extraction.
       #
-      # See:
-      # 1. https://developer.mozilla.org/en-US/docs/Web/HTML/Element/article
+      # This scraper is intentionally container-first:
+      # 1. collect candidate semantic containers once
+      # 2. select the strongest content-like anchor within each container
+      # 3. extract fields from the container while honoring that anchor choice
+      #
+      # The result is lower recall on weak-signal blocks, but much better link
+      # quality on modern teaser cards that mix headlines, utility links, and
+      # duplicate image overlays.
       class SemanticHtml
         include Enumerable
+        # Container plus selected anchor chosen for extraction.
+        Entry = Data.define(:container, :selected_anchor)
+        # Candidate semantic container selectors used to locate extractable blocks.
+        CONTAINER_SELECTORS = [
+          'article:not(:has(article))',
+          'section:not(:has(section))',
+          'li:not(:has(li))',
+          'tr:not(:has(tr))',
+          'div:not(:has(div))'
+        ].freeze
         ##
-        # Map of parent element names to CSS selectors for finding <a href> tags.
-        ANCHOR_TAG_SELECTORS = {
-          'section' => ['section :not(section) a[href]'],
-          'tr' => ['table tr :not(tr) a[href]'],
-          'article' => [
-            'article :not(article) a[href]',
-            'article a[href]'
-          ],
-          'li' => [
-            'ul > li :not(li) a[href]',
-            'ol > li :not(li) a[href]'
-          ]
-        }.freeze
-        # Check if the parsed_body contains articles
-        # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document
-        # @return [Boolean] True if articles are found, otherwise false.
+        # @return [Symbol] config key used to enable or configure this scraper
+        def self.options_key = :semantic_html
+        # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
+        # @return [Boolean] true when at least one semantic container has an eligible anchor
         def self.articles?(parsed_body)
           return false unless parsed_body
-          ANCHOR_TAG_SELECTORS.each_value do |selectors|
-            return true if selectors.any? { |selector| parsed_body.at_css(selector) }
-          end
-          false
+          new(parsed_body, url: 'https://example.com').extractable?
+        end
+        # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
+        # @param url [String, Html2rss::Url] base url
+        # @param extractor [Class] extractor class used for article extraction
+        # @param _opts [Hash] scraper-specific options
+        # @option _opts [Object] :_reserved reserved for future scraper-specific options
+        def initialize(parsed_body, url:, extractor: HtmlExtractor, **_opts)
+          @parsed_body = parsed_body
+          @url = url
+          @extractor = extractor
+          @anchor_selector = AnchorSelector.new(url)
         end
-        # Finds the closest ancestor tag matching the specified tag name
-        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
-        # @param tag_name [String] The tag name to search for
-        # @param stop_tag [String] The tag name to stop searching at
-        # @return [Nokogiri::XML::Node] The found ancestor tag or the current tag if matched
-        def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
-          return current_tag if current_tag.name == tag_name
+        attr_reader :parsed_body
-          stop_tags = Set[tag_name, stop_tag]
+        ##
+        # Yields extracted article hashes for each semantic container that
+        # survives anchor selection.
+        #
+        # Detection and extraction share the same memoized entry list so this
+        # scraper does not rerun anchor ranking once a page has already been
+        # accepted as extractable.
+        #
+        # @yieldparam article_hash [Hash] extracted article hash
+        # @return [Enumerator<Hash>]
+        def each
+          return enum_for(:each) unless block_given?
-          while current_tag.respond_to?(:parent) && !stop_tags.member?(current_tag.name)
-            current_tag = current_tag.parent
+          extractable_entries.each do |entry|
+            article_hash = @extractor.new(
+              entry.container,
+              base_url: @url,
+              selected_anchor: entry.selected_anchor
+            ).call
+            yield article_hash if article_hash
           end
-          current_tag
         end
-        # Finds the closest matching selector upwards in the DOM tree
-        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
-        # @param selector [String] The CSS selector to search for
-        # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
-        def self.find_closest_selector(current_tag, selector: 'a[href]:not([href=""])')
-          current_tag.at_css(selector) || find_closest_selector_upwards(current_tag, selector:)
+        ##
+        # Reports whether the page contains at least one semantic container with
+        # a selectable primary anchor.
+        #
+        # @return [Boolean] true when at least one candidate container yields a primary anchor
+        def extractable?
+          extractable_entries.any?
         end
-        # Helper method to find a matching selector upwards
-        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
-        # @param selector [String] The CSS selector to search for
-        # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
-        def self.find_closest_selector_upwards(current_tag, selector:)
-          while current_tag
-            found = current_tag.at_css(selector)
-            return found if found
-            return nil unless current_tag.respond_to?(:parent)
+        protected
-            current_tag = current_tag.parent
-          end
+        def candidate_containers
+          @candidate_containers ||= collect_candidate_containers
         end
-        # Returns an array of [tag_name, selector] pairs
-        # @return [Array<[String, String]>] Array of tag name and selector pairs
-        def self.anchor_tag_selector_pairs
-          ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
-            selectors.map { |selector| [tag_name, selector] }
-          end
+        def primary_anchor_for(container)
+          @anchor_selector.primary_anchor_for(container)
         end
-        def initialize(parsed_body, url:)
-          @parsed_body = parsed_body
-          @url = url
-        end
+        def extractable_entries
+          @extractable_entries ||= candidate_containers.filter_map do |container|
+            selected_anchor = primary_anchor_for(container)
+            next unless selected_anchor
-        attr_reader :parsed_body
+            Entry.new(container:, selected_anchor:)
+          end
+        end
-        ##
-        # @yieldparam [Hash] The scraped article hash
-        # @return [Enumerator] Enumerator for the scraped articles
-        def each
-          return enum_for(:each) unless block_given?
+        def collect_candidate_containers
+          seen = {}.compare_by_identity
-          SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
-            parsed_body.css(selector).each do |selected_tag|
-              article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
+          CONTAINER_SELECTORS.each_with_object([]) do |selector, containers|
+            parsed_body.css(selector).each do |container|
+              next if container.path.match?(Html::TAGS_TO_IGNORE)
+              next if seen[container]
-              if article_tag && (article_hash = Extractor.new(article_tag, url: @url).call)
-                yield article_hash
-              end
+              seen[container] = true
+              containers << container
             end
           end
         end