RubyGems - html2rss - Versions diffs - 0.19.1 → 0.20.0 - Mend

html2rss 0.19.1 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

checksums.yaml +4 -4
data/lib/html2rss/auto_source/scraper/html.rb +48 -56
data/lib/html2rss/auto_source/scraper/link_heuristics.rb +447 -0
data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +6 -161
data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +102 -0
data/lib/html2rss/auto_source/scraper/semantic_html.rb +172 -30
data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +1 -1
data/lib/html2rss/config/class_methods.rb +2 -2
data/lib/html2rss/config/request_headers.rb +18 -9
data/lib/html2rss/configuration.rb +176 -0
data/lib/html2rss/html_extractor/list_candidates.rb +94 -0
data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +257 -0
data/lib/html2rss/html_extractor/semantic_containers.rb +70 -0
data/lib/html2rss/html_extractor.rb +11 -0
data/lib/html2rss/rss_builder/channel.rb +10 -7
data/lib/html2rss/url.rb +2 -0
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +54 -5
metadata +9 -3

data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb CHANGED

@@ -11,47 +11,13 @@ module Html2rss
         # scraping can reason about link intent instead of DOM order. It favors
         # heading-aligned article links and suppresses utility links, duplicate
         # destinations, and weak textless affordances.
-        class AnchorSelector # rubocop:disable Metrics/ClassLength
-          AnchorFacts = Data.define(
-            :anchor,
-            :text,
-            :url,
-            :destination,
-            :segments,
-            :meaningful_text,
-            :content_like_destination,
-            :heading_anchor,
-            :heading_text_match,
-            :score
-          )
+        class AnchorSelector
           # Comma-separated heading selector used for heading/anchor matching.
           HEADING_SELECTOR = HtmlExtractor::HEADING_TAGS.join(',').freeze
-          # Path segments that usually represent utility navigation rather than article content.
-          UTILITY_PATH_SEGMENTS = %w[
-            about account author category comment comments contact feedback help
-            login newsletter profile register search settings share signup subscribe
-            topic topics view-all archive archives
-            feed feeds
-            recommended
-            for-you
-            preference preferences
-            notification notifications
-            privacy terms
-            cookie cookies
-            logout
-            user users
-          ].to_set.freeze
-          # Path segments that signal content-like destinations.
-          CONTENT_PATH_SEGMENTS = %w[
-            article articles news post posts story stories update updates
-          ].to_set.freeze
-          # Ancestor tags that usually indicate navigation/utility regions.
-          UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze
           # @param base_url [String, Html2rss::Url] page URL used to normalize href destinations
           def initialize(base_url)
-            @base_url = base_url
+            @link_heuristics = LinkHeuristics.new(base_url)
           end
           ##
@@ -70,132 +36,11 @@ module Html2rss
           private
-          attr_reader :base_url
           def facts_for(container)
-            heading = heading_for(container)
-            heading_text = visible_text(heading)
-            container.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).each_with_object({}) do |anchor, best_by_destination|
-              next if anchor.path.match?(Html::TAGS_TO_IGNORE)
-              facts = build_facts(anchor, heading, heading_text)
-              next unless facts
-              keep_stronger_fact(best_by_destination, facts)
-            end.values
-          end
-          def build_facts(anchor, heading, heading_text) # rubocop:disable Metrics/MethodLength
-            text = visible_text(anchor)
-            meaningful_text = meaningful_text?(text)
-            ancestors = anchor.ancestors.to_a
-            url = normalized_destination(anchor)
-            return unless url
-            segments = url.path_segments
-            content_like_destination = content_like_destination?(segments)
-            return if ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
-            heading_anchor = heading_anchor?(ancestors, heading)
-            heading_text_match = heading_text_match?(heading_text, text, meaningful_text)
-            return unless heading_anchor || content_like_anchor?(meaningful_text, content_like_destination)
-            AnchorFacts.new(
-              anchor:,
-              text:,
-              url:,
-              destination: url.to_s,
-              segments:,
-              meaningful_text:,
-              content_like_destination:,
-              heading_anchor:,
-              heading_text_match:,
-              score: score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
-            )
-          end
-          def ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
-            utility_destination?(segments) ||
-              utility_text?(text) ||
-              icon_only_anchor?(anchor, meaningful_text) ||
-              utility_landmark_anchor?(ancestors)
-          end
-          def keep_stronger_fact(best_by_destination, facts)
-            current = best_by_destination[facts.destination]
-            return best_by_destination[facts.destination] = facts unless current
-            return if current.score >= facts.score
-            best_by_destination[facts.destination] = facts
-          end
-          def content_like_anchor?(meaningful_text, content_like_destination)
-            meaningful_text || content_like_destination
-          end
-          def score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
-            score = 0
-            score += 100 if heading_anchor
-            score += 20 if heading_text_match
-            score += 10 if meaningful_text
-            score += 10 if content_like_destination
-            score
-          end
-          def heading_anchor?(ancestors, heading)
-            heading && ancestors.include?(heading)
-          end
-          def heading_text_match?(heading_text, text, meaningful_text)
-            meaningful_text && meaningful_text?(heading_text) && heading_text == text
-          end
-          def heading_for(container)
-            container.at_css(HEADING_SELECTOR)
-          end
-          def icon_only_anchor?(anchor, meaningful_text)
-            !meaningful_text && anchor.at_css('img, svg')
-          end
-          def utility_destination?(segments)
-            segments.empty? || segments.any? { |segment| UTILITY_PATH_SEGMENTS.include?(segment) }
-          end
-          def content_like_destination?(segments)
-            segments.any? do |segment|
-              CONTENT_PATH_SEGMENTS.include?(segment) || segment.match?(/\A\d[\w-]*\z/)
-            end
-          end
-          def normalized_destination(anchor)
-            href = anchor['href'].to_s.split('#').first.to_s.strip
-            return if href.empty?
-            Html2rss::Url.from_relative(href, base_url)
-          rescue ArgumentError
-            nil
-          end
-          def meaningful_text?(text)
-            text.scan(/\p{Alnum}+/).any?
-          end
-          def utility_text?(text)
-            text.match?(
-              /\A(about|contact|log in|login|sign up|signup|share|comments?|view all|recommended for you|subscribe)\b/i
-            )
-          end
-          def utility_landmark_anchor?(ancestors)
-            ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
-          end
-          def visible_text(node)
-            return '' unless node
-            HtmlExtractor.extract_visible_text(node).to_s.strip
+            HtmlExtractor::SemanticAnchorCandidates.new(
+              container,
+              link_heuristics: @link_heuristics
+            ).to_a
           end
         end
       end

data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb ADDED

@@ -0,0 +1,102 @@
+# frozen_string_literal: true
+module Html2rss
+  class AutoSource
+    module Scraper
+      class SemanticHtml
+        ##
+        # Collapses nested containers and deduplicates entries pointing to the same destination.
+        # It resolves ties using scoring precedence and payload richness comparison.
+        class Deduplicator
+          # @param url [String, Html2rss::Url] base url used to resolve relative hrefs
+          # @param extractor [Class] extractor class used to materialize articles
+          def initialize(url, extractor)
+            @url = url
+            @extractor = extractor
+            @article_cache = {}.compare_by_identity
+          end
+          # Collapses and deduplicates the given entries.
+          #
+          # @param entries [Array<Entry>] list of scraper entries
+          # @return [Array<Entry>] deduplicated list of scraper entries
+          def call(entries)
+            destination_groups(entries).filter_map do |group|
+              collapsed_group = collapse_nested_destination_group(group)
+              collapsed_group.reduce do |best, entry|
+                stronger_entry?(entry, best) ? entry : best
+              end
+            end
+          end
+          # Returns the materialized article hash for the entry, using the cache.
+          #
+          # @param entry [Entry] scraper entry
+          # @return [Hash, nil] article payload
+          def article_for(entry)
+            return entry.article if entry.article
+            @article_cache.fetch(entry) do
+              @article_cache[entry] = @extractor.new(
+                entry.container, base_url: @url, selected_anchor: entry.selected_anchor
+              ).call
+            end
+          end
+          # Compares two entries to determine which is stronger.
+          #
+          # @param left [Entry] left entry
+          # @param right [Entry] right entry
+          # @return [Boolean] true if left is stronger than right
+          def stronger_entry?(left, right) # rubocop:disable Metrics/AbcSize
+            final_delta = left.final_score <=> right.final_score
+            return final_delta.positive? unless final_delta.zero?
+            quality_delta = left.quality_score <=> right.quality_score
+            return quality_delta.positive? unless quality_delta.zero?
+            left_article = article_for(left)
+            right_article = article_for(right)
+            return !right_article if left_article.nil? || right_article.nil?
+            richness_delta = payload_richness_signature(left_article) <=> payload_richness_signature(right_article)
+            richness_delta.zero? ? left.position < right.position : richness_delta.positive?
+          end
+          private
+          def destination_groups(entries) = entries.group_by { entry_destination(_1) }.values
+          def collapse_nested_destination_group(entries)
+            return entries if entries.size <= 1
+            entries.reject do |entry|
+              entries.any? do |other|
+                next if entry.equal?(other)
+                next unless nested_container_pair?(entry.container, other.container)
+                stronger_entry?(other, entry)
+              end
+            end
+          end
+          def nested_container_pair?(left, right) = left.ancestors.include?(right) || right.ancestors.include?(left)
+          def entry_destination(entry) = entry.destination_facts&.destination || article_for(entry)&.[](:url)&.to_s
+          def payload_richness_signature(article)
+            [
+              article[:published_at] ? 1 : 0,
+              word_count(article[:description]),
+              article[:image] ? 1 : 0,
+              Array(article[:categories]).length,
+              Array(article[:enclosures]).length
+            ]
+          end
+          def word_count(text) = text.to_s.scan(/\p{Alnum}+/).size
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/auto_source/scraper/semantic_html.rb CHANGED

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require_relative 'semantic_html/anchor_selector'
+require_relative 'semantic_html/deduplicator'
 module Html2rss
   class AutoSource
@@ -17,20 +18,20 @@ module Html2rss
       # The result is lower recall on weak-signal blocks, but much better link
       # quality on modern teaser cards that mix headlines, utility links, and
       # duplicate image overlays.
-      class SemanticHtml
+      class SemanticHtml # rubocop:disable Metrics/ClassLength
         include Enumerable
-        # Container plus selected anchor chosen for extraction.
-        Entry = Data.define(:container, :selected_anchor)
-        # Candidate semantic container selectors used to locate extractable blocks.
-        CONTAINER_SELECTORS = [
-          'article:not(:has(article))',
-          'section:not(:has(section))',
-          'li:not(:has(li))',
-          'tr:not(:has(tr))',
-          'div:not(:has(div))'
-        ].freeze
+        # Container plus selected anchor, scoring metadata, and extracted article.
+        Entry = Data.define(
+          :container,
+          :selected_anchor,
+          :destination_facts,
+          :quality_score,
+          :junk_score,
+          :final_score,
+          :position,
+          :article
+        )
         ##
         # @return [Symbol] config key used to enable or configure this scraper
@@ -53,6 +54,7 @@ module Html2rss
           @parsed_body = parsed_body
           @url = url
           @extractor = extractor
+          @link_heuristics = LinkHeuristics.new(url)
           @anchor_selector = AnchorSelector.new(url)
         end
@@ -71,14 +73,7 @@ module Html2rss
         def each
           return enum_for(:each) unless block_given?
-          extractable_entries.each do |entry|
-            article_hash = @extractor.new(
-              entry.container,
-              base_url: @url,
-              selected_anchor: entry.selected_anchor
-            ).call
-            yield article_hash if article_hash
-          end
+          ranked_entries.each { yield _1.article }
         end
         ##
@@ -100,28 +95,175 @@ module Html2rss
           @anchor_selector.primary_anchor_for(container)
         end
-        def extractable_entries
+        def extractable_entries # rubocop:disable Metrics/MethodLength
           @extractable_entries ||= candidate_containers.filter_map do |container|
             selected_anchor = primary_anchor_for(container)
             next unless selected_anchor
-            Entry.new(container:, selected_anchor:)
+            destination_facts = normalized_destination(selected_anchor)
+            next unless destination_facts
+            next if hard_junk_entry?(container, selected_anchor, destination_facts)
+            quality = quality_score(container, selected_anchor, destination_facts)
+            junk = junk_score(container, selected_anchor, destination_facts)
+            Entry.new(
+              container:,
+              selected_anchor:,
+              destination_facts:,
+              quality_score: quality,
+              junk_score: junk,
+              final_score: quality - junk,
+              position: document_position(container),
+              article: nil
+            )
           end
         end
-        def collect_candidate_containers
-          seen = {}.compare_by_identity
+        # rubocop:disable Metrics/MethodLength
+        def ranked_entries
+          @ranked_entries ||= begin
+            deduplicator = Deduplicator.new(@url, @extractor)
+            entries = deduplicator.call(extractable_entries)
+            entries = stable_rank(entries)
-          CONTAINER_SELECTORS.each_with_object([]) do |selector, containers|
-            parsed_body.css(selector).each do |container|
-              next if container.path.match?(Html::TAGS_TO_IGNORE)
-              next if seen[container]
+            entries.filter_map do |entry|
+              article = deduplicator.article_for(entry)
+              next unless article
-              seen[container] = true
-              containers << container
+              Entry.new(
+                container: entry.container,
+                selected_anchor: entry.selected_anchor,
+                destination_facts: entry.destination_facts,
+                quality_score: entry.quality_score,
+                junk_score: entry.junk_score,
+                final_score: entry.final_score,
+                position: entry.position,
+                article:
+              )
             end
           end
         end
+        # rubocop:enable Metrics/MethodLength
+        def collect_candidate_containers
+          HtmlExtractor::SemanticContainers.call(parsed_body)
+        end
+        private
+        def document_position(container)
+          (@document_positions ||= candidate_containers.each_with_index.to_h).fetch(container)
+        end
+        def quality_score(container, selected_anchor, destination_facts) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+          title = entry_title(container, selected_anchor)
+          words = word_count(title)
+          container_text = visible_text(container)
+          score = 0
+          score += 40 if words >= 3
+          score += 15 if words >= 7
+          score += 20 if destination_facts.url.path.to_s.length > 6
+          score += 15 if destination_facts.content_path
+          score += 15 if publish_marker?(container)
+          score += 10 if descriptive_context?(container_text, title)
+          score += 10 if article_container?(container)
+          score += 10 if content_tokens?(container_tokens(container))
+          score
+        end
+        def junk_score(container, selected_anchor, destination_facts) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+          title = entry_title(container, selected_anchor)
+          utility_text = @link_heuristics.utility_prefix_text?(title)
+          recommended_text = @link_heuristics.recommended_text?(title)
+          content_signal = destination_facts.content_path
+          no_content_signal = !content_signal
+          non_content_utility_path =
+            destination_facts.utility_path &&
+            no_content_signal &&
+            !destination_facts.strong_post_suffix
+          publish_signal = publish_marker?(container)
+          descriptive_signal = descriptive_context?(visible_text(container), title)
+          weak_container = !publish_signal && !descriptive_signal
+          score = 0
+          score += 25 if non_content_utility_path
+          score += 15 if utility_text && word_count(title) <= 6
+          score += 10 if destination_facts.shallow
+          score += 10 if weak_container
+          score += 10 if recommended_text && no_content_signal
+          score += 5 if destination_facts.high_confidence_junk_path
+          score += 15 if junk_tokens?(container_tokens(container))
+          score
+        end
+        def hard_junk_entry?(container, selected_anchor, destination_facts) # rubocop:disable Metrics/MethodLength
+          title = entry_title(container, selected_anchor)
+          publish_signal = publish_marker?(container)
+          descriptive_signal = descriptive_context?(visible_text(container), title)
+          content_signal = destination_facts.content_path
+          weak_article_candidate = article_signal_count(
+            container,
+            publish_signal:,
+            descriptive_signal:,
+            content_signal:
+          ) < 2
+          destination_facts.high_confidence_junk_path ||
+            (@link_heuristics.recommended_text?(title) && destination_facts.shallow && weak_article_candidate) ||
+            (@link_heuristics.utility_prefix_text?(title) &&
+              destination_facts.high_confidence_utility_destination &&
+              weak_article_candidate)
+        end
+        def publish_marker?(container)
+          container.at_css('time, [datetime], [itemprop="datePublished"], [itemprop="dateModified"]')
+        end
+        def article_signal_count(container, publish_signal:, descriptive_signal:, content_signal:)
+          [article_container?(container), publish_signal, descriptive_signal, content_signal].count(&:itself)
+        end
+        def article_container?(container) = container.name == 'article'
+        def descriptive_context?(container_text, title)
+          snippet = container_text.to_s.sub(/\A#{Regexp.escape(title.to_s)}/i, '')
+          word_count(snippet) >= 8
+        end
+        def heading_for(container) = container.at_css(AnchorSelector::HEADING_SELECTOR)
+        def normalized_destination(anchor) = @link_heuristics.destination_facts(anchor)
+        def visible_text(node)
+          return '' unless node
+          HtmlExtractor.extract_visible_text(node).to_s.strip
+        end
+        def entry_title(container, selected_anchor) = visible_text(heading_for(container) || selected_anchor)
+        def word_count(text) = text.to_s.scan(/\p{Alnum}+/).size
+        def container_tokens(container)
+          classes = container['class'].to_s.split
+          id = container['id'].to_s
+          (classes << id).flat_map { |str| str.downcase.split(/[-_]+/) }.reject(&:empty?)
+        end
+        def content_tokens?(tokens)
+          (@content_segments ||= LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:content)).intersect?(tokens.to_set)
+        end
+        def junk_tokens?(tokens)
+          (@junk_segments ||= LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:utility)).intersect?(tokens.to_set)
+        end
+        def stable_rank(entries)
+          entries.sort_by { |entry| [-entry.final_score, entry.position] }
+        end
       end
     end
   end

data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb CHANGED

@@ -54,7 +54,7 @@ module Html2rss
             return log_missing_api_root if href.empty?
             Html2rss::Url.from_relative(href, page_url)
-          rescue Addressable::URI::InvalidURIError, ArgumentError => error
+          rescue ArgumentError => error
             logger.warn("#{WordpressApi}: invalid WordPress API endpoint #{href.inspect} (#{error.message})")
             nil
           end

data/lib/html2rss/config/class_methods.rb CHANGED

@@ -138,13 +138,13 @@ module Html2rss
           },
           channel: { time_zone: 'UTC' },
           headers: RequestHeaders.browser_defaults,
-          stylesheets: []
+          stylesheets: Html2rss.configuration.stylesheets || []
         }
       end
       # @return [Symbol] the default strategy for feed orchestration
       def default_strategy_name
-        :auto
+        Html2rss.configuration.default_strategy || :auto
       end
       private

data/lib/html2rss/config/request_headers.rb CHANGED

@@ -17,13 +17,8 @@ module Html2rss
         */*;q=0.8
       ].join(',')
-      # Browser-like default `User-Agent` header value.
-      DEFAULT_USER_AGENT = [
-        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
-        'AppleWebKit/537.36 (KHTML, like Gecko)',
-        'Chrome/123.0.0.0',
-        'Safari/537.36'
-      ].join(' ')
+      # Default `User-Agent` header value.
+      DEFAULT_USER_AGENT = "html2rss/#{Html2rss::VERSION}".freeze
       # Baseline browser-like header set used for outbound requests.
       DEFAULT_HEADERS = {
@@ -40,9 +35,23 @@ module Html2rss
       class << self
         ##
-        # @return [Hash{String => String}] the unmodified default header set
+        # :reek:ManualDispatch
+        # :reek:TooManyStatements
+        #
+        # @return [Hash{String => String}] the default header set merged with global defaults
         def browser_defaults
-          DEFAULT_HEADERS.dup
+          defaults = DEFAULT_HEADERS.dup
+          global_headers = Html2rss.configuration.headers
+          global_headers = global_headers.call if global_headers.respond_to?(:call)
+          if global_headers.is_a?(Hash)
+            global_headers.each do |key, value|
+              canonical_key = key.to_s.split('-').map(&:capitalize).join('-')
+              defaults[canonical_key] = value.to_s
+            end
+          end
+          defaults
         end
         ##