RubyGems - html2rss - Versions diffs - 0.20.1 → 0.21.0 - Mend

html2rss 0.20.1 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

checksums.yaml +4 -4
data/html2rss.gemspec +1 -2
data/lib/html2rss/auto_source/scraper/html.rb +61 -16
data/lib/html2rss/auto_source/scraper/json_state.rb +40 -27
data/lib/html2rss/auto_source/scraper/link_heuristics.rb +85 -131
data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +74 -28
data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -2
data/lib/html2rss/auto_source/scraper/schema/thing.rb +31 -60
data/lib/html2rss/auto_source/scraper/schema.rb +8 -2
data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +4 -18
data/lib/html2rss/auto_source/scraper/semantic_html.rb +55 -11
data/lib/html2rss/auto_source/scraper.rb +0 -3
data/lib/html2rss/auto_source.rb +2 -11
data/lib/html2rss/category_extractor.rb +54 -20
data/lib/html2rss/html_extractor/enclosure_extractor.rb +60 -89
data/lib/html2rss/html_extractor/list_candidates.rb +2 -8
data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +29 -12
data/lib/html2rss/html_extractor/semantic_containers.rb +9 -35
data/lib/html2rss/html_extractor.rb +51 -30
data/lib/html2rss/rendering/description_builder.rb +3 -3
data/lib/html2rss/rss_builder/article.rb +44 -23
data/lib/html2rss/rss_builder/enclosure.rb +4 -2
data/lib/html2rss/selectors/post_processors/sanitize_html.rb +25 -36
data/lib/html2rss/selectors/post_processors/substring.rb +11 -18
data/lib/html2rss/selectors/post_processors/template.rb +3 -2
data/lib/html2rss/selectors.rb +18 -4
data/lib/html2rss/url.rb +4 -3
data/lib/html2rss/version.rb +1 -1
metadata +3 -17

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3eb20836fb55a5e33d114634c7c20bf2a65afd1a923af1bba82896797fdb099d
-  data.tar.gz: 97c00923e0ca5744cf82f22d54aa428cc784cba3e259bbcdcd9252928f9bc3c0
+  metadata.gz: 8168109d2cc60920d8a18b6b99970a5558e43163ad5cd11cb3d3f0d944d46943
+  data.tar.gz: 833a936f89f9ce31c0b4fb0036020c7962a4ac77e0dfa72f1134a0bae8bea4c4
 SHA512:
-  metadata.gz: 36431edafddcca32a53f562a75fbcd77fae969ca8c8fa7c2b6de77f121add2ee3c34c2b6e2e7b57109742780d62eeddde823b1a9bf8f4c1aaeed08a0ae4e5c90
-  data.tar.gz: d3eafa9cbbecc5ccaded7b21508e0af1c43337999cb5651f2d879df5217c1aa1f3e5484bcfd6354037d709715db1cc106f2946dd592745c0ac58252bcdd26ac8
+  metadata.gz: 734f286a486d49c86ab7baf48d157cdee9d988fdc8b693ac7d79bf3c64c661fcd54538d5e94dc19bdc8a6f3021168c1ecac2d8e34417f56879392d71600c7340
+  data.tar.gz: f008a767b452557cff1b45b1abb0eccb26f38d839417e95d21b8cf74f4546f9143b067e2d11f9fc00f5955c3f145f8933a1b1d4912ad25756c890280d4bb1a37

data/html2rss.gemspec CHANGED Viewed

@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
   spec.description   = 'Supports JSON content, custom HTTP headers, and post-processing of extracted content.'
   spec.homepage      = 'https://github.com/html2rss/html2rss'
   spec.license       = 'MIT'
-  spec.required_ruby_version = '>= 3.2'
+  spec.required_ruby_version = '>= 3.3'
   if spec.respond_to?(:metadata)
     spec.metadata['allowed_push_host'] = 'https://rubygems.org'
@@ -41,7 +41,6 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'kramdown'
   spec.add_dependency 'mime-types', '> 3.0'
   spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
-  spec.add_dependency 'parallel'
   spec.add_dependency 'puppeteer-ruby'
   spec.add_dependency 'regexp_parser'
   spec.add_dependency 'reverse_markdown', '~> 3.0'

data/lib/html2rss/auto_source/scraper/html.rb CHANGED Viewed

@@ -63,6 +63,7 @@ module Html2rss
           @extractor = extractor
           @opts = opts
           @link_heuristics = LinkHeuristics.new(url)
+          @ignored_cache = {}.compare_by_identity
         end
         attr_reader :parsed_body
@@ -73,10 +74,13 @@ module Html2rss
         def each
           return enum_for(:each) unless block_given?
-          each_article_tag do |article_tag, selected_anchor|
-            article_hash = extract_article(article_tag, selected_anchor:)
-            yield article_hash if article_hash
-          end
+          articles.each { yield _1 }
+        end
+        ##
+        # @return [Boolean] true when the scraper can likely extract articles
+        def extractable?
+          articles.any?
         end
         ##
@@ -91,7 +95,7 @@ module Html2rss
         # @return [Boolean] true when the node is a good extraction boundary
         def article_tag_condition?(node)
           # Ignore tags that are below ignored DOM chrome.
-          return false if HtmlExtractor.ignored_container_path?(node)
+          return false if HtmlExtractor.ignored_container_path?(node, @ignored_cache)
           return true if %w[body html].include?(node.name)
           return false unless (parent = node.parent)
@@ -100,14 +104,30 @@ module Html2rss
         private
+        def articles
+          @articles ||= each_article_tag.filter_map do |article_tag, selected_anchor|
+            extract_article(article_tag, selected_anchor:)
+          end
+        end
+        ##
+        # @return [Integer]
         def minimum_selector_frequency = @opts[:minimum_selector_frequency] || DEFAULT_MINIMUM_SELECTOR_FREQUENCY
+        ##
+        # @return [Boolean]
         def use_top_selectors = @opts[:use_top_selectors] || DEFAULT_USE_TOP_SELECTORS
+        ##
+        # @param node [Nokogiri::XML::Node]
+        # @return [Integer]
         def anchor_count(node)
-          @anchor_counts ||= {}
-          @anchor_counts[node.path] ||= node.name == 'a' ? 1 : node.css('a').size
+          (@anchor_counts ||= {}.compare_by_identity)[node] ||= node.name == 'a' ? 1 : node.css('a').size
         end
+        ##
+        # @param node [Nokogiri::XML::Node]
+        # @return [Boolean]
         def relevant_anchor?(node)
           destination_facts = @link_heuristics.destination_facts(node)
           return false unless destination_facts
@@ -115,14 +135,24 @@ module Html2rss
           !noise_anchor?(node, destination_facts)
         end
+        ##
+        # @yield [article_tag, selected_anchor]
+        # @yieldparam article_tag [Nokogiri::XML::Node]
+        # @yieldparam selected_anchor [Nokogiri::XML::Node]
+        # @return [Enumerator, nil]
         def each_article_tag(&block)
           return enum_for(:each_article_tag) unless block
-          list_candidates.each_article_tag(anchor_filter: method(:relevant_anchor?),
-                                           boundary_condition: method(:article_tag_condition?),
-                                           &block)
+          anchor_filter = ->(node) { relevant_anchor?(node) }
+          boundary_condition = ->(node) { article_tag_condition?(node) }
+          list_candidates.each_article_tag(anchor_filter:, boundary_condition:, &block)
         end
+        ##
+        # @param article_tag [Nokogiri::XML::Node]
+        # @param selected_anchor [Nokogiri::XML::Node, nil]
+        # @return [Hash, nil]
         def extract_article(article_tag, selected_anchor: nil)
           selected_anchor ||= preferred_anchor_for(article_tag)
           return unless selected_anchor
@@ -131,18 +161,28 @@ module Html2rss
           @extractor.new(article_tag, base_url: @url, selected_anchor:).call
         end
+        ##
+        # @param anchor [Nokogiri::XML::Node]
+        # @param destination_facts [DestinationFacts]
+        # @return [Boolean]
         def noise_anchor?(anchor, destination_facts) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
           return true unless destination_facts
-          text = HtmlExtractor.extract_visible_text(anchor).to_s.strip
+          (@noise_anchors ||= {}.compare_by_identity)[anchor] ||= begin
+            text = HtmlExtractor.extract_visible_text(anchor).to_s.strip
-          destination_facts.taxonomy_path ||
-            short_utility_label?(text, destination_facts) ||
-            (@link_heuristics.recommended_text?(text) && destination_facts.shallow) ||
-            (@link_heuristics.utility_prefix_text?(text) && destination_facts.high_confidence_utility_destination) ||
-            (@link_heuristics.utility_text?(text) && destination_facts.vanity_path)
+            destination_facts.taxonomy_path ||
+              short_utility_label?(text, destination_facts) ||
+              (@link_heuristics.recommended_text?(text) && destination_facts.shallow) ||
+              (@link_heuristics.utility_prefix_text?(text) && destination_facts.high_confidence_utility_destination) ||
+              (@link_heuristics.utility_text?(text) && destination_facts.vanity_path)
+          end
         end
+        ##
+        # @param text [String]
+        # @param destination_facts [DestinationFacts]
+        # @return [Boolean]
         def short_utility_label?(text, destination_facts)
           destination_facts.utility_path &&
             !destination_facts.content_path &&
@@ -150,11 +190,16 @@ module Html2rss
             text.scan(/\p{Alnum}+/).size <= 3
         end
+        ##
+        # @param article_tag [Nokogiri::XML::Node]
+        # @return [Nokogiri::XML::Node, nil]
         def preferred_anchor_for(article_tag)
           article_tag.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).find { relevant_anchor?(_1) } ||
             HtmlExtractor.main_anchor_for(article_tag)
         end
+        ##
+        # @return [HtmlExtractor::ListCandidates]
         def list_candidates
           HtmlExtractor::ListCandidates.new(
             parsed_body,

data/lib/html2rss/auto_source/scraper/json_state.rb CHANGED Viewed

@@ -30,6 +30,9 @@ module Html2rss
           /(?:window|self|globalThis)\.angular\s*=\s*/m
         ].freeze
+        # Combined regex for faster matching of global assignments.
+        GLOBAL_ASSIGNMENT_REGEXP = Regexp.union(GLOBAL_ASSIGNMENT_PATTERNS).freeze
         # Preferred keys when extracting title-like values from state payloads.
         TITLE_KEYS = %i[title headline name text].freeze
         # Preferred keys when extracting URL-like values from state payloads.
@@ -53,7 +56,12 @@ module Html2rss
           # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
           # @return [Array<Hash, Array>] parsed JSON documents discovered in scripts
           def json_documents(parsed_body)
-            script_documents(parsed_body) + assignment_documents(parsed_body)
+            # Use identity-based cache to avoid double-parsing of the same document.
+            # WeakMap allows the Nokogiri Document (key) to be garbage collected.
+            # rubocop:disable ThreadSafety/ClassInstanceVariable
+            (@cache ||= ObjectSpace::WeakMap.new)[parsed_body] ||=
+              script_documents(parsed_body) + assignment_documents(parsed_body)
+            # rubocop:enable ThreadSafety/ClassInstanceVariable
           end
           # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
@@ -80,15 +88,10 @@ module Html2rss
           def assignment_payload(text)
             trimmed = text.to_s.strip
             return if trimmed.empty?
+            return unless trimmed.match?(GLOBAL_ASSIGNMENT_REGEXP)
-            GLOBAL_ASSIGNMENT_PATTERNS.each do |pattern|
-              next unless trimmed.match?(pattern)
-              payload = trimmed.sub(pattern, '')
-              return extract_assignment_payload(payload)
-            end
-            nil
+            payload = trimmed.sub(GLOBAL_ASSIGNMENT_REGEXP, '')
+            extract_assignment_payload(payload)
           end
           # @param text [String] text potentially containing JSON-like payloads
@@ -116,8 +119,10 @@ module Html2rss
             in_string = false
             escape = false
-            text.each_char.with_index do |char, index|
-              next if index < start_index
+            i = start_index
+            len = text.length
+            while i < len
+              char = text[i]
               if in_string
                 if escape
@@ -127,24 +132,22 @@ module Html2rss
                 elsif char == '"'
                   in_string = false
                 end
-                next
-              end
-              case char
-              when '"'
-                in_string = true
-              when '{'
-                stack << '}'
-              when '['
-                stack << ']'
-              when '}', ']'
-                expected = stack.pop
-                return index if expected == char && stack.empty?
+              else
+                case char
+                when '"' then in_string = true
+                when '{' then stack << '}'
+                when '[' then stack << ']'
+                when '}', ']'
+                  expected = stack.pop
+                  return i if expected == char && stack.empty?
+                end
               end
+              i += 1
             end
             nil
           end
           # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
           # @param payload [String, nil] JSON payload to parse
@@ -184,8 +187,9 @@ module Html2rss
           # @param jsonish [String] JSON-like string with potentially unquoted keys
           # @return [String] payload with unquoted object keys quoted
           def quote_unquoted_keys(jsonish)
-            jsonish.gsub(/(\A\s*|[{,\[]\s*)([A-Za-z_]\w*)(\s*:)/) do
-              "#{Regexp.last_match(1)}\"#{Regexp.last_match(2)}\"#{Regexp.last_match(3)}"
+            jsonish.gsub(/(?<prefix>\A\s*|[{,\[]\s*)(?<key>[A-Za-z_]\w*)(?<suffix>\s*:)/) do
+              captures = Regexp.last_match.named_captures(symbolize_names: true)
+              "#{captures[:prefix]}\"#{captures[:key]}\"#{captures[:suffix]}"
             end
           end
@@ -415,12 +419,17 @@ module Html2rss
         attr_reader :parsed_body
+        # @return [Boolean] true when the page contains article-like arrays in JSON state
+        def extractable?
+          json_documents.any? { CandidateDetector.candidate_array?(_1) }
+        end
         # @yield [Hash{Symbol => Object}] normalized article hash
         # @return [Enumerator, void] article enumerator when no block is given
         def each
           return enum_for(:each) unless block_given?
-          DocumentScanner.json_documents(parsed_body).each do |document|
+          json_documents.each do |document|
             discover_articles(document) do |article|
               yield article if article
             end
@@ -431,6 +440,10 @@ module Html2rss
         attr_reader :url
+        def json_documents
+          self.class.json_documents(parsed_body)
+        end
         def discover_articles(document, &block)
           case document
           when Array then handle_array(document, &block)

data/lib/html2rss/auto_source/scraper/link_heuristics.rb CHANGED Viewed

@@ -24,19 +24,30 @@ module Html2rss
         ) do
           # @param url [Html2rss::Url] normalized destination URL
           # @return [DestinationFacts] route facts for downstream link scoring
-          def self.build(url)
+          def self.build(url) # rubocop:disable Metrics/MethodLength
             classifier = PathClassifier.new(url.path_segments)
             new(
               url:,
               destination: url.to_s,
-              **classifier.destination_attributes
+              segments: classifier.segments,
+              strong_post_suffix: classifier.strong_post_suffix?,
+              content_path: classifier.content_path?,
+              utility_path: classifier.utility_path?,
+              taxonomy_path: classifier.taxonomy_path?,
+              vanity_path: classifier.vanity_path?,
+              shallow: classifier.shallow?,
+              high_confidence_junk_path: classifier.junk_path?,
+              high_confidence_utility_destination: classifier.utility_destination?
             )
           end
         end
         # Extracts a normalized href from a Nokogiri anchor or raw href value.
         class HrefExtractor
+          # Regexp to capture everything before the first '#'
+          HREF_BASE_PATTERN = /\A([^#]*)/
           # @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
           # @return [String, nil] href without fragment, or nil when blank
           def self.call(anchor_or_href) = new(anchor_or_href).call
@@ -48,20 +59,18 @@ module Html2rss
           # @return [String, nil] href without fragment, or nil when blank
           def call
-            raw_href.to_s.split('#', 2).first.to_s.strip.then do |href|
-              href unless href.empty?
-            end
-          end
+            href = case @anchor_or_href
+                   when Nokogiri::XML::Node
+                     @anchor_or_href['href']
+                   else
+                     @anchor_or_href
+                   end
-          private
+            return unless href
-          def raw_href
-            case @anchor_or_href
-            when Nokogiri::XML::Node
-              @anchor_or_href['href']
-            else
-              @anchor_or_href
-            end
+            # Extract base part before # and strip whitespace
+            base = href.to_s[HREF_BASE_PATTERN, 1].strip
+            base unless base.empty?
           end
         end
@@ -125,8 +134,7 @@ module Html2rss
         end
         # Classifies normalized destination path segments for scoring.
-        # rubocop:disable Metrics/ClassLength
-        class PathClassifier
+        class PathClassifier # rubocop:disable Metrics/ClassLength
           attr_reader :segments
           # Segment groups used to classify article, taxonomy, utility, and vanity routes.
@@ -206,48 +214,25 @@ module Html2rss
             @segments = segments
           end
-          # @return [Hash] destination attributes consumed by DestinationFacts
-          def destination_attributes
-            route_attributes.merge(confidence_attributes)
-          end
-          # @return [Hash] baseline path classification attributes
-          def route_attributes
-            {
-              segments:,
-              content_path: content_path?,
-              utility_path: utility_path?,
-              taxonomy_path: taxonomy_path?,
-              vanity_path: vanity_path?,
-              shallow: shallow?,
-              strong_post_suffix: strong_post_suffix?
-            }
-          end
-          # @return [Hash] high-confidence noise classification attributes
-          def confidence_attributes
-            ConfidenceClassifier.new(self).attributes
-          end
           # @return [Boolean] true when the route has article-like path evidence
           def content_path?
-            @content_path ||= SEGMENT_SETS.fetch(:content).intersect?(segments.to_set) ||
+            @content_path ||= segments.any? { |s| SEGMENT_SETS[:content].include?(s) } ||
                               yearish_content_context?
           end
           # @return [Boolean] true when the route includes utility/navigation evidence
           def utility_path?
-            @utility_path ||= SEGMENT_SETS.fetch(:utility).intersect?(segments.to_set)
+            @utility_path ||= segments.any? { |s| SEGMENT_SETS[:utility].include?(s) }
           end
           # @return [Boolean] true when the route points at conversion or account chrome
           def vanity_path?
-            @vanity_path ||= SEGMENT_SETS.fetch(:vanity).intersect?(segments.to_set)
+            @vanity_path ||= segments.any? { |s| SEGMENT_SETS[:vanity].include?(s) }
           end
           # @return [Boolean] true when the route points at taxonomy/listing chrome
           def taxonomy_path?
-            @taxonomy_path ||= SEGMENT_SETS.fetch(:taxonomy).intersect?(segments.to_set)
+            @taxonomy_path ||= segments.any? { |s| SEGMENT_SETS[:taxonomy].include?(s) }
           end
           # @return [Boolean] true when the route is too shallow to strongly indicate an article
@@ -260,7 +245,9 @@ module Html2rss
           # @return [Boolean] true when the final path segment looks like a post slug
           def strong_post_suffix?
-            PostSuffixClassifier.new(segments).strong?
+            @strong_post_suffix ||= segments.any? &&
+                                    included_last_segment? &&
+                                    trusted_post_context?(segments.size - 1)
           end
           # @return [Boolean] true when every path segment is utility chrome
@@ -282,131 +269,81 @@ module Html2rss
           # @return [Boolean] true when the leading segments are all utility chrome
           def deep_utility_context_route?
-            LeadingSegments.new(segments).all_junk?
+            all_junk?(segments.size - 1)
           end
-          private
-          def yearish_content_context?
-            segments.any? { |segment| segment.match?(YEARISH_SEGMENT) } &&
-              (strong_post_suffix? || LeadingSegments.new(segments).trusted_post_context?)
-          end
-        end
-        # rubocop:enable Metrics/ClassLength
-        # Classifies high-confidence junk and utility routes from path facts.
-        class ConfidenceClassifier
-          # @param path [PathClassifier] classified destination path
-          def initialize(path)
-            @path = path
-          end
-          # @return [Hash] high-confidence route classification attributes
-          def attributes
-            {
-              high_confidence_junk_path: junk_path?,
-              high_confidence_utility_destination: utility_destination?
-            }
-          end
-          private
+          # @return [Boolean] true when the route is shallow and contains high-confidence noise
           def junk_path?
             return false if excluded_content_route?
-            @path.taxonomy_path? ||
-              @path.utility_only_route? ||
-              @path.deep_utility_context_route? ||
-              @path.shallow_high_confidence_route?
+            taxonomy_path? ||
+              utility_only_route? ||
+              deep_utility_context_route? ||
+              shallow_high_confidence_route?
           end
+          # @return [Boolean] true when the route points at conversion or account chrome
           def utility_destination?
             return false if excluded_content_route?
-            @path.vanity_path? || utility_route?
+            vanity_path? || utility_route?
+          end
+          private
+          def yearish_content_context?
+            segments.any? { |segment| segment.match?(YEARISH_SEGMENT) } &&
+              (strong_post_suffix? || trusted_post_context?(segments.size - 1))
           end
           def excluded_content_route?
-            @path.segments.empty? || @path.content_path? || @path.strong_post_suffix?
+            segments.empty? || content_path? || strong_post_suffix?
           end
           def utility_route?
-            @path.taxonomy_path? ||
-              @path.utility_only_route? ||
-              @path.deep_utility_context_route? ||
+            taxonomy_path? ||
+              utility_only_route? ||
+              deep_utility_context_route? ||
               shallow_utility_route?
           end
           def shallow_utility_route?
-            @path.shallow? && @path.utility_path?
-          end
-        end
-        # Classifies route context before the final segment.
-        class LeadingSegments
-          # @param segments [Array<String>] normalized URL path segments
-          def initialize(segments)
-            @segments = segments[0...-1]
+            shallow? && utility_path?
           end
-          # @return [Boolean] true when every leading segment is utility chrome
-          def all_junk?
-            junk_segments = PathClassifier::SEGMENT_SETS.fetch(:high_confidence_junk)
+          def all_junk?(limit)
+            return false if limit <= 0
-            @segments.any? && @segments.all? { |segment| junk_segments.include?(segment) }
+            junk_segments = SEGMENT_SETS.fetch(:high_confidence_junk)
+            (0...limit).all? { |i| junk_segments.include?(segments[i]) }
           end
-          # @return [Boolean] true when leading segments provide article context
-          def trusted_post_context?
-            content_segments = PathClassifier::SEGMENT_SETS.fetch(:content)
-            context_segments = PathClassifier::SEGMENT_SETS.fetch(:deep_post_context)
+          def trusted_post_context?(limit)
+            return false if limit <= 0
+            content_segments = SEGMENT_SETS.fetch(:content)
+            context_segments = SEGMENT_SETS.fetch(:deep_post_context)
-            @segments.any? do |segment|
+            (0...limit).any? do |i|
+              segment = segments[i]
               content_segments.include?(segment) ||
                 segment.match?(PathClassifier::YEARISH_SEGMENT) ||
                 context_segments.include?(segment)
             end
           end
-        end
-        # Classifies whether the final segment is a strong post-like suffix.
-        class PostSuffixClassifier
-          # @param segments [Array<String>] normalized URL path segments
-          def initialize(segments)
-            @segments = segments
-          end
-          # @return [Boolean] true when the final path segment looks like a post slug
-          def strong?
-            @segments.any? &&
-              included_last_segment? &&
-              LeadingSegments.new(@segments).trusted_post_context?
-          end
-          private
           def included_last_segment?
             !excluded_last_segment? && slug_last_segment?
           end
           def excluded_last_segment?
-            excluded_segments.any? { |segment| segment.include?(last_segment) }
-          end
-          def excluded_segments
-            [
-              PathClassifier::SEGMENT_SETS.fetch(:high_confidence_junk),
-              PathClassifier::SEGMENT_SETS.fetch(:vanity)
-            ]
+            last = segments.last
+            [SEGMENT_SETS[:high_confidence_junk], SEGMENT_SETS[:vanity]].any? { |set| set.include?(last) }
           end
           def slug_last_segment?
-            last_segment.match?(PathClassifier::YEARISH_SEGMENT) ||
-              last_segment.match?(PathClassifier::POST_SLUG_SEGMENT)
-          end
-          def last_segment
-            @segments.last
+            last = segments.last
+            last.match?(YEARISH_SEGMENT) || last.match?(POST_SLUG_SEGMENT)
           end
         end
@@ -421,11 +358,15 @@ module Html2rss
         # @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
         # @return [DestinationFacts, nil] normalized destination facts, or nil for blank/invalid URLs
         def destination_facts(anchor_or_href)
+          return node_facts[anchor_or_href] if node_facts.key?(anchor_or_href)
           href = HrefExtractor.call(anchor_or_href)
           return unless href
-          url = Html2rss::Url.from_relative(href, @base_url)
-          DestinationFacts.build(url)
+          res = memoized_destination_facts(href)
+          node_facts[anchor_or_href] = res if anchor_or_href.is_a?(Nokogiri::XML::Node)
+          res
         rescue ArgumentError
           nil
         end
@@ -441,6 +382,19 @@ module Html2rss
         # @param text [String, #to_s] visible anchor text
         # @return [Boolean] true when text identifies recommendation chrome
         def recommended_text?(text) = @text_classifier.recommended?(text)
+        private
+        def node_facts
+          @node_facts ||= {}.compare_by_identity
+        end
+        def memoized_destination_facts(href)
+          (@destination_facts ||= {})[href] ||= begin
+            url = Html2rss::Url.from_relative(href, @base_url)
+            DestinationFacts.build(url)
+          end
+        end
       end
     end
   end