RubyGems - html2rss - Versions diffs - 0.19.1 → 0.20.0 - Mend

html2rss 0.19.1 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

checksums.yaml +4 -4
data/lib/html2rss/auto_source/scraper/html.rb +48 -56
data/lib/html2rss/auto_source/scraper/link_heuristics.rb +447 -0
data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +6 -161
data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +102 -0
data/lib/html2rss/auto_source/scraper/semantic_html.rb +172 -30
data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +1 -1
data/lib/html2rss/config/class_methods.rb +2 -2
data/lib/html2rss/config/request_headers.rb +18 -9
data/lib/html2rss/configuration.rb +176 -0
data/lib/html2rss/html_extractor/list_candidates.rb +94 -0
data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +257 -0
data/lib/html2rss/html_extractor/semantic_containers.rb +70 -0
data/lib/html2rss/html_extractor.rb +11 -0
data/lib/html2rss/rss_builder/channel.rb +10 -7
data/lib/html2rss/url.rb +2 -0
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +54 -5
metadata +9 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 69268fde80ddaa21f5ca3588de51f63182909714956af3ed8b1ee11a47075dc8
-  data.tar.gz: 045dfb3fec6cebfa8c7d066acd12c056dbf01766bbe1c292642d6d4d9db72055
+  metadata.gz: 1e4867b7a4906d0e4bb9d6cb9facfe96da516175a82fe10824e9ed579cf4aa3d
+  data.tar.gz: 53a8f699b87817b2b62cbe5d5d1761f33004f0b3be1ddb6c7e2428d449923a7c
 SHA512:
-  metadata.gz: de88861fd21375da62549cbed418f5f1550e7adf8e9c6ea98cfce9331944067bc8aa43eacdbdfe3ab6380764e485031f1d6bb3b456e0bf486340864328a5abc8
-  data.tar.gz: 6f65cd2e7dc555c35cb456bff595184331f3df07dafe10c50d6d334102237b0f98a5971c6033d8e56c4504254726ca0eca9757c22b1a697c2a47b8b76681945a
+  metadata.gz: a38e85afebf7bd17739915cf9f59846a76212690dc5309a02f37b24f4910247a1e34e3c3eeede165f52ea8c77dff39de472a65b5b5d3b6d8b99c02a19f6dfb0a
+  data.tar.gz: e3a0ad5868a070adf65a0cd78b560d196df7ed932fa424372397f906e0e2afc362eeb0afa864ddcb9bef10cf344efc2f6d9505c8515be502df342fc2a0975252

data/lib/html2rss/auto_source/scraper/html.rb CHANGED

@@ -19,9 +19,8 @@ module Html2rss
       class Html
         include Enumerable
-        # Elements ignored when traversing potential article containers.
-        TAGS_TO_IGNORE = /(nav|footer|header|svg|script|style)/i
+        # Absolute base URL used when probe-time detection needs to normalize relative hrefs.
+        DETECTION_BASE_URL = 'https://example.com'
         # Minimum selector frequency required to treat a path as a stable list signal.
         DEFAULT_MINIMUM_SELECTOR_FREQUENCY = 2
         # Number of most frequent selectors kept for container extraction.
@@ -39,7 +38,7 @@ module Html2rss
         # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
         # @return [Boolean] true when the scraper can likely extract articles
         def self.articles?(parsed_body)
-          new(parsed_body, url: '').any?
+          new(parsed_body, url: DETECTION_BASE_URL).any?
         end
         ##
@@ -49,7 +48,7 @@ module Html2rss
         # @param xpath [String] original XPath
         # @return [String] XPath without positional indexes
         def self.simplify_xpath(xpath)
-          xpath.gsub(/\[\d+\]/, '')
+          HtmlExtractor::ListCandidates.simplify_xpath(xpath)
         end
         # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
@@ -63,6 +62,7 @@ module Html2rss
           @url = url
           @extractor = extractor
           @opts = opts
+          @link_heuristics = LinkHeuristics.new(url)
         end
         attr_reader :parsed_body
@@ -73,8 +73,8 @@ module Html2rss
         def each
           return enum_for(:each) unless block_given?
-          each_article_tag do |article_tag|
-            article_hash = extract_article(article_tag)
+          each_article_tag do |article_tag, selected_anchor|
+            article_hash = extract_article(article_tag, selected_anchor:)
             yield article_hash if article_hash
           end
         end
@@ -90,8 +90,8 @@ module Html2rss
         # @param node [Nokogiri::XML::Node] candidate boundary node
         # @return [Boolean] true when the node is a good extraction boundary
         def article_tag_condition?(node)
-          # Ignore tags that are below a tag which is in TAGS_TO_IGNORE.
-          return false if node.path.match?(TAGS_TO_IGNORE)
+          # Ignore tags that are below ignored DOM chrome.
+          return false if HtmlExtractor.ignored_container_path?(node)
           return true if %w[body html].include?(node.name)
           return false unless (parent = node.parent)
@@ -100,24 +100,6 @@ module Html2rss
         private
-        ##
-        # Find relevant anchors in root.
-        # @return [Set<String>] The set of XPath selectors
-        def selectors
-          @selectors ||= Hash.new(0).tap do |selectors|
-            each_relevant_anchor { |node| increment_selector_count(selectors, node) }
-          end
-        end
-        ##
-        # Filter the frequent selectors by the minimum_selector_frequency and use_top_selectors.
-        # @return [Array<String>] The filtered selectors
-        def filtered_selectors
-          selectors.select { |_selector, count| count >= minimum_selector_frequency }
-                   .max_by(use_top_selectors, &:last)
-                   .map(&:first)
-        end
         def minimum_selector_frequency = @opts[:minimum_selector_frequency] || DEFAULT_MINIMUM_SELECTOR_FREQUENCY
         def use_top_selectors = @opts[:use_top_selectors] || DEFAULT_USE_TOP_SELECTORS
@@ -126,49 +108,59 @@ module Html2rss
           @anchor_counts[node.path] ||= node.name == 'a' ? 1 : node.css('a').size
         end
-        def each_relevant_anchor
-          return enum_for(:each_relevant_anchor) unless block_given?
+        def relevant_anchor?(node)
+          destination_facts = @link_heuristics.destination_facts(node)
+          return false unless destination_facts
-          traversal_root&.traverse do |node|
-            yield node if relevant_anchor?(node)
-          end
+          !noise_anchor?(node, destination_facts)
         end
-        def relevant_anchor?(node)
-          node.element? && node.name == 'a' && !String(node['href']).empty?
-        end
+        def each_article_tag(&block)
+          return enum_for(:each_article_tag) unless block
-        def increment_selector_count(selectors, node)
-          path = self.class.simplify_xpath(node.path)
-          selectors[path] += 1 unless path.match?(TAGS_TO_IGNORE)
+          list_candidates.each_article_tag(anchor_filter: method(:relevant_anchor?),
+                                           boundary_condition: method(:article_tag_condition?),
+                                           &block)
         end
-        def traversal_root
-          parsed_body.at_css('body, html') || parsed_body.root
+        def extract_article(article_tag, selected_anchor: nil)
+          selected_anchor ||= preferred_anchor_for(article_tag)
+          return unless selected_anchor
+          return if noise_anchor?(selected_anchor, @link_heuristics.destination_facts(selected_anchor))
+          @extractor.new(article_tag, base_url: @url, selected_anchor:).call
         end
-        def each_article_tag
-          return enum_for(:each_article_tag) unless block_given?
+        def noise_anchor?(anchor, destination_facts) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
+          return true unless destination_facts
-          filtered_selectors.each do |selector|
-            parsed_body.xpath(selector).each do |selected_tag|
-              article_tag = article_tag_for(selected_tag)
-              yield article_tag if article_tag
-            end
-          end
-        end
+          text = HtmlExtractor.extract_visible_text(anchor).to_s.strip
-        def article_tag_for(selected_tag)
-          return if selected_tag.path.match?(Html::TAGS_TO_IGNORE)
+          destination_facts.taxonomy_path ||
+            short_utility_label?(text, destination_facts) ||
+            (@link_heuristics.recommended_text?(text) && destination_facts.shallow) ||
+            (@link_heuristics.utility_prefix_text?(text) && destination_facts.high_confidence_utility_destination) ||
+            (@link_heuristics.utility_text?(text) && destination_facts.vanity_path)
+        end
-          HtmlNavigator.parent_until_condition(selected_tag, method(:article_tag_condition?))
+        def short_utility_label?(text, destination_facts)
+          destination_facts.utility_path &&
+            !destination_facts.content_path &&
+            !destination_facts.strong_post_suffix &&
+            text.scan(/\p{Alnum}+/).size <= 3
         end
-        def extract_article(article_tag)
-          selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
-          return unless selected_anchor
+        def preferred_anchor_for(article_tag)
+          article_tag.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).find { relevant_anchor?(_1) } ||
+            HtmlExtractor.main_anchor_for(article_tag)
+        end
-          @extractor.new(article_tag, base_url: @url, selected_anchor:).call
+        def list_candidates
+          HtmlExtractor::ListCandidates.new(
+            parsed_body,
+            minimum_selector_frequency:,
+            use_top_selectors:
+          )
         end
       end
     end

data/lib/html2rss/auto_source/scraper/link_heuristics.rb ADDED

@@ -0,0 +1,447 @@
+# frozen_string_literal: true
+module Html2rss
+  class AutoSource
+    module Scraper
+      ##
+      # Shared link-level heuristics used by scraper-local selection and
+      # scoring. This keeps normalization and route/text classification
+      # consistent without moving scraper policy into higher orchestration.
+      class LinkHeuristics
+        # Normalized URL plus reusable route-classification facts for one link.
+        DestinationFacts = Data.define(
+          :url,
+          :destination,
+          :segments,
+          :content_path,
+          :utility_path,
+          :taxonomy_path,
+          :vanity_path,
+          :shallow,
+          :strong_post_suffix,
+          :high_confidence_junk_path,
+          :high_confidence_utility_destination
+        ) do
+          # @param url [Html2rss::Url] normalized destination URL
+          # @return [DestinationFacts] route facts for downstream link scoring
+          def self.build(url)
+            classifier = PathClassifier.new(url.path_segments)
+            new(
+              url:,
+              destination: url.to_s,
+              **classifier.destination_attributes
+            )
+          end
+        end
+        # Extracts a normalized href from a Nokogiri anchor or raw href value.
+        class HrefExtractor
+          # @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
+          # @return [String, nil] href without fragment, or nil when blank
+          def self.call(anchor_or_href) = new(anchor_or_href).call
+          # @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
+          def initialize(anchor_or_href)
+            @anchor_or_href = anchor_or_href
+          end
+          # @return [String, nil] href without fragment, or nil when blank
+          def call
+            raw_href.to_s.split('#', 2).first.to_s.strip.then do |href|
+              href unless href.empty?
+            end
+          end
+          private
+          def raw_href
+            case @anchor_or_href
+            when Nokogiri::XML::Node
+              @anchor_or_href['href']
+            else
+              @anchor_or_href
+            end
+          end
+        end
+        # Classifies visible anchor text for utility and recommendation chrome.
+        class TextClassifier
+          # Prefix labels that usually identify navigation or subscription links.
+          UTILITY_PREFIX_PATTERN = /
+            \A\s*(
+              # English
+              view\s+all|see\s+all|all\s+news|subscribe|newsletter|comment\s+feed|comments\s+feed|join|premium|plus|
+              # German
+              alle\s+anzeigen|alle\s+news|abonnieren|newsletter|kommentar\s+feed|mitmachen|
+              # Spanish
+              ver\s+todos|ver\s+todo|todas\s+las\s+noticias|suscribirse|bolet(i|í)n|comentarios\s+feed|unirse|
+              # French
+              voir\s+tout|voir\s+tous|toutes\s+les\s+nouvelles|s['’]abonner|flux\s+de\s+commentaires|rejoindre
+            )\b
+          /ix
+          # Short labels that usually identify non-article navigation links.
+          UTILITY_PATTERN = /
+            \A\s*(
+              # English
+              about|contact|comments?|join|log\s+in|login|member(ship)?|
+              plus|premium|pricing|recommended(\s+for\s+you)?|
+              see\s+all|share|sign\s+up|signup|subscribe|view\s+all|
+              # German
+              (ue|ü)ber(\s+uns)?|kontakt|kommentare?|mitmachen|anmelden|login|
+              mitglied(schaft)?|empfohlen(\s+f(ue|ü)r\s+dich)?|alle\s+anzeigen|
+              teilen|registrieren|abonnieren|newsletter|
+              # Spanish
+              sobre(\s+nosotros)?|contacto|comentarios?|unirse|iniciar\s+sesion|
+              login|miembro|membres(i|í)a|recomendado(\s+para\s+ti)?|ver\s+todo|
+              compartir|registrarse|suscribirse|bolet(i|í)n|
+              # French
+              (a|à)\s+propos|(a|à)propos|contact|commentaires?|rejoindre|
+              se\s+connecter|login|membre|abonnement|recommand(e|é)(\s+pour\s+vous)?|
+              voir\s+tout|partager|s['’]inscrire|s['’]abonner|newsletter
+            )\b
+          /ix
+          # Labels for recommendation chrome rather than source articles.
+          RECOMMENDED_PATTERN = /
+            \A\s*(
+              recommended(\s+for\s+you)?|
+              empfohlen(\s+f(ue|ü)r\s+dich)?|
+              recomendado(\s+para\s+ti)?|
+              recommand(e|é)(\s+pour\s+vous)?
+            )\b
+          /ix
+          # @param text [String, #to_s] visible anchor text
+          # @return [Boolean] true when text matches a utility label
+          def utility?(text) = text.to_s.match?(UTILITY_PATTERN)
+          # @param text [String, #to_s] visible anchor text
+          # @return [Boolean] true when text begins with a utility label
+          def utility_prefix?(text) = text.to_s.match?(UTILITY_PREFIX_PATTERN)
+          # @param text [String, #to_s] visible anchor text
+          # @return [Boolean] true when text identifies recommendation chrome
+          def recommended?(text) = text.to_s.match?(RECOMMENDED_PATTERN)
+        end
+        # Classifies normalized destination path segments for scoring.
+        # rubocop:disable Metrics/ClassLength
+        class PathClassifier
+          attr_reader :segments
+          # Segment groups used to classify article, taxonomy, utility, and vanity routes.
+          SEGMENT_SETS = {
+            content: %w[
+              article articles blog blogs changelog changelogs insight insights
+              launch launches news post posts release releases story stories update updates
+              artikel beitrag beitraege nachrichten neuigkeiten aktuelles
+              articulo articulos noticia noticias entrada entradas publicacion publicaciones
+              actualite actualites nouvelle nouvelles
+              teaser teasers card cards
+            ].to_set.freeze,
+            utility: %w[
+              about account archive archives author authors category categories comment comments
+              contact feedback help login logout newsletter newsletters notification notifications
+              preference preferences profile register search settings share signup subscribe
+              tag tags topic topics
+              feed feeds comment-feed comments-feed
+              recommended
+              for-you
+              privacy terms cookie cookies
+              join member members membership plus premium plans pricing user users
+              kategorie kategorien schlagwort schlagworte thema themen autor autoren archiv
+              ueber-uns ueber ueberuns profil kontakt impressum suche hilfe anmelden registrieren
+              konto registrierung anmeldung abonnieren abo datenschutz nutzungsbedingungen agb
+              categoria categorias etiqueta etiquetas tema temas autores archivos
+              sobre-nosotros sobre quienes-somos buscar busqueda ayuda entrar ingresar
+              registrarse registro cuenta suscribirse boletin privacidad condiciones
+              categorie etiquette etiquettes sujet sujets theme themes auteur auteurs
+              a-propos apropos recherche rechercher aide connexion s-inscrire
+              sinscrire inscription compte s-abonner saboner lettre-information confidentialite mentions-legales cgu
+              menu sidebar widget social modal popup banner promo ad ads
+              related recommendation recommendations pagination pager
+            ].to_set.freeze,
+            high_confidence_junk: %w[
+              about account archive archives author authors category categories comment comments
+              contact cookie cookies feedback feed feeds help login logout notification notifications
+              preference preferences privacy profile register search settings share signup subscribe
+              tag tags terms topic topics comment-feed comments-feed user users
+              kategorie kategorien schlagwort schlagworte thema themen autor autoren archiv
+              ueber-uns ueber ueberuns profil kontakt impressum suche hilfe anmelden registrieren
+              konto registrierung anmeldung abonnieren abo datenschutz nutzungsbedingungen agb
+              categoria categorias etiqueta etiquetas tema temas autores archivos
+              sobre-nosotros sobre quienes-somos buscar busqueda ayuda entrar ingresar
+              registrarse registro cuenta suscribirse boletin privacidad condiciones
+              categorie etiquette etiquettes sujet sujets theme themes auteur auteurs
+              a-propos apropos recherche rechercher aide connexion s-inscrire
+              sinscrire inscription compte s-abonner saboner lettre-information confidentialite mentions-legales cgu
+              menu sidebar widget social modal popup banner promo ad ads
+              related recommendation recommendations pagination pager
+            ].to_set.freeze,
+            taxonomy: %w[
+              category categories tag tags topic topics
+              kategorie kategorien schlagwort schlagworte thema themen
+              categoria categorias etiqueta etiquetas tema temas
+              categorie etiquette etiquettes sujet sujets theme themes
+            ].to_set.freeze,
+            vanity: %w[
+              join membership plus premium pricing plans subscribe signup
+              abonnieren abo
+              suscribirse boletin
+              s-abonner saboner
+            ].to_set.freeze,
+            deep_post_context: %w[
+              press newsroom
+              presse pressemitteilungen
+              prensa
+            ].to_set.freeze
+          }.freeze
+          # Path segment that begins with a year-like publishing marker.
+          YEARISH_SEGMENT = /\A\d{4,}[\w-]*\z/
+          # Hyphenated slug shape common to article permalinks.
+          POST_SLUG_SEGMENT = /\A[a-z0-9]+(?:-[a-z0-9]+){2,}\z/i
+          # @param segments [Array<String>] normalized URL path segments
+          def initialize(segments)
+            @segments = segments
+          end
+          # @return [Hash] destination attributes consumed by DestinationFacts
+          def destination_attributes
+            route_attributes.merge(confidence_attributes)
+          end
+          # @return [Hash] baseline path classification attributes
+          def route_attributes
+            {
+              segments:,
+              content_path: content_path?,
+              utility_path: utility_path?,
+              taxonomy_path: taxonomy_path?,
+              vanity_path: vanity_path?,
+              shallow: shallow?,
+              strong_post_suffix: strong_post_suffix?
+            }
+          end
+          # @return [Hash] high-confidence noise classification attributes
+          def confidence_attributes
+            ConfidenceClassifier.new(self).attributes
+          end
+          # @return [Boolean] true when the route has article-like path evidence
+          def content_path?
+            @content_path ||= SEGMENT_SETS.fetch(:content).intersect?(segments.to_set) ||
+                              yearish_content_context?
+          end
+          # @return [Boolean] true when the route includes utility/navigation evidence
+          def utility_path?
+            @utility_path ||= SEGMENT_SETS.fetch(:utility).intersect?(segments.to_set)
+          end
+          # @return [Boolean] true when the route points at conversion or account chrome
+          def vanity_path?
+            @vanity_path ||= SEGMENT_SETS.fetch(:vanity).intersect?(segments.to_set)
+          end
+          # @return [Boolean] true when the route points at taxonomy/listing chrome
+          def taxonomy_path?
+            @taxonomy_path ||= SEGMENT_SETS.fetch(:taxonomy).intersect?(segments.to_set)
+          end
+          # @return [Boolean] true when the route is too shallow to strongly indicate an article
+          def shallow?
+            segment_count = segments.size
+            junk_segments = SEGMENT_SETS.fetch(:high_confidence_junk)
+            segment_count <= 1 || (segment_count == 2 && junk_segments.include?(segments.last))
+          end
+          # @return [Boolean] true when the final path segment looks like a post slug
+          def strong_post_suffix?
+            PostSuffixClassifier.new(segments).strong?
+          end
+          # @return [Boolean] true when every path segment is utility chrome
+          def utility_only_route?
+            junk_segments = SEGMENT_SETS.fetch(:high_confidence_junk)
+            segments.all? { |segment| junk_segments.include?(segment) }
+          end
+          # @return [Boolean] true when the route is shallow and contains high-confidence noise
+          def shallow_high_confidence_route?
+            junk_segments = SEGMENT_SETS.fetch(:high_confidence_junk)
+            vanity_segments = SEGMENT_SETS.fetch(:vanity)
+            shallow? && segments.any? do |segment|
+              junk_segments.include?(segment) || vanity_segments.include?(segment)
+            end
+          end
+          # @return [Boolean] true when the leading segments are all utility chrome
+          def deep_utility_context_route?
+            LeadingSegments.new(segments).all_junk?
+          end
+          private
+          def yearish_content_context?
+            segments.any? { |segment| segment.match?(YEARISH_SEGMENT) } &&
+              (strong_post_suffix? || LeadingSegments.new(segments).trusted_post_context?)
+          end
+        end
+        # rubocop:enable Metrics/ClassLength
+        # Classifies high-confidence junk and utility routes from path facts.
+        class ConfidenceClassifier
+          # @param path [PathClassifier] classified destination path
+          def initialize(path)
+            @path = path
+          end
+          # @return [Hash] high-confidence route classification attributes
+          def attributes
+            {
+              high_confidence_junk_path: junk_path?,
+              high_confidence_utility_destination: utility_destination?
+            }
+          end
+          private
+          def junk_path?
+            return false if excluded_content_route?
+            @path.taxonomy_path? ||
+              @path.utility_only_route? ||
+              @path.deep_utility_context_route? ||
+              @path.shallow_high_confidence_route?
+          end
+          def utility_destination?
+            return false if excluded_content_route?
+            @path.vanity_path? || utility_route?
+          end
+          def excluded_content_route?
+            @path.segments.empty? || @path.content_path? || @path.strong_post_suffix?
+          end
+          def utility_route?
+            @path.taxonomy_path? ||
+              @path.utility_only_route? ||
+              @path.deep_utility_context_route? ||
+              shallow_utility_route?
+          end
+          def shallow_utility_route?
+            @path.shallow? && @path.utility_path?
+          end
+        end
+        # Classifies route context before the final segment.
+        class LeadingSegments
+          # @param segments [Array<String>] normalized URL path segments
+          def initialize(segments)
+            @segments = segments[0...-1]
+          end
+          # @return [Boolean] true when every leading segment is utility chrome
+          def all_junk?
+            junk_segments = PathClassifier::SEGMENT_SETS.fetch(:high_confidence_junk)
+            @segments.any? && @segments.all? { |segment| junk_segments.include?(segment) }
+          end
+          # @return [Boolean] true when leading segments provide article context
+          def trusted_post_context?
+            content_segments = PathClassifier::SEGMENT_SETS.fetch(:content)
+            context_segments = PathClassifier::SEGMENT_SETS.fetch(:deep_post_context)
+            @segments.any? do |segment|
+              content_segments.include?(segment) ||
+                segment.match?(PathClassifier::YEARISH_SEGMENT) ||
+                context_segments.include?(segment)
+            end
+          end
+        end
+        # Classifies whether the final segment is a strong post-like suffix.
+        class PostSuffixClassifier
+          # @param segments [Array<String>] normalized URL path segments
+          def initialize(segments)
+            @segments = segments
+          end
+          # @return [Boolean] true when the final path segment looks like a post slug
+          def strong?
+            @segments.any? &&
+              included_last_segment? &&
+              LeadingSegments.new(@segments).trusted_post_context?
+          end
+          private
+          def included_last_segment?
+            !excluded_last_segment? && slug_last_segment?
+          end
+          def excluded_last_segment?
+            excluded_segments.any? { |segment| segment.include?(last_segment) }
+          end
+          def excluded_segments
+            [
+              PathClassifier::SEGMENT_SETS.fetch(:high_confidence_junk),
+              PathClassifier::SEGMENT_SETS.fetch(:vanity)
+            ]
+          end
+          def slug_last_segment?
+            last_segment.match?(PathClassifier::YEARISH_SEGMENT) ||
+              last_segment.match?(PathClassifier::POST_SLUG_SEGMENT)
+          end
+          def last_segment
+            @segments.last
+          end
+        end
+        # @param base_url [String, Html2rss::Url] page URL used to resolve relative hrefs
+        def initialize(base_url)
+          @base_url = base_url
+          @text_classifier = TextClassifier.new
+        end
+        # Builds normalized destination facts for an anchor element or href string.
+        #
+        # @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
+        # @return [DestinationFacts, nil] normalized destination facts, or nil for blank/invalid URLs
+        def destination_facts(anchor_or_href)
+          href = HrefExtractor.call(anchor_or_href)
+          return unless href
+          url = Html2rss::Url.from_relative(href, @base_url)
+          DestinationFacts.build(url)
+        rescue ArgumentError
+          nil
+        end
+        # @param text [String, #to_s] visible anchor text
+        # @return [Boolean] true when text matches a utility label
+        def utility_text?(text) = @text_classifier.utility?(text)
+        # @param text [String, #to_s] visible anchor text
+        # @return [Boolean] true when text begins with a utility label
+        def utility_prefix_text?(text) = @text_classifier.utility_prefix?(text)
+        # @param text [String, #to_s] visible anchor text
+        # @return [Boolean] true when text identifies recommendation chrome
+        def recommended_text?(text) = @text_classifier.recommended?(text)
+      end
+    end
+  end
+end