RubyGems - html2rss - Versions diffs - 0.20.1 → 0.21.0 - Mend

html2rss 0.20.1 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29)

checksums.yaml +4 -4
data/html2rss.gemspec +1 -2
data/lib/html2rss/auto_source/scraper/html.rb +61 -16
data/lib/html2rss/auto_source/scraper/json_state.rb +40 -27
data/lib/html2rss/auto_source/scraper/link_heuristics.rb +85 -131
data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +74 -28
data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -2
data/lib/html2rss/auto_source/scraper/schema/thing.rb +31 -60
data/lib/html2rss/auto_source/scraper/schema.rb +8 -2
data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +4 -18
data/lib/html2rss/auto_source/scraper/semantic_html.rb +55 -11
data/lib/html2rss/auto_source/scraper.rb +0 -3
data/lib/html2rss/auto_source.rb +2 -11
data/lib/html2rss/category_extractor.rb +54 -20
data/lib/html2rss/html_extractor/enclosure_extractor.rb +60 -89
data/lib/html2rss/html_extractor/list_candidates.rb +2 -8
data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +29 -12
data/lib/html2rss/html_extractor/semantic_containers.rb +9 -35
data/lib/html2rss/html_extractor.rb +51 -30
data/lib/html2rss/rendering/description_builder.rb +3 -3
data/lib/html2rss/rss_builder/article.rb +44 -23
data/lib/html2rss/rss_builder/enclosure.rb +4 -2
data/lib/html2rss/selectors/post_processors/sanitize_html.rb +25 -36
data/lib/html2rss/selectors/post_processors/substring.rb +11 -18
data/lib/html2rss/selectors/post_processors/template.rb +3 -2
data/lib/html2rss/selectors.rb +18 -4
data/lib/html2rss/url.rb +4 -3
data/lib/html2rss/version.rb +1 -1
metadata +3 -17

data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb CHANGED Viewed

@@ -13,11 +13,10 @@ module Html2rss
           # @param schema_object [Hash] The schema object
           # @return [Array<String>] Array of category strings
           def self.call(schema_object)
-            # Build union of all category sources
-            field_categories = extract_field_categories(schema_object)
-            about_categories = extract_about_categories(schema_object)
-            (field_categories | about_categories).to_a
+            Set.new.tap do |categories|
+              extract_field_categories!(categories, schema_object)
+              extract_about_categories!(categories, schema_object)
+            end.to_a
           end
           ##
@@ -26,10 +25,18 @@ module Html2rss
           # @param schema_object [Hash] The schema object
           # @return [Set<String>] Set of category strings
           def self.extract_field_categories(schema_object)
-            Set.new.tap do |categories|
-              %w[keywords categories tags].each do |field|
-                categories.merge(extract_field_value(schema_object, field))
-              end
+            Set.new.tap { |categories| extract_field_categories!(categories, schema_object) }
+          end
+          ##
+          # Extracts categories from keywords, categories, and tags fields.
+          #
+          # @param categories [Set<String>] Accumulator set
+          # @param schema_object [Hash] The schema object
+          # @return [void]
+          def self.extract_field_categories!(categories, schema_object)
+            %i[keywords categories tags].each do |field|
+              extract_field_value!(categories, schema_object[field])
             end
           end
@@ -39,15 +46,23 @@ module Html2rss
           # @param schema_object [Hash] The schema object
           # @return [Set<String>] Set of category strings
           def self.extract_about_categories(schema_object)
+            Set.new.tap { |categories| extract_about_categories!(categories, schema_object) }
+          end
+          ##
+          # Extracts categories from the about field.
+          #
+          # @param categories [Set<String>] Accumulator set
+          # @param schema_object [Hash] The schema object
+          # @return [void]
+          def self.extract_about_categories!(categories, schema_object)
             about = schema_object[:about]
-            return Set.new unless about
+            return unless about
             if about.is_a?(Array)
-              extract_about_array(about)
+              extract_about_array!(categories, about)
             elsif about.is_a?(String)
-              extract_string_categories(about)
-            else
-              Set.new
+              extract_string_categories!(categories, about)
             end
           end
@@ -58,15 +73,25 @@ module Html2rss
           # @param field [String] The field name
           # @return [Set<String>] Set of category strings
           def self.extract_field_value(schema_object, field)
-            value = schema_object[field.to_sym]
-            return Set.new unless value
+            Set.new.tap { |categories| extract_field_value!(categories, schema_object[field.to_sym]) }
+          end
+          ##
+          # Extracts categories from a single field value.
+          #
+          # @param categories [Set<String>] Accumulator set
+          # @param value [Object] The field value
+          # @return [void]
+          def self.extract_field_value!(categories, value)
+            return unless value
             if value.is_a?(Array)
-              Set.new(value.map(&:to_s).reject(&:empty?))
+              value.each do |item|
+                s = item.to_s
+                categories.add(s) unless s.empty?
+              end
             elsif value.is_a?(String)
-              extract_string_categories(value)
-            else
-              Set.new
+              extract_string_categories!(categories, value)
             end
           end
@@ -76,13 +101,21 @@ module Html2rss
           # @param about [Array] The about array
           # @return [Set<String>] Set of category strings
           def self.extract_about_array(about)
-            Set.new.tap do |categories|
-              about.each do |item|
-                if item.is_a?(Hash) && item[:name]
-                  categories.add(item[:name].to_s)
-                elsif item.is_a?(String)
-                  categories.add(item)
-                end
+            Set.new.tap { |categories| extract_about_array!(categories, about) }
+          end
+          ##
+          # Extracts categories from an about array.
+          #
+          # @param categories [Set<String>] Accumulator set
+          # @param about [Array] The about array
+          # @return [void]
+          def self.extract_about_array!(categories, about)
+            about.each do |item|
+              if item.is_a?(Hash) && item[:name]
+                categories.add(item[:name].to_s)
+              elsif item.is_a?(String)
+                categories.add(item)
               end
             end
           end
@@ -93,7 +126,20 @@ module Html2rss
           # @param string [String] source string that may contain category delimiters
           # @return [Set<String>] Set of category strings
           def self.extract_string_categories(string)
-            Set.new(string.split(/[,;|]/).map(&:strip).reject(&:empty?))
+            Set.new.tap { |categories| extract_string_categories!(categories, string) }
+          end
+          ##
+          # Extracts categories from a string by splitting on separators.
+          #
+          # @param categories [Set<String>] Accumulator set
+          # @param string [String] source string that may contain category delimiters
+          # @return [void]
+          def self.extract_string_categories!(categories, string)
+            string.split(/[,;|]/).each do |part|
+              s = part.strip
+              categories.add(s) unless s.empty?
+            end
           end
         end
       end

data/lib/html2rss/auto_source/scraper/schema/list_item.rb CHANGED Viewed

@@ -16,9 +16,10 @@ module Html2rss
           # @return [Html2rss::Url, nil]
           def url
-            url = schema_object.dig(:item, :url) || super
+            return @url if defined?(@url)
-            Url.from_relative(url, base_url || url) if url
+            item_url = schema_object.dig(:item, :url)
+            @url = item_url ? Url.from_relative(item_url, base_url || item_url) : super
           end
         end
       end

data/lib/html2rss/auto_source/scraper/schema/thing.rb CHANGED Viewed

@@ -13,24 +13,10 @@ module Html2rss
         class Thing
           # Supported Schema.org `@type` values mapped to article extraction.
           SUPPORTED_TYPES = %w[
-            AdvertiserContentArticle
-            AnalysisNewsArticle
-            APIReference
-            Article
-            AskPublicNewsArticle
-            BackgroundNewsArticle
-            BlogPosting
-            DiscussionForumPosting
-            LiveBlogPosting
-            NewsArticle
-            OpinionNewsArticle
-            Report
-            ReportageNewsArticle
-            ReviewNewsArticle
-            SatiricalArticle
-            ScholarlyArticle
-            SocialMediaPosting
-            TechArticle
+            AdvertiserContentArticle AnalysisNewsArticle APIReference Article
+            AskPublicNewsArticle BackgroundNewsArticle BlogPosting DiscussionForumPosting
+            LiveBlogPosting NewsArticle OpinionNewsArticle Report ReportageNewsArticle
+            ReviewNewsArticle SatiricalArticle ScholarlyArticle SocialMediaPosting TechArticle
           ].to_set.freeze
           # Attributes exposed by `#call` in generated article hashes.
@@ -44,21 +30,14 @@ module Html2rss
           end
           # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
-          def call
-            DEFAULT_ATTRIBUTES.to_h do |attribute|
-              [attribute, public_send(attribute)]
-            end
-          end
+          def call = DEFAULT_ATTRIBUTES.to_h { [_1, public_send(_1)] }
           # @return [String, nil] stable schema object identifier
           def id
             return @id if defined?(@id)
             id = normalized_id(schema_object[:@id], reference_url: url || base_url) || url&.path.to_s
-            return if id.empty?
-            @id = id
+            @id = id.to_s.empty? ? nil : id
           end
           # @return [String, nil] article title
@@ -66,26 +45,28 @@ module Html2rss
           # @return [String, nil] longest available description field
           def description
-            schema_object.values_at(:description, :schema_object_body, :abstract)
-                         .max_by { |string| string.to_s.size }
+            schema_object.values_at(:description, :schema_object_body, :abstract).max_by { _1.to_s.size }
           end
           # @return [Html2rss::Url, nil] the URL of the schema object
           def url
+            return @url if defined?(@url)
             url = schema_object[:url]
             if url.to_s.empty?
               Log.debug("Schema#Thing.url: no url in schema_object: #{schema_object.inspect}")
-              return
+              return @url = nil
             end
-            Url.from_relative(url, base_url || url)
+            @url = Url.from_relative(url, base_url || url)
           end
           # @return [Html2rss::Url, nil] normalized article image URL
           def image
-            if (image_url = image_urls.first)
-              Url.from_relative(image_url, base_url || image_url)
-            end
+            return @image if defined?(@image)
+            img_url = image_urls.first
+            @image = img_url ? Url.from_relative(img_url, base_url || img_url) : nil
           end
           # @return [String, nil] published-at timestamp string
@@ -93,24 +74,23 @@ module Html2rss
           # @return [Array<String>, nil] extracted category labels
           def categories
-            return @categories if defined?(@categories)
-            @categories = CategoryExtractor.call(schema_object)
+            @categories ||= CategoryExtractor.call(schema_object)
           end
           attr_reader :schema_object, :base_url
           # @return [Array<String>] normalized image URL candidates
           def image_urls
-            schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
-              next unless object
-              if object.is_a?(String)
-                object
-              elsif object.is_a?(Hash) && object[:@type] == 'ImageObject'
-                object[:url] || object[:contentUrl]
-              end
-            end
+            @image_urls ||= schema_object.values_at(:image, :thumbnailUrl).filter_map { image_url_from(_1) }
+          end
+          private
+          def image_url_from(obj)
+            return obj if obj.is_a?(String)
+            return unless obj.is_a?(Hash) && obj[:@type] == 'ImageObject'
+            obj[:url] || obj[:contentUrl]
           end
           # @param value [String, Symbol, nil] candidate schema identifier
@@ -120,10 +100,8 @@ module Html2rss
             text = value.to_s
             return if text.empty?
-            normalized_url = normalized_id_url(text, reference_url:)
-            return text unless reference_url && normalized_url.host == reference_url.host
-            normalized_id_value(normalized_url)
+            norm_url = normalized_id_url(text, reference_url:)
+            reference_url && norm_url.host == reference_url.host ? normalized_id_value(norm_url) : text
           rescue ArgumentError
             text
           end
@@ -132,11 +110,7 @@ module Html2rss
           # @param reference_url [Html2rss::Url, nil] URL used to resolve relative IDs
           # @return [Html2rss::Url] normalized identifier URL
           def normalized_id_url(text, reference_url:)
-            if text.start_with?('/')
-              Url.from_relative(text, reference_url || text)
-            else
-              Url.from_absolute(text)
-            end
+            text.start_with?('/') ? Url.from_relative(text, reference_url || text) : Url.from_absolute(text)
           end
           # @param url [Html2rss::Url] normalized identifier URL
@@ -144,17 +118,14 @@ module Html2rss
           def normalized_id_value(url)
             path = url.path.to_s
             return "#{path}?#{url.query}" if (path.empty? || path == '/') && !url.query.to_s.empty?
-            return path unless path.empty?
-            url.query
+            path.empty? ? url.query : path
           end
           # @param url [String, Html2rss::Url, nil] candidate page URL
           # @return [Html2rss::Url, nil] normalized absolute URL for schema resolution
           def normalized_base_url(url)
-            return if url.to_s.strip.empty?
-            Url.from_absolute(url)
+            Url.from_absolute(url) unless url.to_s.strip.empty?
           rescue ArgumentError
             nil
           end

data/lib/html2rss/auto_source/scraper/schema.rb CHANGED Viewed

@@ -18,6 +18,13 @@ module Html2rss
         # Selector for JSON-LD script tags containing Schema.org objects.
         TAG_SELECTOR = 'script[type="application/ld+json"]'
+        # Pre-compiled regex union for supported schema types.
+        # Performs a single pass over script tag text instead of multiple regex matches.
+        SUPPORTED_TYPES_RE = begin
+          types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES
+          /"@type"\s*:\s*"(?:#{Regexp.union(types.to_a).source})"/
+        end.freeze
         # @return [Symbol] scraper config key
         def self.options_key = :schema
@@ -31,8 +38,7 @@ module Html2rss
           # @param script [Nokogiri::XML::Element] schema JSON-LD script tag
           # @return [Boolean] whether the tag references a supported schema type
           def supported_schema_type?(script)
-            supported_types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES
-            supported_types.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
+            script.text.match?(SUPPORTED_TYPES_RE)
           end
           ##

data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb CHANGED Viewed

@@ -22,8 +22,7 @@ module Html2rss
           # @return [Array<Entry>] deduplicated list of scraper entries
           def call(entries)
             destination_groups(entries).filter_map do |group|
-              collapsed_group = collapse_nested_destination_group(group)
-              collapsed_group.reduce do |best, entry|
+              group.reduce do |best, entry|
                 stronger_entry?(entry, best) ? entry : best
               end
             end
@@ -67,21 +66,6 @@ module Html2rss
           def destination_groups(entries) = entries.group_by { entry_destination(_1) }.values
-          def collapse_nested_destination_group(entries)
-            return entries if entries.size <= 1
-            entries.reject do |entry|
-              entries.any? do |other|
-                next if entry.equal?(other)
-                next unless nested_container_pair?(entry.container, other.container)
-                stronger_entry?(other, entry)
-              end
-            end
-          end
-          def nested_container_pair?(left, right) = left.ancestors.include?(right) || right.ancestors.include?(left)
           def entry_destination(entry) = entry.destination_facts&.destination || article_for(entry)&.[](:url)&.to_s
           def payload_richness_signature(article)
@@ -94,7 +78,9 @@ module Html2rss
             ]
           end
-          def word_count(text) = text.to_s.scan(/\p{Alnum}+/).size
+          def word_count(text)
+            (@word_counts ||= {})[text] ||= text.to_s.scan(/\p{Alnum}+/).size
+          end
         end
       end
     end

data/lib/html2rss/auto_source/scraper/semantic_html.rb CHANGED Viewed

@@ -21,6 +21,18 @@ module Html2rss
       class SemanticHtml # rubocop:disable Metrics/ClassLength
         include Enumerable
+        # Regexp to match content-related tokens.
+        CONTENT_REGEXP = begin
+          words = LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:content)
+          /(?:^|\s|[-_])(#{Regexp.union(words.to_a).source})(?:\s|[-_]|$)/i
+        end.freeze
+        # Regexp to match junk/utility-related tokens.
+        JUNK_REGEXP = begin
+          words = LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:utility)
+          /(?:^|\s|[-_])(#{Regexp.union(words.to_a).source})(?:\s|[-_]|$)/i
+        end.freeze
         # Container plus selected anchor, scoring metadata, and extracted article.
         Entry = Data.define(
           :container,
@@ -218,47 +230,79 @@ module Html2rss
               weak_article_candidate)
         end
+        ##
+        # @param container [Nokogiri::XML::Node]
+        # @return [Boolean]
         def publish_marker?(container)
-          container.at_css('time, [datetime], [itemprop="datePublished"], [itemprop="dateModified"]')
+          (@publish_markers ||= {}.compare_by_identity)[container] ||=
+            !!container.at_css('time, [datetime], [itemprop="datePublished"], [itemprop="dateModified"]')
         end
+        ##
+        # @param container [Nokogiri::XML::Node]
+        # @param publish_signal [Boolean]
+        # @param descriptive_signal [Boolean]
+        # @param content_signal [Boolean]
+        # @return [Integer]
         def article_signal_count(container, publish_signal:, descriptive_signal:, content_signal:)
           [article_container?(container), publish_signal, descriptive_signal, content_signal].count(&:itself)
         end
+        ##
+        # @param container [Nokogiri::XML::Node]
+        # @return [Boolean]
         def article_container?(container) = container.name == 'article'
         def descriptive_context?(container_text, title)
           snippet = container_text.to_s.sub(/\A#{Regexp.escape(title.to_s)}/i, '')
-          word_count(snippet) >= 8
+          # Only check for existence of enough words if snippet is long enough to have them
+          snippet.length > 30 && word_count(snippet) >= 8
         end
-        def heading_for(container) = container.at_css(AnchorSelector::HEADING_SELECTOR)
+        ##
+        # @param container [Nokogiri::XML::Node]
+        # @return [Nokogiri::XML::Node, nil]
+        def heading_for(container)
+          (@headings ||= {}.compare_by_identity)[container] ||= container.at_css(AnchorSelector::HEADING_SELECTOR)
+        end
-        def normalized_destination(anchor) = @link_heuristics.destination_facts(anchor)
+        def normalized_destination(anchor)
+          (@normalized_destinations ||= {}.compare_by_identity)[anchor] ||= @link_heuristics.destination_facts(anchor)
+        end
         def visible_text(node)
           return '' unless node
-          HtmlExtractor.extract_visible_text(node).to_s.strip
+          (@visible_texts ||= {}.compare_by_identity)[node] ||= HtmlExtractor.extract_visible_text(node).to_s.strip
         end
+        ##
+        # @param container [Nokogiri::XML::Node]
+        # @param selected_anchor [Nokogiri::XML::Node]
+        # @return [String]
         def entry_title(container, selected_anchor) = visible_text(heading_for(container) || selected_anchor)
-        def word_count(text) = text.to_s.scan(/\p{Alnum}+/).size
+        ##
+        # @param text [String, #to_s]
+        # @return [Integer]
+        def word_count(text)
+          (@word_counts ||= {})[text] ||= begin
+            count = 0
+            text.to_s.scan(/\p{Alnum}+/) { count += 1 }
+            count
+          end
+        end
         def container_tokens(container)
-          classes = container['class'].to_s.split
-          id = container['id'].to_s
-          (classes << id).flat_map { |str| str.downcase.split(/[-_]+/) }.reject(&:empty?)
+          (@container_tokens ||= {}.compare_by_identity)[container] ||= "#{container['class']} #{container['id']}"
         end
         def content_tokens?(tokens)
-          (@content_segments ||= LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:content)).intersect?(tokens.to_set)
+          tokens.match?(CONTENT_REGEXP)
         end
         def junk_tokens?(tokens)
-          (@junk_segments ||= LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:utility)).intersect?(tokens.to_set)
+          tokens.match?(JUNK_REGEXP)
         end
         def stable_rank(entries)

data/lib/html2rss/auto_source/scraper.rb CHANGED Viewed

@@ -11,9 +11,6 @@ module Html2rss
     # Detection is intentionally shallow for most scrapers, but instance-based
     # matching is available for scrapers that need to carry expensive selection
     # state forward into extraction.
-    # Scrapers run in parallel threads, so implementations must avoid shared
-    # mutable state and degrade by returning no articles when a follow-up would
-    # be unsafe or unsupported.
     module Scraper
       # Root markers indicating likely app-shell/client-rendered surfaces.
       APP_SHELL_ROOT_SELECTORS = '#app, #root, #__next, [data-reactroot], [ng-app], [id*="app-shell"]'

data/lib/html2rss/auto_source.rb CHANGED Viewed

@@ -1,6 +1,5 @@
 # frozen_string_literal: true
-require 'parallel'
 require 'dry-validation'
 module Html2rss
@@ -121,11 +120,8 @@ module Html2rss
       scraper_instances = Scraper.instances_for(parsed_body, url:, request_session:, opts: @opts[:scraper])
       return [] if scraper_instances.empty?
-      # Scrapers are instantiated and run in parallel threads. Implementations
-      # must avoid shared mutable state, treat request_session calls as
-      # concurrency-safe from the scraper side, and return no articles when a
-      # follow-up would be unsafe or unsupported.
-      articles = Parallel.flat_map(scraper_instances, in_threads: thread_count_for(scraper_instances)) do |instance|
+      # Scrapers are run sequentially.
+      articles = scraper_instances.flat_map do |instance|
         run_scraper(instance)
       end
       Cleanup.call(articles, url:, **cleanup_options)
@@ -140,10 +136,5 @@ module Html2rss
     def cleanup_options
       @opts.fetch(:cleanup, {})
     end
-    def thread_count_for(scrapers)
-      count = [scrapers.size, Parallel.processor_count].min
-      count.zero? ? 1 : count
-    end
   end
 end

data/lib/html2rss/category_extractor.rb CHANGED Viewed

@@ -8,8 +8,10 @@ module Html2rss
     # Common category-related terms to look for in class names
     CATEGORY_TERMS = %w[category tag topic section label theme subject].freeze
-    # CSS selectors to find elements with category-related class names
-    CATEGORY_SELECTORS = CATEGORY_TERMS.map { |term| "[class*=\"#{term}\"]" }.freeze
+    # CSS selectors to find elements with category-related class names or data attributes
+    CATEGORY_SELECTORS = CATEGORY_TERMS.flat_map do |term|
+      ["[class*=\"#{term}\"]", "[data-#{term}]", "[#{term}]"]
+    end.freeze
     # Regex pattern for matching category-related attribute names
     CATEGORY_ATTR_PATTERN = /#{CATEGORY_TERMS.join('|')}/i
@@ -36,12 +38,12 @@ module Html2rss
     # @return [Set<String>] Set of category strings
     def self.extract_all_categories(article_tag)
       Set.new.tap do |categories|
-        article_tag.css('*').each do |element|
+        article_tag.css(CATEGORY_SELECTORS.join(',')).each do |element|
           # Extract text categories from elements with category-related class names
-          categories.merge(extract_text_categories(element)) if element['class']&.match?(CATEGORY_ATTR_PATTERN)
+          extract_text_categories!(categories, element) if element['class']&.match?(CATEGORY_ATTR_PATTERN)
           # Extract data categories from all elements
-          categories.merge(extract_element_data_categories(element))
+          extract_element_data_categories!(categories, element)
         end
       end
     end
@@ -49,34 +51,66 @@ module Html2rss
     ##
     # Extracts categories from data attributes of a single element.
     #
+    # @param categories [Set<String>] Accumulator set
     # @param element [Nokogiri::XML::Element] metadata element that may contain category links
-    # @return [Set<String>] Set of category strings
-    def self.extract_element_data_categories(element)
-      Set.new.tap do |categories|
-        element.attributes.each_value do |attr|
-          next unless attr.name.match?(CATEGORY_ATTR_PATTERN)
+    # @return [void]
+    def self.extract_element_data_categories!(categories, element)
+      element.attributes.each_value do |attr|
+        next unless attr.name.match?(CATEGORY_ATTR_PATTERN)
-          value = attr.value&.strip
-          categories.add(value) if value && !value.empty?
-        end
+        value = attr.value&.strip
+        categories.add(value) if value && !value.empty?
       end
     end
     ##
     # Extracts text-based categories from elements, splitting content into discrete values.
     #
+    # @param categories [Set<String>] Accumulator set
     # @param element [Nokogiri::XML::Element] metadata element whose text may contain delimiters
-    # @return [Set<String>] Set of category strings
-    def self.extract_text_categories(element)
-      anchor_values = element.css('a').filter_map do |node|
-        HtmlExtractor.extract_visible_text(node)
+    # @return [void]
+    def self.extract_text_categories!(categories, element)
+      if element.name == 'a'
+        add_text_to_categories!(categories, element)
+        return
       end
-      return Set.new(anchor_values.reject(&:empty?)) if anchor_values.any?
+      anchors = element.css('a')
+      if anchors.any?
+        anchors.each { |node| add_text_to_categories!(categories, node) }
+      else
+        extract_split_text_categories!(categories, element)
+      end
+    end
+    ##
+    # Adds the visible text of the given element to the categories set.
+    #
+    # @param categories [Set<String>] Accumulator set
+    # @param element [Nokogiri::XML::Element] The element to extract text from
+    # @return [void]
+    def self.add_text_to_categories!(categories, element)
+      text = HtmlExtractor.extract_visible_text(element)
+      categories.add(text) if text && !text.empty?
+    end
+    ##
+    # Extracts categories from the element's text by splitting on newlines.
+    #
+    # @param categories [Set<String>] Accumulator set
+    # @param element [Nokogiri::XML::Element] The element to extract text from
+    # @return [void]
+    def self.extract_split_text_categories!(categories, element)
       text = HtmlExtractor.extract_visible_text(element)
-      return Set.new unless text
+      return unless text
-      Set.new(text.split(/\n+/).map(&:strip).reject(&:empty?))
+      text.split(/\n+/).each do |line|
+        line = line.strip
+        categories.add(line) unless line.empty?
+      end
     end
+    private_class_method :add_text_to_categories!, :extract_split_text_categories!
   end
 end