RubyGems - html2rss - Versions diffs - 0.22.0 → 0.22.1 - Mend

html2rss 0.22.0 → 0.22.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

checksums.yaml +4 -4
data/lib/html2rss/auto_source/scraper/html/class_clustering.rb +14 -3
data/lib/html2rss/auto_source/scraper/microdata.rb +14 -2
data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +1 -1
data/lib/html2rss/auto_source/scraper/semantic_html.rb +28 -20
data/lib/html2rss/auto_source.rb +3 -1
data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +4 -18
data/lib/html2rss/html_extractor/semantic_containers.rb +28 -3
data/lib/html2rss/html_extractor.rb +36 -17
data/lib/html2rss/html_navigator.rb +17 -0
data/lib/html2rss/selectors.rb +8 -3
data/lib/html2rss/version.rb +1 -1
data/schema/html2rss-config.schema.json +8 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 9ff7cdc4e25f3abc6da000e4f672d6832fb6027e49885aecc0cad38329c5e6ae
-  data.tar.gz: e42c216e328bb2c56971dd58871f023f15e672563e27741c56c6cf7fe4cb322a
+  metadata.gz: 750b7fb967b328cef2238b66729cafe122d1bae23bee05fd8504bb31e760b8a7
+  data.tar.gz: 327406de9c7c97ea13e90c89bec1c2653c962bbcaccfd29ddb78b282477f7578
 SHA512:
-  metadata.gz: e52812f947561b9a52537f1b28c530f63e116194642d13ff526fac1ad32f02d7ea6ff8ca9b5ee16e2d7f686e1babec1908ca51399ba42ef9461ed3dbe0d02117
-  data.tar.gz: 4754495a5947aca6de71846d1c88128d9fc1826e1014fd00806f58e7cd1e1575dc3bd2380ced3aeecd0cd6d51e0366ddee7a1f7db0220750e13f66645de2edde
+  metadata.gz: 5f41e00edfdd19ceb012900db7518f28236b417662dc4f11f45d9c498ede0720bdbbc0fb31443d80495eaf6076df646062fdf7f972b27bd71c50cc4e198b4540
+  data.tar.gz: 7a7eff85bd7f98cd872131041aa58faf9d3fba1aff47893a6d286378b6f52904ee11120f053e3ca8beca3060d86d1438cf2037e3e4e7e1586d3d3c4679b026f0

data/lib/html2rss/auto_source/scraper/html/class_clustering.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module Html2rss
         # rubocop:disable Metrics/ClassLength
         class ClassClustering
           # Node tags considered layout containers
-          LAYOUT_TAG_NAMES = Set['div', 'section', 'article'].freeze
+          LAYOUT_TAG_NAMES = Set['div', 'section', 'article', 'li', 'ul', 'ol'].freeze
           # HTML/layout tags excluded from candidate nodes
           EXCLUDED_TAGS = Set['html', 'body', 'nav', 'footer', 'header', 'svg', 'script', 'style'].freeze
@@ -83,13 +83,24 @@ module Html2rss
             end
           end
+          # rubocop:disable Metrics/MethodLength
           def container_of?(nodes_a, nodes_b)
             return false unless LAYOUT_TAG_NAMES.include?(nodes_b.first.name)
             nodes_a.any? do |node_a|
-              nodes_b.count { |node_b| node_a != node_b && node_b.ancestors.include?(node_a) } > 1
+              count = 0
+              nodes_b.each do |node_b|
+                next if node_a == node_b
+                if HtmlNavigator.descendant_of?(node_b, node_a)
+                  count += 1
+                  break if count > 1
+                end
+              end
+              count > 1
             end
           end
+          # rubocop:enable Metrics/MethodLength
           # If group A contains group B, and they have the same size:
           # - If B (the descendant) contains >= 80% of A's words, AND B's tag is div/section/article,
@@ -112,7 +123,7 @@ module Html2rss
             nodes_a = groups[cls_a]
             nodes_b = groups[cls_b]
             return if nodes_a.size != nodes_b.size
-            return unless nodes_a.zip(nodes_b).all? { |a, b| a != b && b.ancestors.include?(a) }
+            return unless nodes_a.zip(nodes_b).all? { |a, b| a != b && HtmlNavigator.descendant_of?(b, a) }
             discarded << (keep_descendant?(nodes_a, nodes_b) ? cls_a : cls_b)
           end

data/lib/html2rss/auto_source/scraper/microdata.rb CHANGED Viewed

@@ -55,7 +55,13 @@ module Html2rss
           def top_level_item?(node)
             return false if node.attribute('itemprop')
-            node.ancestors.none? { |ancestor| ancestor.attribute('itemscope') && ancestor.attribute('itemprop') }
+            curr = node.parent
+            while curr && !curr.document? && curr.name != 'html'
+              return false if curr.attribute('itemscope') && curr.attribute('itemprop')
+              curr = curr.parent
+            end
+            true
           end
         end
@@ -147,7 +153,13 @@ module Html2rss
           def direct_property?(root, node)
             return false if node == root
-            node.ancestors.take_while { _1 != root }.none? { |ancestor| ancestor.attribute('itemscope') }
+            curr = node.parent
+            while curr && curr != root
+              return false if curr.attribute('itemscope')
+              curr = curr.parent
+            end
+            true
           end
           # @param node [Nokogiri::XML::Element] itemprop node

data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb CHANGED Viewed

@@ -37,7 +37,7 @@ module Html2rss
             @article_cache.fetch(entry) do
               @article_cache[entry] = @extractor.new(
-                entry.container, base_url: @url, selected_anchor: entry.selected_anchor
+                entry.container, base_url: @url, selected_anchor: entry.selected_anchor, fallback_anchorless: true
               ).call
             end
           end

data/lib/html2rss/auto_source/scraper/semantic_html.rb CHANGED Viewed

@@ -60,12 +60,13 @@ module Html2rss
         # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
         # @param url [String, Html2rss::Url] base url
         # @param extractor [Class] extractor class used for article extraction
-        # @param _opts [Hash] scraper-specific options
-        # @option _opts [Object] :_reserved reserved for future scraper-specific options
-        def initialize(parsed_body, url:, extractor: HtmlExtractor, **_opts)
+        # @param opts [Hash] scraper-specific options
+        # @option opts [Boolean] :fallback_anchorless whether to extract anchorless blocks
+        def initialize(parsed_body, url:, extractor: HtmlExtractor, **opts)
           @parsed_body = parsed_body
           @url = url
           @extractor = extractor
+          @fallback_anchorless = opts.fetch(:fallback_anchorless, false)
           @link_heuristics = LinkHeuristics.new(url)
           @anchor_selector = AnchorSelector.new(url)
         end
@@ -107,14 +108,15 @@ module Html2rss
           @anchor_selector.primary_anchor_for(container)
         end
-        def extractable_entries # rubocop:disable Metrics/MethodLength
+        # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+        def extractable_entries
           @extractable_entries ||= candidate_containers.filter_map do |container|
             selected_anchor = primary_anchor_for(container)
-            next unless selected_anchor
+            next unless selected_anchor || @fallback_anchorless
-            destination_facts = normalized_destination(selected_anchor)
-            next unless destination_facts
+            destination_facts = selected_anchor ? normalized_destination(selected_anchor) : nil
+            next if selected_anchor && !destination_facts
             next if hard_junk_entry?(container, selected_anchor, destination_facts)
             quality = quality_score(container, selected_anchor, destination_facts)
@@ -132,6 +134,7 @@ module Html2rss
             )
           end
         end
+        # rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
         # rubocop:disable Metrics/MethodLength
         def ranked_entries
@@ -177,8 +180,8 @@ module Html2rss
           score += 40 if words >= 3
           score += 15 if words >= 7
-          score += 20 if destination_facts.url.path.to_s.length > 6
-          score += 15 if destination_facts.content_path
+          score += 20 if destination_facts&.url&.path.to_s.length > 6
+          score += 15 if destination_facts&.content_path
           score += 15 if publish_marker?(container)
           score += 10 if descriptive_context?(container_text, title)
           score += 10 if article_container?(container)
@@ -190,12 +193,12 @@ module Html2rss
           title = entry_title(container, selected_anchor)
           utility_text = @link_heuristics.utility_prefix_text?(title)
           recommended_text = @link_heuristics.recommended_text?(title)
-          content_signal = destination_facts.content_path
+          content_signal = destination_facts&.content_path
           no_content_signal = !content_signal
           non_content_utility_path =
-            destination_facts.utility_path &&
+            destination_facts&.utility_path &&
             no_content_signal &&
-            !destination_facts.strong_post_suffix
+            !destination_facts&.strong_post_suffix
           publish_signal = publish_marker?(container)
           descriptive_signal = descriptive_context?(visible_text(container), title)
           weak_container = !publish_signal && !descriptive_signal
@@ -203,19 +206,20 @@ module Html2rss
           score += 25 if non_content_utility_path
           score += 15 if utility_text && word_count(title) <= 6
-          score += 10 if destination_facts.shallow
+          score += 10 if destination_facts&.shallow
           score += 10 if weak_container
           score += 10 if recommended_text && no_content_signal
-          score += 5 if destination_facts.high_confidence_junk_path
+          score += 5 if destination_facts&.high_confidence_junk_path
           score += 15 if junk_tokens?(container_tokens(container))
           score
         end
-        def hard_junk_entry?(container, selected_anchor, destination_facts) # rubocop:disable Metrics/MethodLength
+        # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
+        def hard_junk_entry?(container, selected_anchor, destination_facts)
           title = entry_title(container, selected_anchor)
           publish_signal = publish_marker?(container)
           descriptive_signal = descriptive_context?(visible_text(container), title)
-          content_signal = destination_facts.content_path
+          content_signal = destination_facts&.content_path
           weak_article_candidate = article_signal_count(
             container,
             publish_signal:,
@@ -223,12 +227,16 @@ module Html2rss
             content_signal:
           ) < 2
-          destination_facts.high_confidence_junk_path ||
-            (@link_heuristics.recommended_text?(title) && destination_facts.shallow && weak_article_candidate) ||
-            (@link_heuristics.utility_prefix_text?(title) &&
-              destination_facts.high_confidence_utility_destination &&
+          destination_facts&.high_confidence_junk_path ||
+            (selected_anchor &&
+              @link_heuristics.recommended_text?(title) &&
+              destination_facts&.shallow &&
+              weak_article_candidate) ||
+            (selected_anchor && @link_heuristics.utility_prefix_text?(title) &&
+              destination_facts&.high_confidence_utility_destination &&
               weak_article_candidate)
         end
+        # rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
         ##
         # @param container [Nokogiri::XML::Node]

data/lib/html2rss/auto_source.rb CHANGED Viewed

@@ -32,7 +32,8 @@ module Html2rss
           enabled: true
         },
         semantic_html: {
-          enabled: true
+          enabled: true,
+          fallback_anchorless: true
         },
         html: {
           enabled: true,
@@ -59,6 +60,7 @@ module Html2rss
       end
       optional(:semantic_html).hash do
         optional(:enabled).filled(:bool)
+        optional(:fallback_anchorless).filled(:bool)
       end
       optional(:html).hash do
         optional(:enabled).filled(:bool)

data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb CHANGED Viewed

@@ -138,17 +138,7 @@ module Html2rss
         # @return [Boolean] true when the anchor is inside the selected heading
         def heading_anchor?
           heading = @context.heading
-          return false unless heading
-          curr = @anchor
-          container = @context.container
-          while curr.respond_to?(:parent)
-            return true if curr == heading
-            break if curr == container
-            curr = curr.parent
-          end
-          false
+          heading && (@anchor == heading || HtmlNavigator.descendant_of?(@anchor, heading))
         end
         # @return [Boolean] true when anchor text exactly matches heading text
@@ -183,15 +173,11 @@ module Html2rss
         end
         def utility_landmark_ancestor?
-          curr = @anchor.parent
           container = @context.container
-          while curr.respond_to?(:parent)
-            return true if Context::UTILITY_LANDMARK_TAGS.include?(curr.name)
-            break if curr == container
+          condition = proc { |node| node == container || Context::UTILITY_LANDMARK_TAGS.include?(node.name) }
+          landmark = HtmlNavigator.parent_until_condition(@anchor.parent, condition)
-            curr = curr.parent
-          end
-          false
+          landmark && landmark != container
         end
         def icon_only_anchor?

data/lib/html2rss/html_extractor/semantic_containers.rb CHANGED Viewed

@@ -32,9 +32,34 @@ module Html2rss
           HtmlExtractor.ignored_container_path?(node, cache)
         end
-        # Preserve the original post-order traversal intent (specific-first)
-        # by sorting candidates by depth (descending) while keeping original document
-        # order for nodes at the same depth.
+        candidates = filter_nested_containers(candidates)
+        sort_by_depth(candidates)
+      end
+      private
+      def filter_nested_containers(candidates)
+        candidate_set = Set.new(candidates)
+        rejected = Set.new
+        candidates.each do |candidate_b|
+          next if candidate_b.name == 'div'
+          find_and_reject_ancestors(candidate_b, candidate_set, rejected)
+        end
+        candidates.reject { |c| rejected.include?(c) }
+      end
+      def find_and_reject_ancestors(node, candidate_set, rejected)
+        curr = node.parent
+        while curr && !curr.document? && curr.name != 'html'
+          rejected << curr if candidate_set.include?(curr)
+          curr = curr.parent
+        end
+      end
+      def sort_by_depth(candidates)
         candidates.each_with_index
                   .sort_by { |node, index| [-node.ancestors.size, index] }
                   .map!(&:first)

data/lib/html2rss/html_extractor.rb CHANGED Viewed

@@ -47,25 +47,33 @@ module Html2rss
       # @param node [Nokogiri::XML::Node]
       # @param cache [Hash, nil] identity cache used to store results (must use compare_by_identity)
       # @return [Boolean] true when the node belongs to ignored DOM chrome
+      # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
       def ignored_container_path?(node, cache = nil)
         return cache[node] if cache&.key?(node)
-        res = walk_ignored_container_path?(node)
-        cache[node] = res if cache
-        res
-      end
+        curr = node
+        visited = []
+        is_ignored = false
-      private
+        while curr.respond_to?(:parent) && curr
+          if cache&.key?(curr)
+            is_ignored = cache[curr]
+            break
+          end
-      def walk_ignored_container_path?(node)
-        curr = node
-        while curr.respond_to?(:parent)
-          return true if IGNORED_CONTAINER_TAGS.include?(curr.name)
+          if IGNORED_CONTAINER_TAGS.include?(curr.name)
+            is_ignored = true
+            break
+          end
+          visited << curr
           curr = curr.parent
         end
-        false
+        visited.each { |n| cache[n] = is_ignored } if cache
+        is_ignored
       end
+      # rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
     end
     ##
@@ -119,14 +127,16 @@ module Html2rss
       Url.from_relative("##{id}", base_url) if id
     end
+    # rubocop:disable Metrics/CyclomaticComplexity
     def extract_title
-      title_source = heading || selected_anchor
-      if title_source
-        self.class.extract_visible_text(title_source)
-      else
-        fallback_anchorless_title
-      end
+      source = heading || selected_anchor
+      title_text = source ? self.class.extract_visible_text(source) : fallback_anchorless_title
+      return unless title_text
+      kicker = kicker_node ? self.class.extract_visible_text(kicker_node).to_s.strip : nil
+      kicker && !kicker.empty? && !title_text.include?(kicker) ? "#{kicker}: #{title_text}" : title_text
     end
+    # rubocop:enable Metrics/CyclomaticComplexity
     def fallback_anchorless_title
       return unless @fallback_anchorless && selected_anchor.nil?
@@ -143,8 +153,17 @@ module Html2rss
       )
     end
+    def kicker_node
+      @kicker_node ||= begin
+        selector = '[data-tb-kicker], [class*="kicker"], [class*="eyebrow"], ' \
+                   '[class*="pre-title"], [class*="pretitle"], [class*="overline"]'
+        node = article_tag.at_css(selector)
+        node && heading && (node == heading || HtmlNavigator.descendant_of?(node, heading)) ? nil : node
+      end
+    end
     def extract_description
-      exclude = [heading, selected_anchor].compact.to_set
+      exclude = [heading, selected_anchor, kicker_node].compact.to_set
       description = self.class.extract_visible_text(article_tag, exclude_nodes: exclude)
       return if description.nil?

data/lib/html2rss/html_navigator.rb CHANGED Viewed

@@ -49,6 +49,23 @@ module Html2rss
         current_tag.ancestors(tag_name).first
       end
+      ##
+      # Returns true if child_node is a descendant of parent_node.
+      # Walks up using parent pointers to avoid NodeSet allocations.
+      #
+      # @param child_node [Nokogiri::XML::Node] potential descendant
+      # @param parent_node [Nokogiri::XML::Node] potential ancestor
+      # @return [Boolean] true when child_node is a descendant of parent_node
+      def descendant_of?(child_node, parent_node)
+        curr = child_node.respond_to?(:parent) ? child_node.parent : nil
+        while curr
+          return true if curr == parent_node
+          curr = curr.respond_to?(:parent) ? curr.parent : nil
+        end
+        false
+      end
     end
   end
 end

data/lib/html2rss/selectors.rb CHANGED Viewed

@@ -103,11 +103,15 @@ module Html2rss
     # @param article_tag [Nokogiri::XML::Element] HTML element to extract additional info from.
     # @param base_url [String, Html2rss::Url] base URL for normalization during enhancement
     # @return [Hash] The enhanced article hash.
+    # rubocop:disable Metrics/MethodLength
     def enhance_article_hash(article_hash, article_tag, base_url = @url)
       selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
-      return article_hash unless selected_anchor
-      extracted = HtmlExtractor.new(article_tag, base_url:, selected_anchor:).call
+      extracted = HtmlExtractor.new(
+        article_tag,
+        base_url:,
+        selected_anchor:,
+        fallback_anchorless: true
+      ).call
       return article_hash unless extracted
       extracted.each_with_object(article_hash) do |(key, value), hash|
@@ -116,6 +120,7 @@ module Html2rss
         hash[key] = value
       end
     end
+    # rubocop:enable Metrics/MethodLength
     ##
     # Selects the value for a given attribute from an HTML element.

data/lib/html2rss/version.rb CHANGED Viewed

@@ -4,6 +4,6 @@
 # The Html2rss namespace.
 module Html2rss
   # Current application version.
-  VERSION = '0.22.0'
+  VERSION = '0.22.1'
   public_constant :VERSION
 end

data/schema/html2rss-config.schema.json CHANGED Viewed

@@ -153,6 +153,12 @@
                   "not": {
                     "type": "null"
                   }
+                },
+                "fallback_anchorless": {
+                  "type": "boolean",
+                  "not": {
+                    "type": "null"
+                  }
                 }
               },
               "required": []
@@ -228,7 +234,8 @@
             "enabled": true
           },
           "semantic_html": {
-            "enabled": true
+            "enabled": true,
+            "fallback_anchorless": true
           },
           "html": {
             "enabled": true,

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: html2rss
 version: !ruby/object:Gem::Version
-  version: 0.22.0
+  version: 0.22.1
 platform: ruby
 authors:
 - Gil Desmarais
@@ -381,7 +381,7 @@ licenses:
 - MIT
 metadata:
   allowed_push_host: https://rubygems.org
-  changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.22.0
+  changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.22.1
   rubygems_mfa_required: 'true'
 rdoc_options: []
 require_paths: