RubyGems - html2rss - Versions diffs - 0.14.0 → 0.15.0 - Mend

html2rss 0.14.0 → 0.15.0

Files changed (7) hide show

checksums.yaml +4 -4
data/lib/html2rss/auto_source/scraper/html.rb +87 -0
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +21 -8
data/lib/html2rss/auto_source/scraper.rb +1 -0
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +2 -1
metadata +4 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: a2ce9bbe8640372b5e98672760d76aee5f6f23373dd4b22ca067d2cdaa6f2b15
-  data.tar.gz: ff280d9466ee6b15b1149f582dadf9b209f0e99e4fb02e6b82f91b25a7ca0b7a
+  metadata.gz: d89191b35f643372cc18b880dab7535d18a10d9fd123897460ee16c5e990a5d9
+  data.tar.gz: 71cb356f5261b2e6a3d2152afcb68f658e78d5fec5ff15bc67ed0d5bd153fc00
 SHA512:
-  metadata.gz: d516b897253374425ccd3b26d21362df46c18c26694fe1a8aaddc06b956f93e36111d3310dc635b84d7626a2072014d27dacb075cd98d15a79d39aed40991bcb
-  data.tar.gz: 91ae4190d04967c1bc9d3f46b0a0bdbbd23f3dbda6559c3ff10391ab0d63d4b984a44789c8427dbace345af617a33e86c2f441c0a0d58dbcf2b734bd78b73b87
+  metadata.gz: 46f048feae342844df1af51c741d681677192c1dc84452fae1002f5cca5b406c0698a426ec6e532572c4fb4f6fb896a966862d8d2599b8dd742a174707289aed
+  data.tar.gz: 98d0316c64bb5a160d26d5efa59b25901b3a64e572795bbd840539fe69d84a4ea3c797bb16721edb73277d1b9bfb9238f9d40ea2b9bb4ebeffc81e8790a02062

data/lib/html2rss/auto_source/scraper/html.rb ADDED Viewed

@@ -0,0 +1,87 @@
+# frozen_string_literal: true
+require 'nokogiri'
+require 'set'
+module Html2rss
+  class AutoSource
+    module Scraper
+      ##
+      # Scrapes articles from HTML pages by
+      # finding similar structures around anchor tags in the parsed_body.
+      class Html
+        include Enumerable
+        def self.articles?(parsed_body)
+          new(parsed_body, url: '').any?
+        end
+        def self.parent_until_condition(node, condition)
+          return nil if !node || node.parent.name == 'html'
+          return node if condition.call(node)
+          parent_until_condition(node.parent, condition)
+        end
+        ##
+        # Simplify an XPath selector by removing the index notation.
+        def self.simplify_xpath(xpath)
+          xpath.gsub(/\[\d+\]/, '')
+        end
+        def initialize(parsed_body, url:)
+          @parsed_body = parsed_body
+          @url = url
+          @css_selectors = Hash.new(0)
+        end
+        attr_reader :parsed_body
+        ##
+        # @yieldparam [Hash] The scraped article hash
+        # @return [Enumerator] Enumerator for the scraped articles
+        def each
+          return enum_for(:each) unless block_given?
+          return if frequent_selectors.empty?
+          frequent_selectors.each do |selector|
+            parsed_body.xpath(selector).each do |selected_tag|
+              article_tag = self.class.parent_until_condition(selected_tag, method(:article_condition))
+              article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call
+              yield article_hash if article_hash
+            end
+          end
+        end
+        ##
+        # Find all the anchors in root.
+        # @param root [Nokogiri::XML::Node] The root node to search for anchors
+        # @return [Set<String>] The set of CSS selectors which exist at least min_frequency times
+        def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
+          @frequent_selectors ||= begin
+            root.traverse do |node|
+              next if !node.element? || node.name != 'a'
+              @css_selectors[self.class.simplify_xpath(node.path)] += 1
+            end
+            @css_selectors.keys
+                          .select { |selector| (@css_selectors[selector]).to_i >= min_frequency }
+                          .to_set
+          end
+        end
+        private
+        def article_condition(node)
+          return true if %w[body html].include?(node.name)
+          return true if node.parent.css('a').size > 1
+          false
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb CHANGED Viewed

@@ -35,13 +35,13 @@ module Html2rss
           def initialize(article_tag, url:)
             @article_tag = article_tag
             @url = url
-            @heading = find_heading
-            @extract_url = find_url
           end
           # @return [Hash, nil] The scraped article or nil.
           def call
-            return unless heading
+            @heading = find_heading || closest_anchor || return
+            @extract_url = find_url
             {
               title: extract_title,
@@ -71,14 +71,20 @@ module Html2rss
             times.min
           end
+          ##
+          # Find the heading of the article.
+          # @return [Nokogiri::XML::Node, nil]
           def find_heading
             heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
+            return if heading_tags.empty?
             smallest_heading = heading_tags.keys.min
             heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
           end
           def extract_title
-            @extract_title ||= if heading.children.empty? && heading.text
+            @extract_title ||= if heading && (heading.children.empty? || heading.text)
                                  visible_text_from_tag(heading)
                                else
                                  visible_text_from_tag(
@@ -101,9 +107,12 @@ module Html2rss
             description.empty? ? nil : description
           end
+          def closest_anchor
+            SemanticHtml.find_closest_selector(heading || article_tag,
+                                               selector: 'a[href]:not([href=""])')
+          end
           def find_url
-            closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
-                                                                selector: 'a[href]:not([href=""])')
             href = closest_anchor&.[]('href')&.split('#')&.first&.strip
             Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
           end
@@ -113,8 +122,12 @@ module Html2rss
           end
           def generate_id
-            [article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
-             extract_url&.path].compact.reject(&:empty?).first
+            [
+              article_tag['id'],
+              article_tag.at_css('[id]')&.attr('id'),
+              extract_url&.path,
+              extract_url&.query
+            ].compact.reject(&:empty?).first
           end
         end
       end

data/lib/html2rss/auto_source/scraper.rb CHANGED Viewed

@@ -10,6 +10,7 @@ module Html2rss
     #
     module Scraper
       SCRAPERS = [
+        Html,
         Schema,
         SemanticHtml
       ].freeze

data/lib/html2rss/version.rb CHANGED Viewed

@@ -3,6 +3,6 @@
 ##
 # The Html2rss namespace.
 module Html2rss
-  VERSION = '0.14.0'
+  VERSION = '0.15.0'
   public_constant :VERSION
 end

data/lib/html2rss.rb CHANGED Viewed

@@ -5,8 +5,9 @@ require 'zeitwerk'
 loader = Zeitwerk::Loader.for_gem
 loader.setup
-require 'yaml'
+require 'addressable'
 require 'logger'
+require 'yaml'
 ##
 # The Html2rss namespace.

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: html2rss
 version: !ruby/object:Gem::Version
-  version: 0.14.0
+  version: 0.15.0
 platform: ruby
 authors:
 - Gil Desmarais
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-10-08 00:00:00.000000000 Z
+date: 2024-10-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: addressable
@@ -251,6 +251,7 @@ files:
 - lib/html2rss/auto_source/reducer.rb
 - lib/html2rss/auto_source/rss_builder.rb
 - lib/html2rss/auto_source/scraper.rb
+- lib/html2rss/auto_source/scraper/html.rb
 - lib/html2rss/auto_source/scraper/schema.rb
 - lib/html2rss/auto_source/scraper/schema/base.rb
 - lib/html2rss/auto_source/scraper/semantic_html.rb
@@ -279,7 +280,7 @@ licenses:
 - MIT
 metadata:
   allowed_push_host: https://rubygems.org
-  changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.14.0
+  changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.15.0
   rubygems_mfa_required: 'true'
 post_install_message:
 rdoc_options: []