RubyGems - html2rss - Versions diffs - 0.11.0 → 0.13.0 - Mend

html2rss 0.11.0 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

checksums.yaml +4 -4
data/README.md +38 -10
data/html2rss.gemspec +1 -0
data/lib/html2rss/attribute_post_processors/base.rb +74 -0
data/lib/html2rss/attribute_post_processors/gsub.rb +17 -17
data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +6 -7
data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +5 -9
data/lib/html2rss/attribute_post_processors/parse_time.rb +10 -10
data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -9
data/lib/html2rss/attribute_post_processors/sanitize_html.rb +17 -8
data/lib/html2rss/attribute_post_processors/substring.rb +30 -10
data/lib/html2rss/attribute_post_processors/template.rb +19 -11
data/lib/html2rss/attribute_post_processors.rb +8 -0
data/lib/html2rss/auto_source/article.rb +95 -0
data/lib/html2rss/auto_source/channel.rb +79 -0
data/lib/html2rss/auto_source/cleanup.rb +76 -0
data/lib/html2rss/auto_source/reducer.rb +48 -0
data/lib/html2rss/auto_source/rss_builder.rb +68 -0
data/lib/html2rss/auto_source/scraper/schema/base.rb +61 -0
data/lib/html2rss/auto_source/scraper/schema.rb +122 -0
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +123 -0
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +54 -0
data/lib/html2rss/auto_source/scraper/semantic_html.rb +118 -0
data/lib/html2rss/auto_source/scraper.rb +33 -0
data/lib/html2rss/auto_source.rb +77 -0
data/lib/html2rss/cli.rb +10 -0
data/lib/html2rss/config/channel.rb +4 -2
data/lib/html2rss/config/selectors.rb +13 -2
data/lib/html2rss/item.rb +8 -2
data/lib/html2rss/utils.rb +5 -10
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +21 -0
metadata +30 -3

data/lib/html2rss/auto_source/scraper/semantic_html.rb ADDED Viewed

@@ -0,0 +1,118 @@
+# frozen_string_literal: true
+require 'addressable'
+require 'parallel'
+module Html2rss
+  class AutoSource
+    module Scraper
+      ##
+      # Scrapes articles by looking for common markup tags (article, section, li, tr)
+      # containing an <a href> tag.
+      #
+      # See:
+      # 1. https://developer.mozilla.org/en-US/docs/Web/HTML/Element/article
+      class SemanticHtml
+        include Enumerable
+        ##
+        # Map of parent element names to the CSS selectors used to find <a href> tags inside them.
+        ANCHOR_TAG_SELECTORS = {
+          'section' => ['section :not(section) a[href]'],
+          'tr' => ['table tr :not(tr) a[href]'],
+          'article' => [
+            'article :not(article) a[href]',
+            'article a[href]' # also matches anchors that are direct children of <article>
+          ],
+          'li' => [
+            'ul > li :not(li) a[href]',
+            'ol > li :not(li) a[href]'
+          ]
+        }.freeze
+        # Checks whether any selector from ANCHOR_TAG_SELECTORS matches inside parsed_body.
+        # @param parsed_body [Nokogiri::HTML::Document, nil] The parsed HTML document
+        # @return [Boolean] True if at least one selector matches; false otherwise or when parsed_body is nil/false.
+        def self.articles?(parsed_body)
+          return false unless parsed_body
+          ANCHOR_TAG_SELECTORS.each_value do |selectors|
+            return true if selectors.any? { |selector| parsed_body.at_css(selector) }
+          end
+          false
+        end
+        # Walks up from current_tag until a node named tag_name or stop_tag is reached.
+        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+        # @param tag_name [String] The tag name to search for
+        # @param stop_tag [String] The tag name to stop searching at
+        # @return [Nokogiri::XML::Node] current_tag if it already matches, else the node the walk ended on (may be the stop_tag element, not a tag_name match)
+        def self.find_tag_in_ancestors(current_tag, tag_name, stop_tag: 'html')
+          return current_tag if current_tag.name == tag_name
+          stop_tags = Set[tag_name, stop_tag]
+          while current_tag.respond_to?(:parent) && !stop_tags.member?(current_tag.name) # also ends once an object without #parent is reached
+            current_tag = current_tag.parent
+          end
+          current_tag
+        end
+        # Finds the first node matching the selector inside current_tag, falling back to the subtrees of its ancestors.
+        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+        # @param selector [String] The CSS selector to search for
+        # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
+        def self.find_closest_selector(current_tag, selector: 'a[href]:not([href=""])')
+          current_tag.at_css(selector) || find_closest_selector_upwards(current_tag, selector:)
+        end
+        # Helper that tries the selector on current_tag and then on each successive parent until something matches.
+        # @param current_tag [Nokogiri::XML::Node] The current tag to start searching from
+        # @param selector [String] The CSS selector to search for
+        # @return [Nokogiri::XML::Node, nil] The closest matching tag or nil if not found
+        def self.find_closest_selector_upwards(current_tag, selector:)
+          while current_tag
+            found = current_tag.at_css(selector)
+            return found if found
+            return nil unless current_tag.respond_to?(:parent) # no further parent to climb to
+            current_tag = current_tag.parent
+          end
+        end
+        # Flattens ANCHOR_TAG_SELECTORS into one [tag_name, selector] pair per selector.
+        # @return [Array<[String, String]>] Array of tag name and selector pairs
+        def self.anchor_tag_selector_pairs
+          ANCHOR_TAG_SELECTORS.flat_map do |tag_name, selectors|
+            selectors.map { |selector| [tag_name, selector] }
+          end
+        end
+        def initialize(parsed_body, url:)
+          @parsed_body = parsed_body
+          @url = url # handed to Extractor for every candidate article in #each
+        end
+        attr_reader :parsed_body
+        ##
+        # @yieldparam [Hash] The scraped article hash (one per matched anchor whose Extractor result is truthy)
+        # @return [Enumerator] Enumerator for the scraped articles, returned when no block is given
+        def each
+          return enum_for(:each) unless block_given?
+          SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
+            parsed_body.css(selector).each do |selected_tag|
+              article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
+              article_hash = Extractor.new(article_tag, url: @url).call
+              yield article_hash if article_hash # falsy extractor results are skipped
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/auto_source/scraper.rb ADDED Viewed

@@ -0,0 +1,33 @@
+# frozen_string_literal: true
+module Html2rss
+  class AutoSource
+    ##
+    # The Scraper module contains all scrapers that can be used to extract articles.
+    # Each scraper is built with `new(parsed_body, url:)` and should implement an `each` method that yields article hashes.
+    # Each scraper class should also implement an `articles?(parsed_body)` method that returns true if the scraper
+    # can potentially be used to extract articles from the given HTML.
+    #
+    module Scraper
+      SCRAPERS = [
+        Schema,
+        SemanticHtml
+      ].freeze
+      ##
+      # Error raised when no suitable scraper is found.
+      class NoScraperFound < Html2rss::Error; end
+      ##
+      # Returns an array of scrapers that claim to find articles in the parsed body.
+      # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
+      # @return [Array<Class>] A non-empty array of scraper classes that can handle the parsed body.
+      def self.from(parsed_body)
+        scrapers = SCRAPERS.select { |scraper| scraper.articles?(parsed_body) }
+        raise NoScraperFound, 'No suitable scraper found for URL.' if scrapers.empty? # an empty selection is an error, never a return value
+        scrapers
+      end
+    end
+  end
+end

data/lib/html2rss/auto_source.rb ADDED Viewed

@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+require 'nokogiri'
+require 'parallel'
+require 'addressable'
+module Html2rss
+  ##
+  # The AutoSource class is responsible for extracting channel and articles
+  # from a given URL.
+  # It uses the scrapers selected by Scraper.from to extract articles, utilizing popular ways of
+  # marking articles, e.g. schema, microdata, open graph, etc.
+  class AutoSource
+    class UnsupportedUrlScheme < Html2rss::Error; end # raised by #initialize for schemes outside SUPPORTED_URL_SCHEMES
+    class NoArticlesFound < Html2rss::Error; end # raised by #build when no articles were scraped
+    SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
+    def initialize(url)
+      unless url.is_a?(String) || url.is_a?(Addressable::URI)
+        raise ArgumentError,
+              'URL must be a String or Addressable::URI'
+      end
+      @url = Addressable::URI.parse(url)
+      raise ArgumentError, 'URL must be absolute' unless @url.absolute?
+      raise UnsupportedUrlScheme, "#{@url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(@url.scheme)
+    end
+    def build
+      raise NoArticlesFound if articles.empty?
+      Reducer.call(articles, url:) # return value unused; presumably reduces `articles` in place - confirm against Reducer
+      Cleanup.call(articles, url:, keep_different_domain: true) # likewise called only for its effect on `articles` - confirm
+      Html2rss::AutoSource::RssBuilder.new(
+        channel:,
+        articles:
+      ).call
+    end
+    def articles
+      @articles ||= Scraper.from(parsed_body).flat_map do |scraper| # memoized: scraping runs once per instance
+        instance = scraper.new(parsed_body, url:)
+        articles_in_thread = Parallel.map(instance.each) do |article_hash| # NOTE(review): no in_threads:/in_processes: given, so Parallel's default applies - confirm it matches the "in_thread" name
+          Log.debug "Scraper: #{scraper} in worker: #{Parallel.worker_number} [#{article_hash[:url]}]"
+          Article.new(**article_hash, scraper:)
+        end
+        Reducer.call(articles_in_thread, url:)
+        articles_in_thread
+      end
+    end
+    def channel
+      Channel.new(parsed_body, response:, url:, articles:) # not memoized: a new Channel is built on every call
+    end
+    private
+    attr_reader :url
+    def response
+      @response ||= Html2rss::Utils.request_url(url) # the request is issued once and the response cached
+    end
+    # Parses the HTML body of the response using Nokogiri.
+    # @return [Nokogiri::HTML::Document] the frozen, memoized document
+    def parsed_body
+      @parsed_body ||= Nokogiri.HTML(response.body).freeze
+    end
+  end
+end

data/lib/html2rss/cli.rb CHANGED Viewed

@@ -2,8 +2,13 @@
 require_relative '../html2rss'
 require 'thor'
+require 'addressable'
+##
+# The Html2rss namespace / command line interface.
 module Html2rss
+  Log = Logger.new($stderr) # NOTE(review): html2rss.rb (required above) already defines Html2rss::Log with level and formatter; this reassigns the constant with an unconfigured logger - confirm intended
   ##
   # The Html2rss command line interface.
   class CLI < Thor
@@ -25,5 +30,10 @@ module Html2rss
       params = options.to_h { |opt| opt.split('=', 2) }
       puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
     end
+    desc 'auto URL', 'automatically sources an RSS feed from the URL'
+    def auto(url)
+      puts Html2rss.auto_source(url)
+    end
   end
 end

data/lib/html2rss/config/channel.rb CHANGED Viewed

@@ -16,7 +16,7 @@ module Html2rss
       # @return [Set<String>] the required parameter names
       def self.required_params_for_config(config)
         config.each_with_object(Set.new) do |(_, value), required_params|
-          required_params.merge(value.scan(/%<([\w_\d]+)>/).flatten) if value.is_a?(String)
+          required_params.merge(value.scan(/%<(\w+)>[s|d]/).flatten) if value.is_a?(String) # NOTE(review): inside [...] the "|" is literal, so "%<x>|" also matches; [sd] is presumably what was meant
         end
       end
@@ -25,7 +25,9 @@ module Html2rss
       # @param params [Hash]
       def initialize(channel, params: {})
         raise ArgumentError, 'channel must be a hash' unless channel.is_a?(Hash)
-        raise ArgumentError, 'missing key :url' unless channel[:url].is_a?(String)
+        url = channel[:url]
+        raise ArgumentError, 'missing key :url' unless url.is_a?(String) || url.is_a?(Addressable::URI) # also raised when :url is present but has another type
         @config = process_params(channel, params.transform_keys(&:to_sym))
       end

data/lib/html2rss/config/selectors.rb CHANGED Viewed

@@ -10,6 +10,9 @@ module Html2rss
       # Struct to represent a selector with associated attributes for extraction and processing.
       Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)
+      # raised when an invalid selector name is used
+      class InvalidSelectorName < Html2rss::Error; end
       ##
       # @param config [Hash<Symbol, Object>]
       def initialize(config)
@@ -28,9 +31,15 @@ module Html2rss
       # @param name [Symbol]
       # @return [Selector]
       def selector(name)
-        raise ArgumentError, "invalid item's selector name: #{name}" unless selector?(name)
+        raise InvalidSelectorName, "invalid selector name: #{name}" unless selector?(name)
+        keywords = config[name].slice(*available_keys) # keep only the keys the Selector struct knows
-        Selector.new(config[name])
+        if (additional_keys = keywords.keys - available_keys).any? # NOTE(review): keywords is already sliced to available_keys, so this difference is always empty and the warning cannot fire; config[name].keys was presumably intended
+          Log.warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
+        end
+        Selector.new(keywords)
       end
       ##
@@ -86,6 +95,8 @@ module Html2rss
           array.map!(&:to_sym)
         end.to_set
       end
+      def available_keys = @available_keys ||= Selector.members # memoized member names of the Selector struct
     end
   end
 end

data/lib/html2rss/item.rb CHANGED Viewed

@@ -23,7 +23,8 @@ module Html2rss
     # @param config [Html2rss::Config] Configuration object.
     # @return [Array<Html2rss::Item>] list of items fetched.
     def self.from_url(url, config)
-      body = Utils.request_body_from_url(url, convert_json_to_xml: config.json?, headers: config.headers)
+      body = Utils.request_url(url, headers: config.headers).body
+      body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
       Nokogiri.HTML(body)
               .css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
@@ -47,6 +48,7 @@ module Html2rss
     # @param method_name [Symbol]
     # @param _include_private [true, false]
     # @return [true, false]
+    # :reek:BooleanParameter { enabled: false }
     def respond_to_missing?(method_name, _include_private = false)
       config.selector?(method_name) || super
     end
@@ -110,7 +112,11 @@ module Html2rss
     #
     # @return [Array<String>] list of categories.
     def categories
-      config.category_selector_names.map { |method_name| public_send(method_name) }
+      config.category_selector_names
+            .filter_map do |method_name|
+        category = public_send(method_name)
+        category.strip unless category.to_s.empty? # nil and "" are dropped by filter_map; NOTE(review): a whitespace-only value is not empty here and is kept as ""
+      end.uniq
     end
     ##

data/lib/html2rss/utils.rb CHANGED Viewed

@@ -31,12 +31,12 @@ module Html2rss
     ##
     # Collapses whitespace runs to a single space, strips both ends, then parses and normalizes the given url.
     # @param url [String]
-    # @return [String, nil] sanitized and normalized URL, or nil if input is empty
+    # @return [Addressable::URI, nil] normalized URL, or nil if input is empty
     def self.sanitize_url(url)
       url = url.to_s.gsub(/\s+/, ' ').strip
       return if url.empty?
-      Addressable::URI.parse(url).normalize.to_s
+      Addressable::URI.parse(url).normalize
     end
     ##
@@ -71,18 +71,13 @@ module Html2rss
     ##
     # @param url [String, Addressable::URI]
-    # @param convert_json_to_xml [true, false] Should JSON be converted to XML
     # @param headers [Hash] additional HTTP request headers to use for the request
-    # @return [String] body of the HTTP response
-    def self.request_body_from_url(url, convert_json_to_xml: false, headers: {})
-      response = Faraday.new(url:, headers:) do |faraday|
+    # @return [Faraday::Response] the HTTP response of the GET request (call #body for the payload)
+    def self.request_url(url, headers: {})
+      Faraday.new(url:, headers:) do |faraday|
         faraday.use Faraday::FollowRedirects::Middleware
         faraday.adapter Faraday.default_adapter
       end.get
-      body = response.body
-      convert_json_to_xml ? ObjectToXmlConverter.new(JSON.parse(body)).call : body
     end
     ##

data/lib/html2rss/version.rb CHANGED Viewed

@@ -3,6 +3,6 @@
 ##
 # The Html2rss namespace.
 module Html2rss
-  VERSION = '0.11.0'
+  VERSION = '0.13.0'
   public_constant :VERSION
 end

data/lib/html2rss.rb CHANGED Viewed

@@ -6,10 +6,21 @@ loader = Zeitwerk::Loader.for_gem
 loader.setup
 require 'yaml'
+require 'logger'
 ##
 # The Html2rss namespace.
 module Html2rss
+  ##
+  # The logger instance: writes to $stdout, level taken from ENV['LOG_LEVEL'] (default :warn).
+  Log = Logger.new($stdout)
+  Log.level = ENV.fetch('LOG_LEVEL', :warn).upcase.to_sym # upcase works for both the String from ENV and the Symbol default
+  Log.formatter = proc do |severity, datetime, _progname, msg|
+    "#{datetime} [#{severity}] #{msg}\n"
+  end
   ##
   # The Html2rss::Error base class.
   class Error < StandardError; end
@@ -91,5 +102,15 @@ module Html2rss
     end
   end
+  ##
+  # Scrapes the provided URL and returns an RSS object.
+  # No need for a "feed config".
+  #
+  # @param url [String, Addressable::URI] the absolute http(s) URL to automatically source the feed from
+  # @return [RSS::Rss]
+  def self.auto_source(url)
+    Html2rss::AutoSource.new(url).build
+  end
   private_class_method :load_yaml, :find_feed_config
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: html2rss
 version: !ruby/object:Gem::Version
-  version: 0.11.0
+  version: 0.13.0
 platform: ruby
 authors:
 - Gil Desmarais
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-08-09 00:00:00.000000000 Z
+date: 2024-08-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: addressable
@@ -106,6 +106,20 @@ dependencies:
     - - "<"
       - !ruby/object:Gem::Version
         version: '2.0'
+- !ruby/object:Gem::Dependency
+  name: parallel
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: regexp_parser
   requirement: !ruby/object:Gem::Requirement
@@ -219,6 +233,7 @@ files:
 - html2rss.gemspec
 - lib/html2rss.rb
 - lib/html2rss/attribute_post_processors.rb
+- lib/html2rss/attribute_post_processors/base.rb
 - lib/html2rss/attribute_post_processors/gsub.rb
 - lib/html2rss/attribute_post_processors/html_to_markdown.rb
 - lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb
@@ -229,6 +244,18 @@ files:
 - lib/html2rss/attribute_post_processors/sanitize_html.rb
 - lib/html2rss/attribute_post_processors/substring.rb
 - lib/html2rss/attribute_post_processors/template.rb
+- lib/html2rss/auto_source.rb
+- lib/html2rss/auto_source/article.rb
+- lib/html2rss/auto_source/channel.rb
+- lib/html2rss/auto_source/cleanup.rb
+- lib/html2rss/auto_source/reducer.rb
+- lib/html2rss/auto_source/rss_builder.rb
+- lib/html2rss/auto_source/scraper.rb
+- lib/html2rss/auto_source/scraper/schema.rb
+- lib/html2rss/auto_source/scraper/schema/base.rb
+- lib/html2rss/auto_source/scraper/semantic_html.rb
+- lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
+- lib/html2rss/auto_source/scraper/semantic_html/image.rb
 - lib/html2rss/cli.rb
 - lib/html2rss/config.rb
 - lib/html2rss/config/channel.rb
@@ -252,7 +279,7 @@ licenses:
 - MIT
 metadata:
   allowed_push_host: https://rubygems.org
-  changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.11.0
+  changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.13.0
   rubygems_mfa_required: 'true'
 post_install_message:
 rdoc_options: []