RubyGems - html2rss - Versions diffs - 0.13.0 → 0.15.0 - Mend

html2rss 0.13.0 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

checksums.yaml +4 -4
data/README.md +3 -3
data/lib/html2rss/auto_source/channel.rb +12 -6
data/lib/html2rss/auto_source/rss_builder.rb +2 -0
data/lib/html2rss/auto_source/scraper/html.rb +87 -0
data/lib/html2rss/auto_source/scraper/schema.rb +6 -0
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +22 -9
data/lib/html2rss/auto_source/scraper.rb +1 -0
data/lib/html2rss/auto_source.rb +20 -17
data/lib/html2rss/config.rb +1 -4
data/lib/html2rss/item.rb +1 -1
data/lib/html2rss/rss_builder/stylesheet.rb +38 -23
data/lib/html2rss/utils.rb +6 -0
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +9 -13
metadata +5 -4

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7a2bf557dd65533533e07b4581e195f2d2b32ff906831526a4d7aed27a558d71
-  data.tar.gz: f42e5f03649a08219d310a2545413c371f851530c4d323fd68ef783b4b3b5e13
+  metadata.gz: d89191b35f643372cc18b880dab7535d18a10d9fd123897460ee16c5e990a5d9
+  data.tar.gz: 71cb356f5261b2e6a3d2152afcb68f658e78d5fec5ff15bc67ed0d5bd153fc00
 SHA512:
-  metadata.gz: 724a1fa8ab15ae140278eb9b055f22e7aad12e94627795f7a2f13c78f5421607e39d6ba040821b4c47b69f963cc0180bf8e964ff0b896403cb6305ed1d67dbb5
-  data.tar.gz: a06c2e16b0b51c6b6d2184430efc2a4e8b2812fee413163aa2991567e7608141f1c18189fdded58c8c3383940c4790478cd631abc6a1470ad648b2030fdefaab
+  metadata.gz: 46f048feae342844df1af51c741d681677192c1dc84452fae1002f5cca5b406c0698a426ec6e532572c4fb4f6fb896a966862d8d2599b8dd742a174707289aed
+  data.tar.gz: 98d0316c64bb5a160d26d5efa59b25901b3a64e572795bbd840539fe69d84a4ea3c797bb16721edb73277d1b9bfb9238f9d40ea2b9bb4ebeffc81e8790a02062

data/README.md CHANGED

@@ -2,7 +2,7 @@
 [![Gem Version](https://badge.fury.io/rb/html2rss.svg)](http://rubygems.org/gems/html2rss/) [![Yard Docs](http://img.shields.io/badge/yard-docs-blue.svg)](https://www.rubydoc.info/gems/html2rss) ![Retro Badge: valid RSS](https://validator.w3.org/feed/images/valid-rss-rogers.png)
-`html2rss` is a Ruby gem that generates RSS 2.0 feeds from a _feed config_.
+`html2rss` is a Ruby gem that generates RSS 2.0 feeds from websites automatically, and as a fallback via _feed config_.
 With the _feed config_, you provide a URL to scrape and CSS selectors for extracting information (like title, URL, etc.). The gem builds the RSS feed accordingly. [Extractors](#using-extractors) and chainable [post processors](#using-post-processors) make information extraction, processing, and sanitizing a breeze. The gem also supports [scraping JSON](#scraping-and-handling-json-responses) responses and [setting HTTP request headers](#set-any-http-header-in-the-request).
@@ -26,9 +26,9 @@ You can also install it as a dependency in your Ruby project:
 ## Generating a feed on the CLI
-### using automatic scraping
+### using automatic generation
-html2rss offers an automatic scrapting feature. Try it with:
+html2rss offers an automatic RSS generation feature. Try it with:
 `html2rss auto https://unmatchedstyle.com/`

data/lib/html2rss/auto_source/channel.rb CHANGED

@@ -10,21 +10,27 @@ module Html2rss
       ##
       #
       # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
-      # @param response [Faraday::Response] The URL of the HTML document.
-      def initialize(parsed_body, url:, response:, articles: [])
+      # @param url [Addressable::URI] The URL of the channel.
+      # @param headers [Hash<String, String>] the http headers
+      # @param articles [Array<Html2rss::AutoSource::Article>] The articles.
+      def initialize(parsed_body, url:, headers:, articles: [], stylesheets: [])
         @parsed_body = parsed_body
         @url = url
-        @response = response
+        @headers = headers
         @articles = articles
+        @stylesheets = stylesheets
       end
+      attr_writer :articles
+      attr_reader :stylesheets
       def url = extract_url
       def title = extract_title
       def language = extract_language
       def description = extract_description
       def image = extract_image
       def ttl = extract_ttl
-      def last_build_date = response.headers['last-modified']
+      def last_build_date = headers['last-modified']
       def generator
         "html2rss V. #{::Html2rss::VERSION} (using auto_source scrapers: #{scraper_counts})"
@@ -32,7 +38,7 @@ module Html2rss
       private
-      attr_reader :parsed_body, :response
+      attr_reader :parsed_body, :headers
       def extract_url
         @url.normalize.to_s
@@ -58,7 +64,7 @@ module Html2rss
       end
       def extract_ttl
-        ttl = response.headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
+        ttl = headers['cache-control']&.match(/max-age=(\d+)/)&.[](1)
         return unless ttl
         ttl.to_i.fdiv(60).ceil

data/lib/html2rss/auto_source/rss_builder.rb CHANGED

@@ -31,6 +31,8 @@ module Html2rss
       def call
         RSS::Maker.make('2.0') do |maker|
+          Html2rss::RssBuilder::Stylesheet.add(maker, channel.stylesheets)
           make_channel(maker.channel)
           make_items(maker)
         end

data/lib/html2rss/auto_source/scraper/html.rb ADDED

@@ -0,0 +1,87 @@
+# frozen_string_literal: true
+require 'nokogiri'
+require 'set'
+module Html2rss
+  class AutoSource
+    module Scraper
+      ##
+      # Scrapes articles from HTML pages by
+      # finding similar structures around anchor tags in the parsed_body.
+      class Html
+        include Enumerable
+        def self.articles?(parsed_body)
+          new(parsed_body, url: '').any?
+        end
+        def self.parent_until_condition(node, condition)
+          return nil if !node || node.parent.name == 'html'
+          return node if condition.call(node)
+          parent_until_condition(node.parent, condition)
+        end
+        ##
+        # Simplify an XPath selector by removing the index notation.
+        def self.simplify_xpath(xpath)
+          xpath.gsub(/\[\d+\]/, '')
+        end
+        def initialize(parsed_body, url:)
+          @parsed_body = parsed_body
+          @url = url
+          @css_selectors = Hash.new(0)
+        end
+        attr_reader :parsed_body
+        ##
+        # @yieldparam [Hash] The scraped article hash
+        # @return [Enumerator] Enumerator for the scraped articles
+        def each
+          return enum_for(:each) unless block_given?
+          return if frequent_selectors.empty?
+          frequent_selectors.each do |selector|
+            parsed_body.xpath(selector).each do |selected_tag|
+              article_tag = self.class.parent_until_condition(selected_tag, method(:article_condition))
+              article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call
+              yield article_hash if article_hash
+            end
+          end
+        end
+        ##
+        # Find all the anchors in root.
+        # @param root [Nokogiri::XML::Node] The root node to search for anchors
+        # @return [Set<String>] The set of CSS selectors which exist at least min_frequency times
+        def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
+          @frequent_selectors ||= begin
+            root.traverse do |node|
+              next if !node.element? || node.name != 'a'
+              @css_selectors[self.class.simplify_xpath(node.path)] += 1
+            end
+            @css_selectors.keys
+                          .select { |selector| (@css_selectors[selector]).to_i >= min_frequency }
+                          .to_set
+          end
+        end
+        private
+        def article_condition(node)
+          return true if %w[body html].include?(node.name)
+          return true if node.parent.css('a').size > 1
+          false
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/auto_source/scraper/schema.rb CHANGED

@@ -1,5 +1,9 @@
 # frozen_string_literal: true
+require 'json'
+require 'nokogiri'
+require 'set'
 module Html2rss
   class AutoSource
     module Scraper
@@ -99,6 +103,8 @@ module Html2rss
         # @yield [Hash] Each scraped article_hash
         # @return [Array<Hash>] the scraped article_hashes
         def each(&)
+          return enum_for(:each) unless block_given?
           schema_objects.filter_map do |schema_object|
             next unless (klass = self.class.scraper_for_schema_object(schema_object))
             next unless (article_hash = klass.new(schema_object, url:).call)

data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb CHANGED

@@ -35,13 +35,13 @@ module Html2rss
           def initialize(article_tag, url:)
             @article_tag = article_tag
             @url = url
-            @heading = find_heading
-            @extract_url = find_url
           end
           # @return [Hash, nil] The scraped article or nil.
           def call
-            return unless heading
+            @heading = find_heading || closest_anchor || return
+            @extract_url = find_url
             {
               title: extract_title,
@@ -71,14 +71,20 @@ module Html2rss
             times.min
           end
+          ##
+          # Find the heading of the article.
+          # @return [Nokogiri::XML::Node, nil]
           def find_heading
             heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
+            return if heading_tags.empty?
             smallest_heading = heading_tags.keys.min
-            heading_tags[smallest_heading]&.max_by { |tag| tag.text.size }
+            heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
           end
           def extract_title
-            @extract_title ||= if heading.children.empty? && heading.text
+            @extract_title ||= if heading && (heading.children.empty? || heading.text)
                                  visible_text_from_tag(heading)
                                else
                                  visible_text_from_tag(
@@ -101,9 +107,12 @@ module Html2rss
             description.empty? ? nil : description
           end
+          def closest_anchor
+            SemanticHtml.find_closest_selector(heading || article_tag,
+                                               selector: 'a[href]:not([href=""])')
+          end
           def find_url
-            closest_anchor = SemanticHtml.find_closest_selector(heading || article_tag,
-                                                                selector: 'a[href]:not([href=""])')
             href = closest_anchor&.[]('href')&.split('#')&.first&.strip
             Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
           end
@@ -113,8 +122,12 @@ module Html2rss
           end
           def generate_id
-            [article_tag['id'], article_tag.at_css('[id]')&.attr('id'),
-             extract_url&.path].compact.reject(&:empty?).first
+            [
+              article_tag['id'],
+              article_tag.at_css('[id]')&.attr('id'),
+              extract_url&.path,
+              extract_url&.query
+            ].compact.reject(&:empty?).first
           end
         end
       end

data/lib/html2rss/auto_source/scraper.rb CHANGED

@@ -10,6 +10,7 @@ module Html2rss
     #
     module Scraper
       SCRAPERS = [
+        Html,
         Schema,
         SemanticHtml
       ].freeze

data/lib/html2rss/auto_source.rb CHANGED

@@ -16,16 +16,18 @@ module Html2rss
     SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
-    def initialize(url)
-      unless url.is_a?(String) || url.is_a?(Addressable::URI)
-        raise ArgumentError,
-              'URL must be a String or Addressable::URI'
-      end
-      @url = Addressable::URI.parse(url)
-      raise ArgumentError, 'URL must be absolute' unless @url.absolute?
-      raise UnsupportedUrlScheme, "#{@url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(@url.scheme)
+    ##
+    # @param url [Addressable::URI] The URL to extract articles from.
+    # @param body [String] The body of the response.
+    # @param headers [Hash] The headers of the response.
+    def initialize(url, body:, headers: {})
+      raise ArgumentError, 'URL must be a Addressable::URI' unless url.is_a?(Addressable::URI)
+      raise ArgumentError, 'URL must be absolute' unless url.absolute?
+      raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)
+      @url = url
+      @body = body
+      @headers = headers
     end
     def build
@@ -34,6 +36,8 @@ module Html2rss
       Reducer.call(articles, url:)
       Cleanup.call(articles, url:, keep_different_domain: true)
+      channel.articles = articles
       Html2rss::AutoSource::RssBuilder.new(
         channel:,
         articles:
@@ -57,21 +61,20 @@ module Html2rss
     end
     def channel
-      Channel.new(parsed_body, response:, url:, articles:)
+      @channel ||= Channel.new(parsed_body, headers: @headers, url:)
     end
     private
     attr_reader :url
-    def response
-      @response ||= Html2rss::Utils.request_url(url)
-    end
-    # Parses the HTML body of the response using Nokogiri.
     # @return [Nokogiri::HTML::Document]
     def parsed_body
-      @parsed_body ||= Nokogiri.HTML(response.body).freeze
+      @parsed_body ||= Nokogiri.HTML(@body)
+                               .tap do |doc|
+        # Remove comments from the document
+        doc.xpath('//comment()').each(&:remove)
+      end.freeze
     end
   end
 end

data/lib/html2rss/config.rb CHANGED

@@ -18,9 +18,6 @@ module Html2rss
     # Thrown when the feed config does not contain a value at `:channel`.
     class ChannelMissing < Html2rss::Error; end
-    # Struct to store XML Stylesheet attributes
-    Stylesheet = Struct.new(:href, :type, :media, keyword_init: true)
     def_delegator :@channel, :author
     def_delegator :@channel, :ttl
     def_delegator :@channel, :title
@@ -75,7 +72,7 @@ module Html2rss
     #
     # @return [Array<Stylesheet>] Array of Stylesheet structs.
     def stylesheets
-      @global.fetch(:stylesheets, []).map { |attributes| Stylesheet.new(attributes) }
+      @global.fetch(:stylesheets, []).map { |attributes| Html2rss::RssBuilder::Stylesheet.new(**attributes) }
     end
     # Provides read-only access to the channel object.

data/lib/html2rss/item.rb CHANGED

@@ -19,7 +19,7 @@ module Html2rss
     ##
     # Fetches items from a given URL using configuration settings.
     #
-    # @param url [String] URL to fetch items from.
+    # @param url [Addressable::URI] URL to fetch items from.
     # @param config [Html2rss::Config] Configuration object.
     # @return [Array<Html2rss::Item>] list of items fetched.
     def self.from_url(url, config)

data/lib/html2rss/rss_builder/stylesheet.rb CHANGED

@@ -3,35 +3,50 @@
 module Html2rss
   module RssBuilder
     ##
-    # Adds XML stylesheet tags (with the provided maker).
+    # Represents a stylesheet.
     class Stylesheet
-      ##
-      # Adds the stylesheet XML tags to the RSS.
-      #
-      # @param maker [RSS::Maker::RSS20] RSS maker object.
-      # @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations.
-      # @return [nil]
-      def self.add(maker, stylesheets)
-        stylesheets.each do |stylesheet|
-          add_stylesheet(maker, stylesheet)
+      class << self
+        ##
+        # Adds the stylesheet XML tags to the RSS.
+        #
+        # @param maker [RSS::Maker::RSS20] RSS maker object.
+        # @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations.
+        # @return [nil]
+        def add(maker, stylesheets)
+          stylesheets.each do |stylesheet|
+            add_stylesheet(maker, stylesheet)
+          end
         end
-      end
-      ##
-      # Adds a single Stylesheet to the RSS.
-      #
-      # @param maker [RSS::Maker::RSS20] RSS maker object.
-      # @param stylesheet [Html2rss::Config::Stylesheet] Stylesheet configuration.
-      # @return [nil]
-      def self.add_stylesheet(maker, stylesheet)
-        maker.xml_stylesheets.new_xml_stylesheet do |xss|
-          xss.href = stylesheet.href
-          xss.type = stylesheet.type
-          xss.media = stylesheet.media
+        private
+        ##
+        # Adds a single Stylesheet to the RSS.
+        #
+        # @param maker [RSS::Maker::RSS20] RSS maker object.
+        # @param stylesheet [Html2rss::Config::Stylesheet] Stylesheet configuration.
+        # @return [nil]
+        def add_stylesheet(maker, stylesheet)
+          maker.xml_stylesheets.new_xml_stylesheet do |xss|
+            xss.href = stylesheet.href
+            xss.type = stylesheet.type
+            xss.media = stylesheet.media
+          end
         end
       end
-      private_class_method :add_stylesheet
+      TYPES = ['text/css', 'text/xsl'].freeze
+      def initialize(href:, type:, media: 'all')
+        raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)
+        raise ArgumentError, 'stylesheet.type invalid' unless TYPES.include?(type)
+        raise ArgumentError, 'stylesheet.media must be a String' unless media.is_a?(String)
+        @href = href
+        @type = type
+        @media = media
+      end
+      attr_reader :href, :type, :media
     end
   end
 end

data/lib/html2rss/utils.rb CHANGED

@@ -44,6 +44,7 @@ module Html2rss
     #
     # @param time_zone [String]
     # @param default_time_zone [String]
+    # @yield block to execute with the given time zone
     # @return [Object] whatever the given block returns
     def self.use_zone(time_zone, default_time_zone: Time.now.getlocal.zone)
       raise ArgumentError, 'a block is required' unless block_given?
@@ -74,6 +75,11 @@ module Html2rss
     # @param headers [Hash] additional HTTP request headers to use for the request
     # @return [Faraday::Response] body of the HTTP response
     def self.request_url(url, headers: {})
+      url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
+      raise ArgumentError, 'URL must be absolute' unless url.absolute?
+      raise ArgumentError, 'URL must not contain an @ characater' if url.to_s.include?('@')
       Faraday.new(url:, headers:) do |faraday|
         faraday.use Faraday::FollowRedirects::Middleware
         faraday.adapter Faraday.default_adapter

data/lib/html2rss/version.rb CHANGED

@@ -3,6 +3,6 @@
 ##
 # The Html2rss namespace.
 module Html2rss
-  VERSION = '0.13.0'
+  VERSION = '0.15.0'
   public_constant :VERSION
 end

data/lib/html2rss.rb CHANGED

@@ -5,8 +5,9 @@ require 'zeitwerk'
 loader = Zeitwerk::Loader.for_gem
 loader.setup
-require 'yaml'
+require 'addressable'
 require 'logger'
+require 'yaml'
 ##
 # The Html2rss namespace.
@@ -43,7 +44,7 @@ module Html2rss
   # @param params [Hash] Dynamic parameters for the feed configuration.
   # @return [RSS::Rss] RSS object generated from the configuration.
   def self.feed_from_yaml_config(file, name = nil, global_config: {}, params: {})
-    yaml = load_yaml(file)
+    yaml = YAML.safe_load_file(file, symbolize_names: true)
     feeds = yaml[CONFIG_KEY_FEEDS] || {}
     feed_config = find_feed_config(yaml, feeds, name, global_config)
@@ -73,15 +74,6 @@ module Html2rss
     RssBuilder.build(config)
   end
-  ##
-  # Loads and parses the YAML file.
-  #
-  # @param file [String] Path to the YAML file.
-  # @return [Hash] Parsed YAML content.
-  def self.load_yaml(file)
-    YAML.safe_load_file(file, symbolize_names: true)
-  end
   ##
   # Builds the feed configuration based on the provided parameters.
   #
@@ -109,8 +101,12 @@ module Html2rss
   # @param url [String] the URL to automatically source the feed from
   # @return [RSS::Rss]
   def self.auto_source(url)
-    Html2rss::AutoSource.new(url).build
+    url = Addressable::URI.parse(url)
+    response = Html2rss::Utils.request_url(url)
+    Html2rss::AutoSource.new(url, body: response.body, headers: response.headers).build
   end
-  private_class_method :load_yaml, :find_feed_config
+  private_class_method :find_feed_config
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: html2rss
 version: !ruby/object:Gem::Version
-  version: 0.13.0
+  version: 0.15.0
 platform: ruby
 authors:
 - Gil Desmarais
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-08-16 00:00:00.000000000 Z
+date: 2024-10-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: addressable
@@ -251,6 +251,7 @@ files:
 - lib/html2rss/auto_source/reducer.rb
 - lib/html2rss/auto_source/rss_builder.rb
 - lib/html2rss/auto_source/scraper.rb
+- lib/html2rss/auto_source/scraper/html.rb
 - lib/html2rss/auto_source/scraper/schema.rb
 - lib/html2rss/auto_source/scraper/schema/base.rb
 - lib/html2rss/auto_source/scraper/semantic_html.rb
@@ -279,7 +280,7 @@ licenses:
 - MIT
 metadata:
   allowed_push_host: https://rubygems.org
-  changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.13.0
+  changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.15.0
   rubygems_mfa_required: 'true'
 post_install_message:
 rdoc_options: []
@@ -296,7 +297,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.11
+rubygems_version: 3.5.16
 signing_key:
 specification_version: 4
 summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors