RubyGems - html2rss - Versions diffs - 0.15.0 → 0.17.0 - Mend

html2rss 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

checksums.yaml +4 -4
data/README.md +112 -44
data/html2rss.gemspec +3 -2
data/lib/html2rss/attribute_post_processors/sanitize_html.rb +8 -1
data/lib/html2rss/auto_source/article.rb +37 -5
data/lib/html2rss/auto_source/channel.rb +21 -28
data/lib/html2rss/auto_source/cleanup.rb +0 -16
data/lib/html2rss/auto_source/rss_builder.rb +1 -1
data/lib/html2rss/auto_source/scraper/html.rb +21 -12
data/lib/html2rss/auto_source/scraper/schema/item_list.rb +34 -0
data/lib/html2rss/auto_source/scraper/schema/list_item.rb +25 -0
data/lib/html2rss/auto_source/scraper/schema/thing.rb +104 -0
data/lib/html2rss/auto_source/scraper/schema.rb +22 -34
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +41 -41
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +6 -6
data/lib/html2rss/auto_source/scraper/semantic_html.rb +3 -2
data/lib/html2rss/auto_source.rb +0 -7
data/lib/html2rss/cli.rb +11 -4
data/lib/html2rss/config/channel.rb +7 -1
data/lib/html2rss/config/selectors.rb +2 -1
data/lib/html2rss/config.rb +1 -0
data/lib/html2rss/item.rb +7 -2
data/lib/html2rss/request_service/browserless_strategy.rb +53 -0
data/lib/html2rss/request_service/context.rb +46 -0
data/lib/html2rss/request_service/faraday_strategy.rb +24 -0
data/lib/html2rss/request_service/puppet_commander.rb +61 -0
data/lib/html2rss/request_service/response.rb +27 -0
data/lib/html2rss/request_service/strategy.rb +28 -0
data/lib/html2rss/request_service.rb +97 -0
data/lib/html2rss/rss_builder/stylesheet.rb +7 -0
data/lib/html2rss/utils.rb +23 -26
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +5 -5
metadata +31 -11
data/lib/html2rss/auto_source/scraper/schema/base.rb +0 -61

data/lib/html2rss/auto_source/scraper/schema/thing.rb ADDED Viewed

@@ -0,0 +1,104 @@
+# frozen_string_literal: true
+require 'date'
+module Html2rss
+  class AutoSource
+    module Scraper
+      class Schema
+        ##
+        # A Thing is kind of the 'base class' for Schema.org schema_objects.
+        #
+        # It wraps one schema object (accessed with Symbol keys) and exposes the
+        # attributes listed in DEFAULT_ATTRIBUTES as public methods.
+        #
+        # @see https://schema.org/Thing
+        class Thing
+          # Schema.org `@type` values for which
+          # Schema.scraper_for_schema_object selects this class.
+          SUPPORTED_TYPES = %w[
+            AdvertiserContentArticle
+            AnalysisNewsArticle
+            APIReference
+            Article
+            AskPublicNewsArticle
+            BackgroundNewsArticle
+            BlogPosting
+            DiscussionForumPosting
+            LiveBlogPosting
+            NewsArticle
+            OpinionNewsArticle
+            Report
+            ReportageNewsArticle
+            ReviewNewsArticle
+            SatiricalArticle
+            ScholarlyArticle
+            SocialMediaPosting
+            TechArticle
+          ].to_set.freeze
+          # Names of the public methods whose results make up the hash returned by #call.
+          DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
+          # @param schema_object [Hash] the schema object; read with Symbol keys (e.g. :url, :@id)
+          # @param url [Addressable::URI] base URL handed to Utils.build_absolute_url_from_relative
+          #   (type presumed from the sibling scrapers -- confirm against callers)
+          def initialize(schema_object, url:)
+            @schema_object = schema_object
+            @url = url
+          end
+          # Calls every method named in DEFAULT_ATTRIBUTES via public_send.
+          #
+          # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
+          def call
+            DEFAULT_ATTRIBUTES.to_h do |attribute|
+              [attribute, public_send(attribute)]
+            end
+          end
+          # Uses the object's `@id`, falling back to the path of #url.
+          # Only a non-empty id is memoized; an empty one is recomputed on each call.
+          #
+          # @return [String, nil] the id, or nil when neither source yields a non-empty string
+          def id
+            return @id if defined?(@id)
+            id = (schema_object[:@id] || url&.path).to_s
+            return if id.empty?
+            @id = id
+          end
+          # NOTE(review): reads the :title key; Schema.org articles usually carry
+          # `headline` / `name` instead -- confirm this key is the intended one.
+          def title = schema_object[:title]
+          # Picks the longest candidate, compared by `to_s.size` (nil counts as 0).
+          #
+          # NOTE(review): :schema_object_body is not a Schema.org property name; it looks
+          # like it may have been meant to be :articleBody -- confirm.
+          #
+          # @return [Object, nil] the longest of the three values, nil if all are nil
+          def description
+            schema_object.values_at(:description, :schema_object_body, :abstract)
+                         .max_by { |string| string.to_s.size }
+          end
+          # Logs at debug level and returns nil when the schema object has no (or an empty) :url.
+          #
+          # @return [Addressable::URI, nil] the URL of the schema object
+          def url
+            url = schema_object[:url]
+            if url.to_s.empty?
+              Log.debug("Schema#Thing.url: no url in schema_object: #{schema_object.inspect}")
+              return
+            end
+            Utils.build_absolute_url_from_relative(url, @url)
+          end
+          # @return [Object, nil] the first entry of #image_urls made absolute against the
+          #   base URL, or nil when there is no image
+          def image
+            if (image_url = image_urls.first)
+              Utils.build_absolute_url_from_relative(image_url, @url)
+            end
+          end
+          # Returns the raw :datePublished value; no parsing happens in this class.
+          def published_at = schema_object[:datePublished]
+          private
+          attr_reader :schema_object
+          # Collects image URLs from the :image and :thumbnailUrl values.
+          # Strings are taken as-is; Hashes typed 'ImageObject' contribute their :url
+          # (or :contentUrl). Any other value matches neither branch and is dropped
+          # by filter_map.
+          #
+          # @return [Array] the collected image URLs, possibly empty
+          def image_urls
+            schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
+              next unless object
+              if object.is_a?(String)
+                object
+              elsif object.is_a?(Hash) && object[:@type] == 'ImageObject'
+                object[:url] || object[:contentUrl]
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/auto_source/scraper/schema.rb CHANGED Viewed

@@ -2,58 +2,38 @@
 require 'json'
 require 'nokogiri'
-require 'set'
 module Html2rss
   class AutoSource
     module Scraper
       ##
-      # Scraps articles from Schema.org objects, by looking for the objects in:
+      # Scrapes articles from Schema.org objects, by looking for the objects in:
-      #  1. <script type="application/ld+json"> "schema" tag.
-      #  2. tbd
+      # <script type="application/ld+json"> "schema" tags.
       #
       # See:
-      # 1. https://schema.org/NewsArticle
+      # 1. https://schema.org/docs/full.html
       # 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
       class Schema
         include Enumerable
         TAG_SELECTOR = 'script[type="application/ld+json"]'
-        SCHEMA_OBJECT_TYPES = %w[
-          AdvertiserContentArticle
-          AnalysisNewsArticle
-          APIReference
-          Article
-          AskPublicNewsArticle
-          BackgroundNewsArticle
-          BlogPosting
-          DiscussionForumPosting
-          LiveBlogPosting
-          NewsArticle
-          OpinionNewsArticle
-          Report
-          ReportageNewsArticle
-          ReviewNewsArticle
-          SatiricalArticle
-          ScholarlyArticle
-          SocialMediaPosting
-          TechArticle
-        ].to_set.freeze
         class << self
           def articles?(parsed_body)
             parsed_body.css(TAG_SELECTOR).any? do |script|
-              SCHEMA_OBJECT_TYPES.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
+              (Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES).any? do |type|
+                script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/)
+              end
             end
           end
           ##
           # Returns a flat array
           # of all supported schema objects
-          # by recursively traversing the `from` object.
+          # by recursively traversing the given `object`.
           #
-          # @param object [Hash, Array]
+          # @param object [Hash, Array, Nokogiri::XML::Element]
           # @return [Array<Hash>] the schema_objects, or an empty array
           # :reek:DuplicateMethodCall
           def from(object)
@@ -74,12 +54,16 @@ module Html2rss
           end
           ##
-          # @return [Scraper::Schema::Base, Scraper::Schema::NewsArticle, nil]
+          # @return [Scraper::Schema::Thing, Scraper::Schema::ItemList, nil] a class responding to `#call`
           def scraper_for_schema_object(schema_object)
-            if SCHEMA_OBJECT_TYPES.member?(schema_object[:@type])
-              Base
+            type = schema_object[:@type]
+            if Thing::SUPPORTED_TYPES.member?(type)
+              Thing
+            elsif ItemList::SUPPORTED_TYPES.member?(type)
+              ItemList
             else
-              Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{schema_object[:@type]}")
+              Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{type}")
               nil
             end
           end
@@ -107,9 +91,13 @@ module Html2rss
           schema_objects.filter_map do |schema_object|
             next unless (klass = self.class.scraper_for_schema_object(schema_object))
-            next unless (article_hash = klass.new(schema_object, url:).call)
+            next unless (results = klass.new(schema_object, url:).call)
-            yield article_hash
+            if results.is_a?(Array)
+              results.each { |result| yield(result) } # rubocop:disable Style/ExplicitBlockArgument
+            else
+              yield(results)
+            end
           end
         end

data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb CHANGED Viewed

@@ -1,7 +1,5 @@
 # frozen_string_literal: true
-require 'set'
 module Html2rss
   class AutoSource
     module Scraper
@@ -33,6 +31,8 @@ module Html2rss
           end
           def initialize(article_tag, url:)
+            raise ArgumentError, 'article_tag is required' unless article_tag
             @article_tag = article_tag
             @url = url
           end
@@ -57,20 +57,6 @@ module Html2rss
           attr_reader :article_tag, :url, :heading, :extract_url
-          def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
-          # @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
-          def extract_published_at
-            times = article_tag.css('time[datetime]')
-                               .filter_map do |tag|
-              DateTime.parse(tag['datetime'])
-            rescue ArgumentError, TypeError
-              nil
-            end
-            times.min
-          end
           ##
           # Find the heading of the article.
           # @return [Nokogiri::XML::Node, nil]
@@ -80,18 +66,36 @@ module Html2rss
             return if heading_tags.empty?
             smallest_heading = heading_tags.keys.min
-            heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size }
+            heading_tags[smallest_heading]&.max_by { |tag| visible_text_from_tag(tag)&.size.to_i }
+          end
+          def visible_text_from_tag(tag, separator: ' ') = self.class.visible_text_from_tag(tag, separator:)
+          def closest_anchor
+            SemanticHtml.find_closest_selector(heading || article_tag,
+                                               selector: 'a[href]:not([href=""])')
+          end
+          def find_url
+            href = closest_anchor&.[]('href')
+            return if (parts = href.to_s.split('#')).empty?
+            Utils.build_absolute_url_from_relative(parts.first.strip, url)
           end
           def extract_title
-            @extract_title ||= if heading && (heading.children.empty? || heading.text)
-                                 visible_text_from_tag(heading)
-                               else
-                                 visible_text_from_tag(
-                                   article_tag.css(HEADING_TAGS.join(','))
-                                              .max_by { |tag| tag.text.size }
-                                 )
-                               end
+            if heading && (heading.children.empty? || heading.text)
+              visible_text_from_tag(heading)
+            else
+              visible_text_from_tag(article_tag.css(HEADING_TAGS.join(','))
+                                               .max_by { |tag| tag.text.size })
+            end
+          end
+          def extract_image
+            Image.call(article_tag, url:)
           end
           def extract_description
@@ -101,26 +105,10 @@ module Html2rss
             description = visible_text_from_tag(article_tag)
             return nil unless description
-            title_text = extract_title
-            description.gsub!(title_text, '') if title_text
             description.strip!
             description.empty? ? nil : description
           end
-          def closest_anchor
-            SemanticHtml.find_closest_selector(heading || article_tag,
-                                               selector: 'a[href]:not([href=""])')
-          end
-          def find_url
-            href = closest_anchor&.[]('href')&.split('#')&.first&.strip
-            Utils.build_absolute_url_from_relative(href, url) unless href.to_s.empty?
-          end
-          def extract_image
-            Image.call(article_tag, url:)
-          end
           def generate_id
             [
               article_tag['id'],
@@ -129,6 +117,18 @@ module Html2rss
               extract_url&.query
             ].compact.reject(&:empty?).first
           end
+          # @see https://developer.mozilla.org/en-US/docs/Web/API/HTMLTimeElement/dateTime
+          def extract_published_at
+            times = article_tag.css('time[datetime]')
+                               .filter_map do |tag|
+              DateTime.parse(tag['datetime'])
+            rescue ArgumentError, TypeError
+              nil
+            end
+            times.min
+          end
         end
       end
     end

data/lib/html2rss/auto_source/scraper/semantic_html/image.rb CHANGED Viewed

@@ -28,14 +28,14 @@ module Html2rss
           # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
           def self.from_source(article_tag) # rubocop:disable Metrics/AbcSize
             hash = article_tag.css('img[srcset], picture > source[srcset]')
-                              .flat_map { |source| source['srcset'].to_s.split(',') }
-                              .filter_map do |line|
-              width, url = line.split.reverse
-              next if url.nil? || url.start_with?('data:')
+                              .flat_map do |source|
+              source['srcset'].to_s.scan(/(\S+)\s+(\d+w|\d+h)/).map do |url, width|
+                next if url.nil? || url.start_with?('data:')
-              width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
+                width_value = width.to_i.zero? ? 0 : width.scan(/\d+/).first.to_i
-              [width_value, url.strip]
+                [width_value, url.strip]
+              end
             end.to_h
             hash[hash.keys.max]

data/lib/html2rss/auto_source/scraper/semantic_html.rb CHANGED Viewed

@@ -106,9 +106,10 @@ module Html2rss
           SemanticHtml.anchor_tag_selector_pairs.each do |tag_name, selector|
             parsed_body.css(selector).each do |selected_tag|
               article_tag = SemanticHtml.find_tag_in_ancestors(selected_tag, tag_name)
-              article_hash = Extractor.new(article_tag, url: @url).call
-              yield article_hash if article_hash
+              if article_tag && (article_hash = Extractor.new(article_tag, url: @url).call)
+                yield article_hash
+              end
             end
           end
         end

data/lib/html2rss/auto_source.rb CHANGED Viewed

@@ -11,20 +11,13 @@ module Html2rss
   # It uses a set of ArticleExtractors to extract articles, utilizing popular ways of
   # marking articles, e.g. schema, microdata, open graph, etc.
   class AutoSource
-    class UnsupportedUrlScheme < Html2rss::Error; end
     class NoArticlesFound < Html2rss::Error; end
-    SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
     ##
     # @param url [Addressable::URI] The URL to extract articles from.
     # @param body [String] The body of the response.
     # @param headers [Hash] The headers of the response.
     def initialize(url, body:, headers: {})
-      raise ArgumentError, 'URL must be a Addressable::URI' unless url.is_a?(Addressable::URI)
-      raise ArgumentError, 'URL must be absolute' unless url.absolute?
-      raise UnsupportedUrlScheme, "#{url.scheme} not supported" unless SUPPORTED_URL_SCHEMES.include?(url.scheme)
       @url = url
       @body = body
       @headers = headers

data/lib/html2rss/cli.rb CHANGED Viewed

@@ -2,7 +2,6 @@
 require_relative '../html2rss'
 require 'thor'
-require 'addressable'
 ##
 # The Html2rss namespace / command line interface.
@@ -26,14 +25,22 @@ module Html2rss
     def feed(yaml_file, *options)
       raise "File '#{yaml_file}' does not exist" unless File.exist?(yaml_file)
-      feed_name = options.shift
+      feed_name = options.shift unless options.first&.include?('=')
       params = options.to_h { |opt| opt.split('=', 2) }
       puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
     end
-    desc 'auto URL', 'automatically sources an RSS feed from the URL'
+    desc 'auto URL', 'Automatically sources an RSS feed from the URL'
+    method_option :strategy,
+                  type: :string,
+                  desc: 'The strategy to request the URL',
+                  enum: RequestService.strategy_names,
+                  default: RequestService.default_strategy_name
     def auto(url)
-      puts Html2rss.auto_source(url)
+      strategy = options.fetch(:strategy) { RequestService.default_strategy_name }.to_sym
+      puts Html2rss.auto_source(url, strategy:)
     end
   end
 end

data/lib/html2rss/config/channel.rb CHANGED Viewed

@@ -55,7 +55,7 @@ module Html2rss
       ##
       # @return [String]
       def title
-        config.fetch(:title) { Utils.titleized_url(url) }
+        config.fetch(:title) { Utils.titleized_channel_url(url) }
       end
       ##
@@ -88,6 +88,12 @@ module Html2rss
         config.fetch(:json, false)
       end
+      ##
+      # @return [Symbol]
+      def strategy
+        config.fetch(:strategy) { RequestService.default_strategy_name }.to_sym
+      end
       private
       # @return [Hash<Symbol, Object>]

data/lib/html2rss/config/selectors.rb CHANGED Viewed

@@ -8,7 +8,8 @@ module Html2rss
       ITEMS_SELECTOR_NAME = :items
       # Struct to represent a selector with associated attributes for extraction and processing.
-      Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)
+      Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, :content_type,
+                            keyword_init: true)
       # raised when an invalid selector name is used
       class InvalidSelectorName < Html2rss::Error; end

data/lib/html2rss/config.rb CHANGED Viewed

@@ -27,6 +27,7 @@ module Html2rss
     def_delegator :@channel, :url, :link
     def_delegator :@channel, :time_zone
     def_delegator :@channel, :json?
+    def_delegator :@channel, :strategy
     def_delegator :@selectors, :item_selector_names
     def_delegator :@selectors, :selector?

data/lib/html2rss/item.rb CHANGED Viewed

@@ -23,7 +23,9 @@ module Html2rss
     # @param config [Html2rss::Config] Configuration object.
     # @return [Array<Html2rss::Item>] list of items fetched.
     def self.from_url(url, config)
-      body = Utils.request_url(url, headers: config.headers).body
+      ctx = RequestService::Context.new(url:, headers: config.headers)
+      body = RequestService.execute(ctx, strategy: config.strategy).body
       body = ObjectToXmlConverter.new(JSON.parse(body)).call if config.json?
       Nokogiri.HTML(body)
@@ -136,8 +138,11 @@ module Html2rss
       raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
+      type = config.selector_attributes_with_channel(:enclosure)[:content_type] ||
+             Html2rss::Utils.guess_content_type_from_url(url)
       Enclosure.new(
-        type: Html2rss::Utils.guess_content_type_from_url(url),
+        type:,
         bits_length: 0,
         url: url.to_s
       )

data/lib/html2rss/request_service/browserless_strategy.rb ADDED Viewed

@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+require 'puppeteer'
+module Html2rss
+  class RequestService
+    ##
+    # Browserless.io strategy to request websites.
+    #
+    # Provide the WebSocket URL and your API token via environment variables:
+    # - BROWSERLESS_IO_WEBSOCKET_URL
+    # - BROWSERLESS_IO_API_TOKEN
+    #
+    # To use this strategy, you need to have a Browserless.io account or run a
+    # local Browserless.io instance.
+    #
+    # @see https://www.browserless.io/
+    #
+    # To run a local Browserless.io instance, you can use the following Docker command:
+    #
+    # ```sh
+    # docker run \
+    #   --rm \
+    #   -p 3000:3000 \
+    #   -e "CONCURRENT=10" \
+    #   -e "TOKEN=6R0W53R135510" \
+    #   ghcr.io/browserless/chromium
+    # ```
+    #
+    # When running locally, you can skip setting the environment variables, as the
+    # command above matches the default values used by #browser_ws_endpoint.
+    # @see https://github.com/browserless/browserless/pkgs/container/chromium
+    class BrowserlessStrategy < Strategy
+      # Connects to the browser at #browser_ws_endpoint, lets PuppetCommander perform
+      # the request, and always disconnects from the browser afterwards.
+      # `ctx` is not defined in this class; presumably it is provided by Strategy.
+      #
+      # @return [Response]
+      def execute
+        Puppeteer.connect(browser_ws_endpoint:) do |browser|
+          PuppetCommander.new(ctx, browser).call
+        ensure
+          browser.disconnect
+        end
+      end
+      # Builds (and memoizes) the WebSocket endpoint from the environment, falling back
+      # to the defaults that match the Docker command in the class documentation.
+      #
+      # @return [String] "<websocket url>?token=<api token>"
+      def browser_ws_endpoint
+        @browser_ws_endpoint ||= begin
+          api_token = ENV.fetch('BROWSERLESS_IO_API_TOKEN', '6R0W53R135510')
+          ws_url = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', 'ws://127.0.0.1:3000')
+          "#{ws_url}?token=#{api_token}"
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/request_service/context.rb ADDED Viewed

@@ -0,0 +1,46 @@
+# frozen_string_literal: true
+require 'addressable/uri'
+module Html2rss
+  class RequestService
+    ##
+    # Holds information needed to send requests to websites.
+    # To be passed down to the RequestService's strategies.
+    #
+    # The URL is parsed and validated on construction, so an instance always
+    # carries an absolute URL with a supported scheme.
+    class Context
+      # URL schemes accepted by #assert_valid_url!.
+      SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
+      ##
+      # @param url [String, Addressable::URI] the URL to request
+      # @param headers [Hash] HTTP request headers
+      # @raise [InvalidUrl] if the URL is not valid
+      # @raise [UnsupportedUrlScheme] if the URL scheme is not supported
+      def initialize(url:, headers: {})
+        @url = Addressable::URI.parse(url)
+        assert_valid_url!
+        @headers = headers
+      end
+      # @return [Addressable::URI] the parsed URL
+      attr_reader :url
+      # @return [Hash] the HTTP request headers
+      attr_reader :headers
+      private
+      ##
+      # Validates the URL.
+      # Checks run in order: absolute URL, no '@' anywhere in the URL string,
+      # then scheme membership in SUPPORTED_URL_SCHEMES.
+      # InvalidUrl and UnsupportedUrlScheme are not defined in this file;
+      # presumably they live on RequestService -- confirm.
+      # @raise [InvalidUrl] if the URL is not valid
+      # @raise [UnsupportedUrlScheme] if the URL scheme is not supported
+      def assert_valid_url!
+        raise InvalidUrl, 'URL must be absolute' unless url.absolute?
+        raise InvalidUrl, 'URL must not contain an @ character' if url.to_s.include?('@')
+        return if SUPPORTED_URL_SCHEMES.include?(url.scheme)
+        raise UnsupportedUrlScheme,
+              "URL scheme '#{url.scheme}' is not supported"
+      end
+    end
+  end
+end

data/lib/html2rss/request_service/faraday_strategy.rb ADDED Viewed

@@ -0,0 +1,24 @@
+# frozen_string_literal: true
+require 'faraday'
+require 'faraday/follow_redirects'
+module Html2rss
+  class RequestService
+    ##
+    # Strategy to use Faraday for the request.
+    # @see https://rubygems.org/gems/faraday
+    class FaradayStrategy < Strategy
+      # Sends a GET request to ctx.url with ctx.headers, using the FollowRedirects
+      # middleware, and wraps the response body and headers in a Response.
+      # `ctx` is not defined in this class; presumably it is provided by Strategy.
+      #
+      # @return [Response]
+      def execute
+        request = Faraday.new(url: ctx.url, headers: ctx.headers) do |faraday|
+          faraday.use Faraday::FollowRedirects::Middleware
+          faraday.adapter Faraday.default_adapter
+        end
+        response = request.get
+        Response.new(body: response.body, headers: response.headers)
+      end
+    end
+  end
+end

data/lib/html2rss/request_service/puppet_commander.rb ADDED Viewed

@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+module Html2rss
+  class RequestService
+    ##
+    # Commands the Puppeteer Browser to the website and builds the Response.
+    class PuppetCommander
+      # @param ctx [Context]
+      # @param browser [Puppeteer::Browser]
+      # @param skip_request_resources [Set<String>] the resource types not to request
+      # @param referer [String] the referer to use for the request;
+      #   defaults to "<scheme>://<host>" of ctx.url
+      def initialize(ctx,
+                     browser,
+                     skip_request_resources: %w[stylesheet image media font].to_set,
+                     referer: [ctx.url.scheme, ctx.url.host].join('://'))
+        @ctx = ctx
+        @browser = browser
+        @skip_request_resources = skip_request_resources
+        @referer = referer
+      end
+      # Opens a page, navigates to ctx.url and builds the Response from the page
+      # content and the headers of the navigation response.
+      # The page is closed afterwards; `page&.close` covers the case where
+      # #new_page raised before `page` was assigned.
+      #
+      # @return [Response]
+      def call
+        page = new_page
+        response = navigate_to_destination(page, ctx.url)
+        Response.new(body: body(page), headers: response.headers)
+      ensure
+        page&.close
+      end
+      ##
+      # Creates a page carrying ctx.headers as extra HTTP headers.
+      # Unless skip_request_resources is empty, request interception is enabled:
+      # requests whose resource type is in the set are aborted, all others continue.
+      #
+      # @return [Puppeteer::Page]
+      # @see https://yusukeiwaki.github.io/puppeteer-ruby-docs/Puppeteer/Page.html
+      def new_page
+        page = browser.new_page
+        page.extra_http_headers = ctx.headers
+        return page if skip_request_resources.empty?
+        page.request_interception = true
+        page.on('request') do |request|
+          skip_request_resources.member?(request.resource_type) ? request.abort : request.continue
+        end
+        page
+      end
+      # Navigates the page to the URL with `wait_until: 'networkidle0'` and the
+      # configured referer.
+      #
+      # @param page [Puppeteer::Page]
+      # @param url [Addressable::URI] the destination (#call passes ctx.url)
+      # @return [Object] the result of page.goto; #call reads `headers` from it
+      def navigate_to_destination(page, url)
+        page.goto(url, wait_until: 'networkidle0', referer:)
+      end
+      # @param page [Puppeteer::Page]
+      # @return [Object] the value of page.content, used as the Response body
+      def body(page) = page.content
+      private
+      attr_reader :ctx, :browser, :skip_request_resources, :referer
+    end
+  end
+end
+end