RubyGems - html2rss - Versions diffs - 0.15.0 → 0.16.0 - Mend

html2rss 0.15.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

checksums.yaml +4 -4
data/README.md +113 -44
data/html2rss.gemspec +3 -2
data/lib/html2rss/auto_source/article.rb +37 -5
data/lib/html2rss/auto_source/channel.rb +21 -28
data/lib/html2rss/auto_source/cleanup.rb +0 -16
data/lib/html2rss/auto_source/rss_builder.rb +1 -1
data/lib/html2rss/auto_source/scraper/html.rb +18 -9
data/lib/html2rss/auto_source/scraper/schema/item_list.rb +34 -0
data/lib/html2rss/auto_source/scraper/schema/list_item.rb +25 -0
data/lib/html2rss/auto_source/scraper/schema/thing.rb +104 -0
data/lib/html2rss/auto_source/scraper/schema.rb +22 -33
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +39 -39
data/lib/html2rss/auto_source.rb +0 -7
data/lib/html2rss/cli.rb +11 -4
data/lib/html2rss/config/channel.rb +7 -1
data/lib/html2rss/config/selectors.rb +2 -1
data/lib/html2rss/config.rb +1 -0
data/lib/html2rss/item.rb +7 -2
data/lib/html2rss/request_service/browserless_strategy.rb +53 -0
data/lib/html2rss/request_service/context.rb +46 -0
data/lib/html2rss/request_service/faraday_strategy.rb +24 -0
data/lib/html2rss/request_service/puppet_commander.rb +61 -0
data/lib/html2rss/request_service/response.rb +27 -0
data/lib/html2rss/request_service/strategy.rb +28 -0
data/lib/html2rss/request_service.rb +97 -0
data/lib/html2rss/rss_builder/stylesheet.rb +7 -0
data/lib/html2rss/utils.rb +23 -26
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +5 -5
metadata +34 -11
data/lib/html2rss/auto_source/scraper/schema/base.rb +0 -61

data/lib/html2rss/request_service.rb ADDED Viewed

@@ -0,0 +1,97 @@
+# frozen_string_literal: true
+require 'singleton'
+require 'forwardable'
+module Html2rss
+  ##
+  # Requests website URLs to retrieve their HTML for further processing.
+  # Provides strategies, e.g. to integrate Browserless.io.
+  class RequestService
+    include Singleton
+    class UnknownStrategy < Html2rss::Error; end
+    class InvalidUrl < Html2rss::Error; end
+    class UnsupportedUrlScheme < Html2rss::Error; end
+    class << self
+      extend Forwardable
+      %i[default_strategy_name
+         default_strategy_name=
+         strategy_names
+         register_strategy
+         unregister_strategy
+         strategy_registered?
+         execute].each do |method|
+        def_delegator :instance, method
+      end
+    end
+    def initialize
+      @strategies = {
+        faraday: FaradayStrategy,
+        browserless: BrowserlessStrategy
+      }
+      @default_strategy_name = :faraday
+    end
+    # @return [Symbol] the default strategy name
+    attr_reader :default_strategy_name
+    ##
+    # Sets the default strategy.
+    # @param strategy [Symbol] the name of the strategy
+    # @raise [UnknownStrategy] if the strategy is not registered
+    def default_strategy_name=(strategy)
+      raise UnknownStrategy unless strategy_registered?(strategy)
+      @default_strategy_name = strategy.to_sym
+    end
+    # @return [Array<String>] the names of the registered strategies
+    def strategy_names = @strategies.keys.map(&:to_s)
+    ##
+    # Registers a new strategy.
+    # @param name [Symbol] the name of the strategy
+    # @param strategy_class [Class] the class of the strategy
+    def register_strategy(name, strategy_class)
+      raise ArgumentError, 'Strategy class must be a Class' unless strategy_class.is_a?(Class)
+      @strategies[name.to_sym] = strategy_class
+    end
+    ##
+    # Checks if a strategy is registered.
+    # @param name [Symbol] the name of the strategy
+    # @return [Boolean] true if the strategy is registered, false otherwise
+    def strategy_registered?(name)
+      @strategies.key?(name.to_sym)
+    end
+    ##
+    # Unregisters a strategy.
+    # @param name [Symbol] the name of the strategy
+    # @return [Boolean] true if the strategy was unregistered, false otherwise
+    def unregister_strategy(name)
+      raise ArgumentError, 'Cannot unregister the default strategy' if name.to_sym == @default_strategy_name
+      !!@strategies.delete(name.to_sym)
+    end
+    ##
+    # Executes the request.
+    # @param ctx [Context] the context for the request
+    # @param strategy [Symbol] the strategy to use
+    # @return [Response] the response from the strategy
+    # @raise [UnknownStrategy] if the strategy is not known
+    def execute(ctx, strategy: default_strategy_name)
+      strategy_class = @strategies.fetch(strategy) do
+        raise UnknownStrategy,
+              "The strategy '#{strategy}' is not known. Available strategies are: #{strategy_names.join(', ')}"
+      end
+      strategy_class.new(ctx).execute
+    end
+  end
+end

data/lib/html2rss/rss_builder/stylesheet.rb CHANGED Viewed

@@ -47,6 +47,13 @@ module Html2rss
         @media = media
       end
       attr_reader :href, :type, :media
+      # @return [String] the XML representation of the stylesheet
+      def to_xml
+        <<~XML
+          <?xml-stylesheet href="#{href}" type="#{type}" media="#{media}"?>
+        XML
+      end
     end
   end
 end

data/lib/html2rss/utils.rb CHANGED Viewed

@@ -1,8 +1,6 @@
 # frozen_string_literal: true
 require 'addressable/uri'
-require 'faraday'
-require 'faraday/follow_redirects'
 require 'json'
 require 'regexp_parser'
 require 'tzinfo'
@@ -15,11 +13,10 @@ module Html2rss
   module Utils
     ##
     # @param url [String, Addressable::URI]
-    # @param base_url [String]
+    # @param base_url [String, Addressable::URI]
     # @return [Addressable::URI]
     def self.build_absolute_url_from_relative(url, base_url)
-      url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
+      url = Addressable::URI.parse(url)
       return url if url.absolute?
       base_uri = Addressable::URI.parse(base_url)
@@ -59,31 +56,31 @@ module Html2rss
     end
     ##
-    # Builds a titleized representation of the URL.
-    # @param url [String, Addressable::URI]
+    # Builds a titleized representation of the URL with prefixed host.
+    # @param url [Addressable::URI]
     # @return [String]
-    def self.titleized_url(url)
-      uri = Addressable::URI.parse(url)
-      host = uri.host
+    def self.titleized_channel_url(url)
+      nicer_path = CGI.unescapeURIComponent(url.path).split('/').reject(&:empty?)
+      host = url.host
-      nicer_path = uri.path.split('/').reject(&:empty?)
       nicer_path.any? ? "#{host}: #{nicer_path.map(&:capitalize).join(' ')}" : host
     end
     ##
-    # @param url [String, Addressable::URI]
-    # @param headers [Hash] additional HTTP request headers to use for the request
-    # @return [Faraday::Response] body of the HTTP response
-    def self.request_url(url, headers: {})
-      url = Addressable::URI.parse(url.to_s) unless url.is_a?(Addressable::URI)
-      raise ArgumentError, 'URL must be absolute' unless url.absolute?
-      raise ArgumentError, 'URL must not contain an @ characater' if url.to_s.include?('@')
-      Faraday.new(url:, headers:) do |faraday|
-        faraday.use Faraday::FollowRedirects::Middleware
-        faraday.adapter Faraday.default_adapter
-      end.get
+    # Builds a titleized representation of the URL.
+    # @param url [Addressable::URI]
+    # @return [String]
+    def self.titleized_url(url)
+      return '' if url.path.empty?
+      nicer_path = CGI.unescapeURIComponent(url.path)
+                      .split('/')
+                      .flat_map do |part|
+        part.gsub(/[^a-zA-Z0-9\.]/, ' ').gsub(/\s+/, ' ').split
+      end
+      nicer_path.map!(&:capitalize)
+      File.basename nicer_path.join(' '), '.*'
     end
     ##
@@ -104,10 +101,10 @@ module Html2rss
     ##
     # Guesses the content type based on the file extension of the URL.
     #
-    # @param url [String, Addressable::URI]
+    # @param url [Addressable::URI]
     # @return [String] guessed content type, defaults to 'application/octet-stream'
     def self.guess_content_type_from_url(url)
-      url = url.to_s.split('?').first
+      url = url.path.split('?').first
       content_type = MIME::Types.type_for(File.extname(url).delete('.'))
       content_type.first&.to_s || 'application/octet-stream'

data/lib/html2rss/version.rb CHANGED Viewed

@@ -3,6 +3,6 @@
 ##
 # The Html2rss namespace.
 module Html2rss
-  VERSION = '0.15.0'
+  VERSION = '0.16.0'
   public_constant :VERSION
 end

data/lib/html2rss.rb CHANGED Viewed

@@ -99,13 +99,13 @@ module Html2rss
   # No need for a "feed config".
   #
   # @param url [String] the URL to automatically source the feed from
+  # @param strategy [Symbol] the request strategy to use
   # @return [RSS::Rss]
-  def self.auto_source(url)
-    url = Addressable::URI.parse(url)
+  def self.auto_source(url, strategy: :faraday)
+    ctx = RequestService::Context.new(url:, headers: {})
+    response = RequestService.execute(ctx, strategy:)
-    response = Html2rss::Utils.request_url(url)
-    Html2rss::AutoSource.new(url, body: response.body, headers: response.headers).build
+    Html2rss::AutoSource.new(ctx.url, body: response.body, headers: response.headers).build
   end
   private_class_method :find_feed_config

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: html2rss
 version: !ruby/object:Gem::Version
-  version: 0.15.0
+  version: 0.16.0
 platform: ruby
 authors:
 - Gil Desmarais
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2024-10-30 00:00:00.000000000 Z
+date: 2024-12-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: addressable
@@ -120,6 +120,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: puppeteer-ruby
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: regexp_parser
   requirement: !ruby/object:Gem::Requirement
@@ -140,14 +154,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.0'
+        version: '3.0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.0'
+        version: '3.0'
 - !ruby/object:Gem::Dependency
   name: rss
   requirement: !ruby/object:Gem::Requirement
@@ -208,16 +222,16 @@ dependencies:
   name: zeitwerk
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 2.6.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 2.6.0
 description: Supports JSON content, custom HTTP headers, and post-processing of extracted
   content.
 email:
@@ -253,7 +267,9 @@ files:
 - lib/html2rss/auto_source/scraper.rb
 - lib/html2rss/auto_source/scraper/html.rb
 - lib/html2rss/auto_source/scraper/schema.rb
-- lib/html2rss/auto_source/scraper/schema/base.rb
+- lib/html2rss/auto_source/scraper/schema/item_list.rb
+- lib/html2rss/auto_source/scraper/schema/list_item.rb
+- lib/html2rss/auto_source/scraper/schema/thing.rb
 - lib/html2rss/auto_source/scraper/semantic_html.rb
 - lib/html2rss/auto_source/scraper/semantic_html/extractor.rb
 - lib/html2rss/auto_source/scraper/semantic_html/image.rb
@@ -269,6 +285,13 @@ files:
 - lib/html2rss/item_extractors/static.rb
 - lib/html2rss/item_extractors/text.rb
 - lib/html2rss/object_to_xml_converter.rb
+- lib/html2rss/request_service.rb
+- lib/html2rss/request_service/browserless_strategy.rb
+- lib/html2rss/request_service/context.rb
+- lib/html2rss/request_service/faraday_strategy.rb
+- lib/html2rss/request_service/puppet_commander.rb
+- lib/html2rss/request_service/response.rb
+- lib/html2rss/request_service/strategy.rb
 - lib/html2rss/rss_builder.rb
 - lib/html2rss/rss_builder/channel.rb
 - lib/html2rss/rss_builder/item.rb
@@ -280,7 +303,7 @@ licenses:
 - MIT
 metadata:
   allowed_push_host: https://rubygems.org
-  changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.15.0
+  changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.16.0
   rubygems_mfa_required: 'true'
 post_install_message:
 rdoc_options: []
@@ -297,7 +320,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.16
+rubygems_version: 3.5.22
 signing_key:
 specification_version: 4
 summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors

data/lib/html2rss/auto_source/scraper/schema/base.rb DELETED Viewed

@@ -1,61 +0,0 @@
-# frozen_string_literal: true
-require 'date'
-module Html2rss
-  class AutoSource
-    module Scraper
-      class Schema
-        ##
-        # Base class for Schema.org schema_objects.
-        #
-        # @see https://schema.org/Article
-        class Base
-          DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
-          def initialize(schema_object, url:)
-            @schema_object = schema_object
-            @url = url
-          end
-          # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
-          def call
-            DEFAULT_ATTRIBUTES.to_h do |attribute|
-              [attribute, public_send(attribute)]
-            end
-          end
-          def id = schema_object[:@id] || url&.path || title.to_s.downcase.gsub(/\s+/, '-')
-          def title = schema_object[:title]
-          def description
-            [schema_object[:description], schema_object[:schema_object_body], schema_object[:abstract]]
-              .max_by { |desc| desc.to_s.size }
-          end
-          # @return [Addressable::URI, nil] the URL of the schema object
-          def url
-            url = schema_object[:url]
-            if url.to_s.empty?
-              Log.debug("Schema#Base.url: no url in schema_object: #{schema_object.inspect}")
-              return
-            end
-            Utils.build_absolute_url_from_relative(url, @url)
-          end
-          def image = images.first || nil
-          def published_at = schema_object[:datePublished]
-          private
-          attr_reader :schema_object
-          def images
-            Array(schema_object[:image]).compact
-          end
-        end
-      end
-    end
-  end
-end