RubyGems - html2rss - Versions diffs - 0.3.3 → 0.4.0 - Mend

html2rss 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

checksums.yaml +4 -4
data/.rubocop.yml +18 -11
data/.travis.yml +3 -3
data/.yardopts +6 -0
data/Gemfile.lock +23 -5
data/README.md +2 -1
data/docs/Html2rss.html +353 -0
data/docs/Html2rss/AttributePostProcessors.html +203 -0
data/docs/Html2rss/AttributePostProcessors/ParseTime.html +332 -0
data/docs/Html2rss/AttributePostProcessors/ParseUri.html +314 -0
data/docs/Html2rss/AttributePostProcessors/SanitizeHtml.html +346 -0
data/docs/Html2rss/AttributePostProcessors/Substring.html +321 -0
data/docs/Html2rss/AttributePostProcessors/Template.html +336 -0
data/docs/Html2rss/Config.html +795 -0
data/docs/Html2rss/FeedBuilder.html +295 -0
data/docs/Html2rss/Item.html +654 -0
data/docs/Html2rss/ItemExtractors.html +297 -0
data/docs/Html2rss/ItemExtractors/Attribute.html +317 -0
data/docs/Html2rss/ItemExtractors/CurrentTime.html +297 -0
data/docs/Html2rss/ItemExtractors/Href.html +319 -0
data/docs/Html2rss/ItemExtractors/Html.html +314 -0
data/docs/Html2rss/ItemExtractors/Static.html +301 -0
data/docs/Html2rss/ItemExtractors/Text.html +312 -0
data/docs/Html2rss/Utils.html +115 -0
data/docs/Html2rss/Utils/IndifferentAccessHash.html +142 -0
data/docs/_index.html +300 -0
data/docs/class_list.html +51 -0
data/docs/css/common.css +1 -0
data/docs/css/full_list.css +58 -0
data/docs/css/style.css +496 -0
data/docs/file.README.html +135 -0
data/docs/file_list.html +56 -0
data/docs/frames.html +17 -0
data/docs/index.html +135 -0
data/docs/js/app.js +303 -0
data/docs/js/full_list.js +216 -0
data/docs/js/jquery.js +4 -0
data/docs/method_list.html +435 -0
data/docs/top-level-namespace.html +110 -0
data/html2rss.gemspec +3 -0
data/lib/html2rss.rb +19 -4
data/lib/html2rss/attribute_post_processors.rb +5 -3
data/lib/html2rss/attribute_post_processors/parse_time.rb +29 -3
data/lib/html2rss/attribute_post_processors/parse_uri.rb +20 -1
data/lib/html2rss/attribute_post_processors/sanitize_html.rb +65 -3
data/lib/html2rss/attribute_post_processors/substring.rb +24 -3
data/lib/html2rss/attribute_post_processors/template.rb +37 -10
data/lib/html2rss/config.rb +11 -12
data/lib/html2rss/feed_builder.rb +8 -6
data/lib/html2rss/item.rb +28 -19
data/lib/html2rss/item_extractors.rb +29 -0
data/lib/html2rss/item_extractors/attribute.rb +37 -0
data/lib/html2rss/item_extractors/current_time.rb +21 -0
data/lib/html2rss/item_extractors/href.rb +36 -0
data/lib/html2rss/item_extractors/html.rb +34 -0
data/lib/html2rss/item_extractors/static.rb +28 -0
data/lib/html2rss/item_extractors/text.rb +32 -0
data/lib/html2rss/utils.rb +25 -0
data/lib/html2rss/version.rb +1 -1
metadata +88 -4
data/lib/html2rss/item_extractor.rb +0 -37

data/lib/html2rss/config.rb CHANGED

@@ -1,18 +1,9 @@
-require 'hashie'
 module Html2rss
   class Config
-    attr_reader :feed_config, :channel_config, :global_config
-    class IndifferentAccessHash < Hash
-      include Hashie::Extensions::MergeInitializer
-      include Hashie::Extensions::IndifferentAccess
-    end
     def initialize(feed_config, global_config = {})
-      @global_config = IndifferentAccessHash.new global_config
-      @feed_config = IndifferentAccessHash.new feed_config
-      @channel_config = IndifferentAccessHash.new @feed_config.fetch('channel', {})
+      @global_config = Utils::IndifferentAccessHash.new global_config
+      @feed_config = Utils::IndifferentAccessHash.new feed_config
+      @channel_config = Utils::IndifferentAccessHash.new @feed_config.fetch('channel', {})
     end
     def author
@@ -40,6 +31,10 @@ module Html2rss
     end
     alias link url
+    def time_zone
+      channel_config.fetch 'time_zone', 'UTC'
+    end
     def headers
       global_config.fetch('headers', {})
     end
@@ -61,5 +56,9 @@ module Html2rss
       attribute_names.delete('items')
       attribute_names
     end
+    private
+    attr_reader :feed_config, :channel_config, :global_config
   end
 end

data/lib/html2rss/feed_builder.rb CHANGED

@@ -3,12 +3,12 @@ require_relative 'item'
 module Html2rss
   class FeedBuilder
-    attr_reader :config
-    def initialize(feed_config)
-      @config = feed_config
+    def initialize(config)
+      @config = config
     end
+    ##
+    # @return [RSS::Rss]
     def rss
       RSS::Maker.make('2.0') do |maker|
         add_channel_to_maker(maker)
@@ -21,9 +21,11 @@ module Html2rss
     private
+    attr_reader :config
     def add_channel_to_maker(maker)
       %i[language author title description link ttl].each do |attribute_name|
-        maker.channel.send("#{attribute_name}=".to_sym, config.send(attribute_name))
+        maker.channel.public_send("#{attribute_name}=".to_sym, config.public_send(attribute_name))
       end
       maker.channel.generator = "html2rss V. #{::Html2rss::VERSION}"
@@ -39,7 +41,7 @@ module Html2rss
       items.new_item do |rss_item|
         feed_item.available_attributes.each do |attribute_name|
-          rss_item.send("#{attribute_name}=".to_sym, feed_item.send(attribute_name))
+          rss_item.public_send("#{attribute_name}=".to_sym, feed_item.public_send(attribute_name))
         end
         feed_item.categories.each do |category|

data/lib/html2rss/item.rb CHANGED

@@ -2,18 +2,18 @@ require 'faraday'
 require 'faraday_middleware'
 require 'open-uri'
 require 'nokogiri'
-require_relative 'item_extractor'
+require_relative 'item_extractors'
 require_relative 'attribute_post_processors'
 module Html2rss
   class Item
-    attr_reader :xml, :config
     def initialize(xml, config)
       @xml = xml
       @config = config
     end
+    private_class_method :new
     def respond_to_missing?(method_name, _include_private = false)
       config.attribute_names.include?(method_name) || super
     end
@@ -22,9 +22,8 @@ module Html2rss
       attribute_config = config.options(method_name.to_s)
       return super unless attribute_config
-      extractor = attribute_config['extractor'] || 'text'
-      proc = ItemExtractor.const_get extractor.upcase.to_sym
-      value = proc.call(xml, attribute_config)
+      extractor = ItemExtractors.get_extractor(attribute_config['extractor'])
+      value = extractor.new(xml, attribute_config).get
       post_process(value, attribute_config.fetch('post_process', false))
     end
@@ -38,17 +37,19 @@ module Html2rss
       [title.to_s, description.to_s].join('') != ''
     end
+    ##
+    # @return [Array]
     def categories
-      config.categories.map(&method(:method_missing)).uniq.keep_if { |category| category.to_s != '' }
+      categories = config.categories
+      categories.map!(&method(:method_missing))
+      categories.uniq!
+      categories.keep_if { |category| category.to_s != '' }
     end
+    ##
+    # @return [Array]
     def self.from_url(url, config)
-      connection = Faraday.new(url: url, headers: config.headers) { |faraday|
-        faraday.use FaradayMiddleware::FollowRedirects
-        faraday.adapter Faraday.default_adapter
-      }
-      page = Nokogiri::HTML(connection.get.body)
+      page = Nokogiri::HTML(get_body_from_url(url, config.headers))
       page.css(config.selector('items')).map do |xml_item|
         new xml_item, config
       end
@@ -56,14 +57,22 @@ module Html2rss
     private
-    def post_process(value, post_process_options = [])
-      return value unless post_process_options
+    def self.get_body_from_url(url, headers)
+      Faraday.new(url: url, headers: headers) do |faraday|
+        faraday.use FaradayMiddleware::FollowRedirects
+        faraday.adapter Faraday.default_adapter
+      end.get.body
+    end
+    private_class_method :get_body_from_url
-      post_process_options = [post_process_options] unless post_process_options.is_a?(Array)
+    attr_reader :xml, :config
+    def post_process(value, post_process_options)
+      return value unless post_process_options
-      post_process_options.each do |options|
-        value = AttributePostProcessors.get_processor(options)
-                                       .new(value, options, self)
+      [post_process_options].flatten.each do |options|
+        value = AttributePostProcessors.get_processor(options['name'])
+                                       .new(value, options: options, item: self, config: @config)
                                        .get
       end

data/lib/html2rss/item_extractors.rb ADDED

@@ -0,0 +1,29 @@
+require_relative 'item_extractors/attribute'
+require_relative 'item_extractors/current_time'
+require_relative 'item_extractors/href'
+require_relative 'item_extractors/html'
+require_relative 'item_extractors/static'
+require_relative 'item_extractors/text'
+module Html2rss
+  ##
+  # Provides a namespace for item extractors.
+  module ItemExtractors
+    DEFAULT = 'text'.freeze
+    def self.get_extractor(name)
+      name ||= DEFAULT
+      camel_cased_name = name.split('_').map(&:capitalize).join
+      class_name = ['Html2rss', 'ItemExtractors', camel_cased_name].join('::')
+      Object.const_get(class_name)
+    end
+    ##
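+    # @return [Nokogiri::XML::NodeSet, Nokogiri::XML::Element] the nodes matching options['selector'], or +xml+ itself when no selector is given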
+    def self.element(xml, options)
+      selector = options['selector']
+      selector ? xml.css(selector) : xml
+    end
+  end
+end

data/lib/html2rss/item_extractors/attribute.rb ADDED

@@ -0,0 +1,37 @@
+module Html2rss
+  module ItemExtractors
+    ##
+    # Returns the value of the attribute.
+    #
+    # Imagine this +time+ HTML element with a +datetime+ attribute:
+    #
+    #     <time datetime="2019-07-01">...</time>
+    #
+    # YAML usage example:
+    #
+    #    selectors:
+    #      link:
+    #        selector: time
+    #        extractor: attribute
+    #        attribute: datetime
+    #
+    # Would return:
+    #    '2019-07-01'
+    #
+    # In case you're extracting a date or a time, do not forget to parse it
+    # during post processing with
+    # {AttributePostProcessors::ParseTime}[rdoc-ref:Html2rss::AttributePostProcessors::ParseTime].
+    class Attribute
+      def initialize(xml, options)
+        @options = options
+        @element = ItemExtractors.element(xml, options)
+      end
+      ##
+      # @return [String]
+      def get
+        @element.attr(@options['attribute']).to_s
+      end
+    end
+  end
+end

data/lib/html2rss/item_extractors/current_time.rb ADDED

@@ -0,0 +1,21 @@
+module Html2rss
+  module ItemExtractors
+    ##
+    # Returns the current Time.
+    #
+    # YAML usage example:
+    #
+    #    selectors:
+    #      updated:
+    #        extractor: current_time
+    class CurrentTime
+      def initialize(_xml, _options); end
+      ##
+      # @return [Time]
+      def get
+        Time.new
+      end
+    end
+  end
+end

data/lib/html2rss/item_extractors/href.rb ADDED

@@ -0,0 +1,36 @@
+module Html2rss
+  module ItemExtractors
+    ##
+    # Returns the value of the +href+ attribute.
+    # It always returns absolute URLs. If the extracted +href+ value is a
+    # relative URL, it prepends the channel's URL.
+    #
+    # Imagine this +a+ HTML element with a +href+ attribute:
+    #
+    #     <a href="/posts/latest-findings">...</a>
+    #
+    # YAML usage example:
+    #    channel:
+    #      url: http://blog-without-a-feed.example.com
+    #      ...
+    #    selectors:
+    #      link:
+    #        selector: a
+    #        extractor: href
+    #
+    # Would return:
+    #    'http://blog-without-a-feed.example.com/posts/latest-findings'
+    class Href
+      def initialize(xml, options)
+        @options = options
+        element = ItemExtractors.element(xml, options)
+        @href = element.attr('href').to_s
+      end
+      # @return [URI::HTTPS, URI::HTTP]
+      def get
+        Html2rss::Utils.build_absolute_url_from_relative(@href, @options['channel']['url'])
+      end
+    end
+  end
+end

data/lib/html2rss/item_extractors/html.rb ADDED

@@ -0,0 +1,34 @@
+module Html2rss
+  module ItemExtractors
+    ##
+    # Return the HTML of the attribute.
+    #
+    # Imagine this HTML structure:
+    #
+    #     <p>Lorem <b>ipsum</b> dolor ...</p>
+    #
+    # YAML usage example:
+    #
+    #    selectors:
+    #      description:
+    #        selector: p
+    #        extractor: html
+    #
+    # Would return:
+    #    '<p>Lorem <b>ipsum</b> dolor ...</p>'
+    #
+    # Always make sure to sanitize the HTML during post processing with
+    # {AttributePostProcessors::SanitizeHtml}[rdoc-ref:Html2rss::AttributePostProcessors::SanitizeHtml].
+    class Html
+      def initialize(xml, options)
+        @element = ItemExtractors.element(xml, options)
+      end
+      ##
+      # @return [String]
+      def get
+        @element.to_s
+      end
+    end
+  end
+end

data/lib/html2rss/item_extractors/static.rb ADDED

@@ -0,0 +1,28 @@
+module Html2rss
+  module ItemExtractors
+    ##
+    # YAML usage example:
+    #
+    #    selectors:
+    #      author:
+    #        extractor: static
+    #        static: Foobar
+    #
+    # Would return:
+    #    'Foobar'
+    class Static
+      def initialize(_xml, options)
+        @options = options
+      end
+      # Returns what options['static'] holds.
+      #
+      #    options = { 'static' => 'Foobar' }
+      #    Static.new(xml, options).get
+      #    # => 'Foobar'
+      def get
+        @options['static']
+      end
+    end
+  end
+end

data/lib/html2rss/item_extractors/text.rb ADDED

@@ -0,0 +1,32 @@
+module Html2rss
+  module ItemExtractors
+    ##
+    # Return the text of the attribute. This is the default extractor used,
+    # when no extractor is explicitly given.
+    #
+    # Imagine this HTML structure:
+    #
+    #     <p>Lorem <b>ipsum</b> dolor ...</p>
+    #
+    # YAML usage example:
+    #
+    #    selectors:
+    #      description:
+    #        selector: p
+    #        extractor: text
+    #
+    # Would return:
+    #    'Lorem ipsum dolor ...'
+    class Text
+      def initialize(xml, options)
+        @element = ItemExtractors.element(xml, options)
+      end
+      ##
+      # @return [String]
+      def get
+        @element.text.to_s.strip.split.join(' ')
+      end
+    end
+  end
+end

data/lib/html2rss/utils.rb ADDED

@@ -0,0 +1,25 @@
+require 'hashie'
+module Html2rss
+  module Utils
+    ##
+    # A Hash with indifferent access, built with {https://github.com/intridea/hashie Hashie}.
+    class IndifferentAccessHash < Hash
+      include Hashie::Extensions::MergeInitializer
+      include Hashie::Extensions::IndifferentAccess
+    end
+    def self.build_absolute_url_from_relative(url, channel_url)
+      url = URI(url) if url.is_a?(String)
+      return url if url.absolute?
+      path, query = url.to_s.split('?')
+      URI(channel_url).tap do |uri|
+        uri.path = path.to_s.start_with?('/') ? path : "/#{path}"
+        uri.query = query
+      end
+    end
+  end
+end

data/lib/html2rss/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Html2rss
-  VERSION = '0.3.3'.freeze
+  VERSION = '0.4.0'.freeze
 end

metadata CHANGED

@@ -1,15 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: html2rss
 version: !ruby/object:Gem::Version
-  version: 0.3.3
+  version: 0.4.0
 platform: ruby
 authors:
 - Gil Desmarais
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-07-01 00:00:00.000000000 Z
+date: 2019-09-07 00:00:00.000000000 Z
 dependencies:
+- !ruby/object:Gem::Dependency
+  name: activesupport
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.0'
 - !ruby/object:Gem::Dependency
   name: faraday
   requirement: !ruby/object:Gem::Requirement
@@ -156,6 +170,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: rubocop-rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: simplecov
   requirement: !ruby/object:Gem::Requirement
@@ -184,6 +212,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: yard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Give the URL to scrape and some CSS selectors. Get a RSS::Rss instance
   in return.
 email:
@@ -197,6 +239,7 @@ files:
 - ".rspec"
 - ".rubocop.yml"
 - ".travis.yml"
+- ".yardopts"
 - CHANGELOG.md
 - Gemfile
 - Gemfile.lock
@@ -204,6 +247,39 @@ files:
 - README.md
 - bin/console
 - bin/setup
+- docs/Html2rss.html
+- docs/Html2rss/AttributePostProcessors.html
+- docs/Html2rss/AttributePostProcessors/ParseTime.html
+- docs/Html2rss/AttributePostProcessors/ParseUri.html
+- docs/Html2rss/AttributePostProcessors/SanitizeHtml.html
+- docs/Html2rss/AttributePostProcessors/Substring.html
+- docs/Html2rss/AttributePostProcessors/Template.html
+- docs/Html2rss/Config.html
+- docs/Html2rss/FeedBuilder.html
+- docs/Html2rss/Item.html
+- docs/Html2rss/ItemExtractors.html
+- docs/Html2rss/ItemExtractors/Attribute.html
+- docs/Html2rss/ItemExtractors/CurrentTime.html
+- docs/Html2rss/ItemExtractors/Href.html
+- docs/Html2rss/ItemExtractors/Html.html
+- docs/Html2rss/ItemExtractors/Static.html
+- docs/Html2rss/ItemExtractors/Text.html
+- docs/Html2rss/Utils.html
+- docs/Html2rss/Utils/IndifferentAccessHash.html
+- docs/_index.html
+- docs/class_list.html
+- docs/css/common.css
+- docs/css/full_list.css
+- docs/css/style.css
+- docs/file.README.html
+- docs/file_list.html
+- docs/frames.html
+- docs/index.html
+- docs/js/app.js
+- docs/js/full_list.js
+- docs/js/jquery.js
+- docs/method_list.html
+- docs/top-level-namespace.html
 - html2rss.gemspec
 - lib/html2rss.rb
 - lib/html2rss/attribute_post_processors.rb
@@ -215,7 +291,14 @@ files:
 - lib/html2rss/config.rb
 - lib/html2rss/feed_builder.rb
 - lib/html2rss/item.rb
-- lib/html2rss/item_extractor.rb
+- lib/html2rss/item_extractors.rb
+- lib/html2rss/item_extractors/attribute.rb
+- lib/html2rss/item_extractors/current_time.rb
+- lib/html2rss/item_extractors/href.rb
+- lib/html2rss/item_extractors/html.rb
+- lib/html2rss/item_extractors/static.rb
+- lib/html2rss/item_extractors/text.rb
+- lib/html2rss/utils.rb
 - lib/html2rss/version.rb
 - support/logo.png
 homepage: https://github.com/gildesmarais/html2rss
@@ -238,7 +321,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.4
+rubyforge_project:
+rubygems_version: 2.7.7
 signing_key:
 specification_version: 4
 summary: Returns an RSS::Rss object by scraping a URL.