RubyGems - html2rss - Versions diffs - 0.16.0 → 0.18.0 - Mend

html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

checksums.yaml +4 -4
data/README.md +48 -657
data/exe/html2rss +1 -1
data/html2rss.gemspec +7 -4
data/lib/html2rss/articles/deduplicator.rb +49 -0
data/lib/html2rss/auto_source/cleanup.rb +33 -5
data/lib/html2rss/auto_source/scraper/html.rb +118 -43
data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
data/lib/html2rss/auto_source/scraper.rb +142 -8
data/lib/html2rss/auto_source.rb +119 -47
data/lib/html2rss/blocked_surface.rb +64 -0
data/lib/html2rss/category_extractor.rb +82 -0
data/lib/html2rss/cli.rb +170 -23
data/lib/html2rss/config/class_methods.rb +189 -0
data/lib/html2rss/config/dynamic_params.rb +68 -0
data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
data/lib/html2rss/config/request_headers.rb +130 -0
data/lib/html2rss/config/schema.rb +208 -0
data/lib/html2rss/config/validator.rb +108 -0
data/lib/html2rss/config.rb +112 -61
data/lib/html2rss/error.rb +6 -0
data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
data/lib/html2rss/html_extractor.rb +136 -0
data/lib/html2rss/html_navigator.rb +46 -0
data/lib/html2rss/json_feed_builder/item.rb +94 -0
data/lib/html2rss/json_feed_builder.rb +58 -0
data/lib/html2rss/rendering/audio_renderer.rb +31 -0
data/lib/html2rss/rendering/description_builder.rb +88 -0
data/lib/html2rss/rendering/image_renderer.rb +31 -0
data/lib/html2rss/rendering/media_renderer.rb +33 -0
data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
data/lib/html2rss/rendering/video_renderer.rb +31 -0
data/lib/html2rss/rendering.rb +14 -0
data/lib/html2rss/request_controls.rb +128 -0
data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
data/lib/html2rss/request_service/budget.rb +39 -0
data/lib/html2rss/request_service/context.rb +64 -20
data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
data/lib/html2rss/request_service/policy.rb +248 -0
data/lib/html2rss/request_service/puppet_commander.rb +212 -13
data/lib/html2rss/request_service/response.rb +42 -2
data/lib/html2rss/request_service/response_guard.rb +62 -0
data/lib/html2rss/request_service.rb +31 -15
data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
data/lib/html2rss/request_session/runtime_input.rb +57 -0
data/lib/html2rss/request_session/runtime_policy.rb +76 -0
data/lib/html2rss/request_session.rb +118 -0
data/lib/html2rss/rss_builder/article.rb +166 -0
data/lib/html2rss/rss_builder/channel.rb +96 -11
data/lib/html2rss/rss_builder/enclosure.rb +48 -0
data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
data/lib/html2rss/rss_builder.rb +72 -71
data/lib/html2rss/selectors/config.rb +122 -0
data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
data/lib/html2rss/selectors/extractors/href.rb +53 -0
data/lib/html2rss/selectors/extractors/html.rb +48 -0
data/lib/html2rss/selectors/extractors/static.rb +41 -0
data/lib/html2rss/selectors/extractors/text.rb +46 -0
data/lib/html2rss/selectors/extractors.rb +52 -0
data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
data/lib/html2rss/selectors/post_processors/base.rb +74 -0
data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
data/lib/html2rss/selectors/post_processors/template.rb +73 -0
data/lib/html2rss/selectors/post_processors.rb +43 -0
data/lib/html2rss/selectors.rb +294 -0
data/lib/html2rss/url.rb +262 -0
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +129 -70
data/lib/tasks/config_schema.rake +17 -0
data/schema/html2rss-config.schema.json +469 -0
metadata +120 -46
data/lib/html2rss/attribute_post_processors/base.rb +0 -74
data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
data/lib/html2rss/attribute_post_processors/template.rb +0 -101
data/lib/html2rss/attribute_post_processors.rb +0 -44
data/lib/html2rss/auto_source/article.rb +0 -127
data/lib/html2rss/auto_source/channel.rb +0 -78
data/lib/html2rss/auto_source/reducer.rb +0 -48
data/lib/html2rss/auto_source/rss_builder.rb +0 -70
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
data/lib/html2rss/config/channel.rb +0 -125
data/lib/html2rss/config/selectors.rb +0 -103
data/lib/html2rss/item.rb +0 -186
data/lib/html2rss/item_extractors/attribute.rb +0 -50
data/lib/html2rss/item_extractors/href.rb +0 -52
data/lib/html2rss/item_extractors/html.rb +0 -46
data/lib/html2rss/item_extractors/static.rb +0 -39
data/lib/html2rss/item_extractors/text.rb +0 -44
data/lib/html2rss/item_extractors.rb +0 -88
data/lib/html2rss/object_to_xml_converter.rb +0 -56
data/lib/html2rss/rss_builder/item.rb +0 -83
data/lib/html2rss/utils.rb +0 -113

data/lib/html2rss/rss_builder/enclosure.rb ADDED Viewed

@@ -0,0 +1,48 @@
+# frozen_string_literal: true
+require 'mime/types'
+module Html2rss
+  class RssBuilder
+    ##
+    # Represents an enclosure for an RSS item.
+    class Enclosure
+      ##
+      # Guesses the content type based on the file extension of the URL.
+      #
+      # @param url [Html2rss::Url]
+      # @param default [String] default content type
+      # @return [String] guessed content type, or default
+      def self.guess_content_type_from_url(url, default: 'application/octet-stream')
+        return default unless url
+        url = url.path.split('?').first
+        content_type = MIME::Types.type_for(File.extname(url).delete('.'))
+        content_type.first&.to_s || 'application/octet-stream'
+      end
+      def self.add(enclosure, maker)
+        return unless enclosure
+        maker.enclosure.tap do |enclosure_maker|
+          enclosure_maker.url = enclosure.url.to_s
+          enclosure_maker.type = enclosure.type
+          enclosure_maker.length = enclosure.bits_length
+        end
+      end
+      def initialize(url:, type: nil, bits_length: 0)
+        raise ArgumentError, 'An Enclosure requires an absolute URL' if !url || !url.absolute?
+        @url = url
+        @type = type
+        @bits_length = bits_length
+      end
+      def type = @type || self.class.guess_content_type_from_url(url)
+      attr_reader :bits_length, :url
+    end
+  end
+end

data/lib/html2rss/rss_builder/stylesheet.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 module Html2rss
-  module RssBuilder
+  class RssBuilder
     ##
     # Represents a stylesheet.
     class Stylesheet
@@ -10,7 +10,7 @@ module Html2rss
         # Adds the stylesheet XML tags to the RSS.
         #
         # @param maker [RSS::Maker::RSS20] RSS maker object.
-        # @param stylesheets [Array<Html2rss::Config::Stylesheet>] Array of stylesheet configurations.
+        # @param stylesheets [Array<Html2rss::RssBuilder::Stylesheet>] Array of stylesheet configurations.
         # @return [nil]
         def add(maker, stylesheets)
           stylesheets.each do |stylesheet|
@@ -24,7 +24,7 @@ module Html2rss
         # Adds a single Stylesheet to the RSS.
         #
         # @param maker [RSS::Maker::RSS20] RSS maker object.
-        # @param stylesheet [Html2rss::Config::Stylesheet] Stylesheet configuration.
+        # @param stylesheet [Html2rss::RssBuilder::Stylesheet] Stylesheet configuration.
         # @return [nil]
         def add_stylesheet(maker, stylesheet)
           maker.xml_stylesheets.new_xml_stylesheet do |xss|
@@ -35,7 +35,7 @@ module Html2rss
         end
       end
-      TYPES = ['text/css', 'text/xsl'].freeze
+      TYPES = ['text/css', 'text/xsl'].to_set.freeze
       def initialize(href:, type:, media: 'all')
         raise ArgumentError, 'stylesheet.href must be a String' unless href.is_a?(String)

data/lib/html2rss/rss_builder.rb CHANGED Viewed

@@ -4,93 +4,94 @@ require 'rss'
 module Html2rss
   ##
-  # Builds the RSS 2.0 feed, which consists of the '<channel>' and the '<item>'s
-  # tags in the RSS.
-  module RssBuilder
-    # Possible tags inside a RSS 2.0 <channel> tag.
-    CHANNEL_TAGS = %i[language author title description link ttl].freeze
-    # Possible tags inside a RSS 2.0 <item> tag.
-    ITEM_TAGS = %i[title link description author comments updated].freeze
+  # Builds an RSS Feed by providing channel, articles and stylesheets.
+  class RssBuilder
+    class << self
+      ##
+      # Populates a single RSS item from an article: string values first,
+      # then categories, the enclosure and finally the guid.
+      #
+      # @param article [Html2rss::RssBuilder::Article]
+      # @param item_maker [Object] the item yielded by +maker.items.new_item+
+      def add_item(article, item_maker)
+        add_item_string_values(article, item_maker)
+        add_item_categories(article, item_maker)
+        Enclosure.add(article.enclosure, item_maker)
+        add_item_guid(article, item_maker)
+      end
-    ##
-    # Builds an RSS 2.0 feed based on the provided configuration.
-    #
-    # @param config [Html2rss::Config] Configuration object containing feed details.
-    # @return [RSS::Rss] RSS feed object.
-    def self.build(config)
-      RSS::Maker.make('2.0') do |maker|
-        add_stylesheets(maker, config.stylesheets)
-        add_channel(maker, config)
-        add_items(maker, config)
+      private
+      # Copies title, description and author to the item, then link and pubDate.
+      #
+      # @param article [Html2rss::RssBuilder::Article]
+      # @param item_maker [Object] the RSS item being built
+      def add_item_string_values(article, item_maker)
+        %i[title description author].each do |attr|
+          # Skip values that are nil/false or empty, leaving the item attribute unset.
+          next unless (value = article.send(attr))
+          next if value.empty?
+          item_maker.send(:"#{attr}=", value)
+        end
+        item_maker.link = article.url.to_s if article.url
+        # Assigns nil when the article has no published_at.
+        item_maker.pubDate = article.published_at&.rfc2822
       end
-    end
-    ##
-    # Adds stylesheets to the RSS maker.
-    #
-    # @param maker [RSS::Maker] RSS maker instance.
-    # @param stylesheets [Array<String>] Array of stylesheets to add.
-    def self.add_stylesheets(maker, stylesheets)
-      Stylesheet.add(maker, stylesheets)
-    end
+      # Adds one category entry to the item per article category.
+      #
+      # @param article [Html2rss::RssBuilder::Article]
+      # @param item_maker [Object] the RSS item being built
+      def add_item_categories(article, item_maker)
+        article.categories.each { |category| item_maker.categories.new_category.content = category }
+      end
-    ##
-    # Adds channel information to the RSS maker.
-    #
-    # @param maker [RSS::Maker] RSS maker instance.
-    # @param config [Html2rss::Config] Configuration object containing feed details.
-    def self.add_channel(maker, config)
-      channel = maker.channel
-      CHANNEL_TAGS.each do |tag|
-        Channel.add(channel, config, [tag])
+      # Sets the item's guid from the article and flags it as not being a permalink.
+      #
+      # @param article [Html2rss::RssBuilder::Article]
+      # @param item_maker [Object] the RSS item being built
+      def add_item_guid(article, item_maker)
+        item_maker.guid.tap do |guid|
+          guid.content = article.guid
+          guid.isPermaLink = false
+        end
       end
     end
     ##
-    # Adds items to the RSS maker based on configuration.
-    #
-    # @param maker [RSS::Maker] RSS maker instance.
-    # @param config [Html2rss::Config] Configuration object containing feed details.
-    def self.add_items(maker, config)
-      item_attributes = extract_item_attributes(config)
-      items = fetch_items(config)
-      items.reverse! if config.items_order == :reverse
-      items.each do |item|
-        add_item(maker, item, item_attributes)
+    # Stores the parts of the feed; nothing is rendered until #call.
+    #
+    # @param channel [Html2rss::RssBuilder::Channel] The channel information for the RSS feed.
+    # @param articles [Array<Html2rss::RssBuilder::Article>] The list of articles to include in the RSS feed.
+    # @param stylesheets [Array<Hash>] An optional array of stylesheet configurations.
+    #   Each hash is splatted into Stylesheet.new as keyword arguments.
+    def initialize(channel:, articles:, stylesheets: [])
+      @channel = channel
+      @articles = articles
+      @stylesheets = stylesheets
+    end
+    def call
+      RSS::Maker.make('2.0') do |maker|
+        Stylesheet.add(maker, stylesheets)
+        make_channel(maker.channel)
+        make_items(maker)
       end
     end
-    ##
-    # Adds a single item to the RSS maker.
-    #
-    # @param maker [RSS::Maker] RSS maker instance.
-    # @param item [Html2rss::Item] Item to add.
-    # @param item_attributes [Array<Symbol>] Array of item attributes.
-    # @return [nil]
-    def self.add_item(maker, item, item_attributes)
-      new_item = maker.items.new_item
-      Item.add(new_item, item, item_attributes)
+    private
+    attr_reader :channel, :articles
+    def stylesheets
+      @stylesheets.map { |style| Stylesheet.new(**style) }
     end
-    ##
-    # Extracts item attributes from configuration.
-    #
-    # @param config [Html2rss::Config] Configuration object containing feed details.
-    # @return [Array<Symbol>] Array of item attributes.
-    def self.extract_item_attributes(config)
-      config.item_selector_names & ITEM_TAGS
+    # Copies the channel attributes onto the maker's channel.
+    #
+    # @param maker [Object] the channel maker (+maker.channel+ is passed in by #call)
+    def make_channel(maker)
+      # These four are copied verbatim through same-named readers and writers.
+      %i[language title description ttl].each do |key|
+        maker.public_send(:"#{key}=", channel.public_send(key))
+      end
+      maker.link = channel.url.to_s
+      maker.generator = generator
+      maker.updated = channel.last_build_date
     end
-    ##
-    # Fetches items from the URL specified in configuration.
-    #
-    # @param config [Html2rss::Config] Configuration object containing feed details.
-    # @return [Array<Html2rss::Item>] Array of items.
-    def self.fetch_items(config)
-      Html2rss::Item.from_url(config.url, config)
+    # Creates one RSS item per article, in the order of +articles+.
+    #
+    # @param maker [Object] the RSS maker yielded by RSS::Maker.make
+    def make_items(maker)
+      articles.each do |article|
+        maker.items.new_item { |item_maker| self.class.add_item(article, item_maker) }
+      end
     end
-    private_class_method :extract_item_attributes, :fetch_items, :add_item
+    def generator
+      scraper_namespace_regex = /(?<namespace>Html2rss|Scraper)::/
+      scraper_counts = articles.flat_map(&:scraper).tally.map do |klass, count|
+        scraper_name = klass.to_s.gsub(scraper_namespace_regex, '')
+        "#{scraper_name} (#{count})"
+      end
+      "html2rss V. #{Html2rss::VERSION} (scrapers: #{scraper_counts.join(', ')})"
+    end
   end
 end

data/lib/html2rss/selectors/config.rb ADDED Viewed

@@ -0,0 +1,122 @@
+# frozen_string_literal: true
+require 'dry-validation'
+module Html2rss
+  class Selectors
+    ##
+    # Validates the configuration hash for :selectors.
+    class Config < Dry::Validation::Contract
+      NESTING_KEY = :dynamic_keys_workaround
+      ##
+      # Validates the configuration of the :items selector
+      class Items < Dry::Validation::Contract
+        params do
+          # The only mandatory key: a non-empty selector string.
+          required(:selector).filled(:string)
+          # 'reverse' is the only accepted value.
+          optional(:order).filled(included_in?: %w[reverse])
+          optional(:enhance).filled(:bool?)
+          # When given, pagination must state a positive max_pages.
+          optional(:pagination).hash do
+            required(:max_pages).filled(:integer, gt?: 0)
+          end
+        end
+      end
+      ##
+      # Validates the configuration of a single selector.
+      #
+      # All keys are optional in the schema; requirements that depend on other
+      # keys (e.g. the chosen extractor) are enforced by the rules below.
+      class Selector < Dry::Validation::Contract
+        params do
+          # No type is declared here; rule(:selector) below checks it.
+          optional(:selector)
+          optional(:extractor).filled(:string)
+          optional(:attribute).filled(:string)
+          optional(:static).filled(:string)
+          optional(:post_process).array(:hash)
+        end
+        # Rejects any truthy value that is not a String.
+        rule(:selector) do
+          key(:selector).failure('`selector` must be a string') if value && !value.is_a?(String)
+        end
+        rule(:extractor) do
+          # dependent on the extractor, validate required fields, (i.e. static, attribute)
+          case value
+          when 'attribute'
+            key(:attribute).failure('`attribute` must be a string') unless values[:attribute].is_a?(String)
+          when 'static'
+            key(:static).failure('`static` must be a string') unless values[:static].is_a?(String)
+          end
+        end
+        # Every post_process entry needs a known `name`; gsub, substring and
+        # template additionally require their own arguments.
+        rule(:post_process).each do
+          case (name = value[:name])
+          when 'gsub'
+            key(:pattern).failure('`pattern` must be a string') unless value[:pattern].is_a?(String)
+            key(:replacement).failure('`replacement` must be a string') unless value[:replacement].is_a?(String)
+          when 'substring'
+            key(:start).failure('`start` must be an integer') unless value[:start].is_a?(Integer)
+            key(:end).failure('`end` must be an integer or omitted') if !value[:end].nil? && !value[:end].is_a?(Integer)
+          when 'template'
+            key(:string).failure('`string` must be a string') unless value[:string].is_a?(String)
+          when 'html_to_markdown', 'markdown_to_html', 'parse_time', 'parse_uri', 'sanitize_html'
+            # nothing to validate
+          when nil
+            key(:post_process).failure('Missing post_processor `name`')
+          else
+            key(:post_process).failure("Unknown post_processor `name`: #{name}")
+          end
+        end
+      end
+      ##
+      # Validates the configuration of the :enclosure Selector
+      class Enclosure < Selector
+        params do
+          optional(:content_type).filled(:string, format?: %r{^[\w-]+/[\w-]+$})
+        end
+      end
+      # The whole selectors hash is expected under NESTING_KEY (see .call).
+      params do
+        required(NESTING_KEY).hash
+      end
+      # Dispatches each entry to the contract matching its key and copies the
+      # resulting errors onto that key.
+      rule(NESTING_KEY) do
+        value.each_pair do |selector_key, selector|
+          case selector_key.to_sym
+          when Selectors::ITEMS_SELECTOR_KEY
+            Items.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
+          when :enclosure
+            Enclosure.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
+          when :guid, :categories
+            # These two hold names of other selectors rather than a selector definition.
+            unless selector.is_a?(Array)
+              key(selector_key).failure("`#{selector_key}` must be an array")
+              next
+            end
+            key(selector_key).failure("`#{selector_key}` must contain at least one element") if selector.empty?
+            selector.each do |name|
+              # Each referenced name must exist as a (symbol) key of the selectors hash.
+              next if values[NESTING_KEY].key?(name.to_sym)
+              key(selector_key).failure("`#{selector_key}` references unspecified `#{name}`")
+            end
+          else
+            # From here on, the selector is found under its "dynamic" selector_key
+            Selector.new.call(selector).errors.each { |error| key(selector_key).failure(error) }
+          end
+        end
+      end
+      ##
+      # Shortcut to validate the config.
+      # @param config [Hash] the configuration hash to validate
+      # @return [Dry::Validation::Result] the result of the validation
+      def self.call(config)
+        # dry-validation/schema does not support "Dynamic Keys" yet: https://github.com/dry-rb/dry-schema/issues/37
+        # But :selectors contains mostly "dynamic" keys, as the user defines them to extract article attributes.
+        # --> Validate the dynamic keys manually.
+        # To be able to specify a `rule`, nest the config under NESTING_KEY and mark that as `required`.
+        new.call(NESTING_KEY => config)
+      end
+    end
+  end
+end

data/lib/html2rss/selectors/extractors/attribute.rb ADDED Viewed

@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+module Html2rss
+  class Selectors
+    module Extractors
+      ##
+      # Returns the value of the attribute.
+      #
+      # Imagine this +time+ HTML tag with a +datetime+ attribute:
+      #
+      #     <time datetime="2019-07-01">...</time>
+      #
+      # YAML usage example:
+      #
+      #    selectors:
+      #      link:
+      #        selector: time
+      #        extractor: attribute
+      #        attribute: datetime
+      #
+      # Would return:
+      #    '2019-07-01'
+      #
+      # In case you're extracting a date or a time, consider parsing it
+      # during post processing with {PostProcessors::ParseTime}.
+      class Attribute
+        # The available options for the attribute extractor.
+        Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
+        ##
+        # Initializes the Attribute extractor.
+        #
+        # @param xml [Nokogiri::XML::Element]
+        # @param options [Options]
+        def initialize(xml, options)
+          @options = options
+          @element = Extractors.element(xml, options.selector)
+        end
+        ##
+        # Retrieves and returns the attribute's value as a string.
+        #
+        # @return [String] The value of the attribute.
+        def get
+          @element.attr(@options.attribute).to_s
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/selectors/extractors/href.rb ADDED Viewed

@@ -0,0 +1,53 @@
+# frozen_string_literal: true
+module Html2rss
+  class Selectors
+    module Extractors
+      ##
+      # Returns the value of the +href+ attribute.
+      # It always returns absolute URLs. If the extracted +href+ value is a
+      # relative URL, it prepends the channel's URL.
+      #
+      # Imagine this +a+ HTML element with a +href+ attribute:
+      #
+      #     <a href="/posts/latest-findings">...</a>
+      #
+      # YAML usage example:
+      #    channel:
+      #      url: http://blog-without-a-feed.example.com
+      #      ...
+      #    selectors:
+      #      link:
+      #        selector: a
+      #        extractor: href
+      #
+      # Would return:
+      #    'http://blog-without-a-feed.example.com/posts/latest-findings'
+      class Href
+        # The available options for the href (attribute) extractor.
+        Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
+        ##
+        # Initializes the Href extractor.
+        #
+        # @param xml [Nokogiri::XML::Element]
+        # @param options [Options]
+        def initialize(xml, options)
+          @options = options
+          @element = Extractors.element(xml, options.selector)
+          @href = @element.attr('href').to_s
+        end
+        ##
+        # Retrieves and returns the normalized absolute URL.
+        #
+        # @return [String] The absolute URL.
+        def get
+          return nil unless @href
+          Url.from_relative(@href, @options.channel[:url])
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/selectors/extractors/html.rb ADDED Viewed

@@ -0,0 +1,48 @@
+# frozen_string_literal: true
+module Html2rss
+  class Selectors
+    module Extractors
+      ##
+      # Returns the HTML content of the specified element.
+      #
+      # Example HTML structure:
+      #
+      #     <p>Lorem <b>ipsum</b> dolor ...</p>
+      #
+      # YAML usage example:
+      #
+      #    selectors:
+      #      description:
+      #        selector: p
+      #        extractor: html
+      #
+      # Would return:
+      #    '<p>Lorem <b>ipsum</b> dolor ...</p>'
+      #
+      # Always ensure to sanitize the HTML during post-processing with
+      # {PostProcessors::SanitizeHtml}.
+      class Html
+        # The available options for the html extractor.
+        Options = Struct.new('HtmlOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
+        ##
+        # Initializes the Html extractor.
+        #
+        # @param xml [Nokogiri::XML::Element]
+        # @param options [Options]
+        def initialize(xml, options)
+          @element = Extractors.element(xml, options.selector)
+        end
+        ##
+        # Retrieves and returns the HTML content of the element.
+        #
+        # @return [String] The HTML content.
+        def get
+          @element.to_s
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/selectors/extractors/static.rb ADDED Viewed

@@ -0,0 +1,41 @@
+# frozen_string_literal: true
+module Html2rss
+  class Selectors
+    module Extractors
+      ##
+      # Returns a static value provided in the options.
+      #
+      # Example usage in YAML:
+      #
+      #    selectors:
+      #      author:
+      #        extractor: static
+      #        static: Foobar
+      #
+      # Would return:
+      #    'Foobar'
+      class Static
+        # The available option for the static extractor.
+        Options = Struct.new('StaticOptions', :static, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
+        ##
+        # Initializes the Static extractor.
+        #
+        # @param _xml [nil, Nokogiri::XML::Element] Unused parameter for compatibility with other extractors.
+        # @param options [Options] Options containing the static value.
+        def initialize(_xml, options)
+          @options = options
+        end
+        ##
+        # Retrieves and returns the static value.
+        #
+        # @return [String, Symbol] The static value provided in options.
+        def get
+          @options.static
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/selectors/extractors/text.rb ADDED Viewed

@@ -0,0 +1,46 @@
+# frozen_string_literal: true
+module Html2rss
+  class Selectors
+    module Extractors
+      ##
+      # Return the text content of the attribute. This is the default extractor used,
+      # when no extractor is explicitly given.
+      #
+      # Example HTML structure:
+      #
+      #     <p>Lorem <b>ipsum</b> dolor ...</p>
+      #
+      # YAML usage example:
+      #
+      #    selectors:
+      #      description:
+      #        selector: p
+      #        extractor: text
+      #
+      # Would return:
+      #    'Lorem ipsum dolor ...'
+      class Text
+        # The available options for the text extractor.
+        Options = Struct.new('TextOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
+        ##
+        # Initializes the Text extractor.
+        #
+        # @param xml [Nokogiri::XML::Element]
+        # @param options [Options]
+        def initialize(xml, options)
+          @element = Extractors.element(xml, options.selector)
+        end
+        ##
+        # Retrieves and returns the text content of the element.
+        #
+        # @return [String] The text content.
+        def get
+          @element.text.to_s.strip.gsub(/\s+/, ' ')
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/selectors/extractors.rb ADDED Viewed

@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+module Html2rss
+  class Selectors
+    ##
+    # Provides a namespace for item extractors.
+    module Extractors
+      ##
+      # Maps the extractor name to the class implementing the extractor.
+      #
+      # The key is the name to use in the feed config.
+      NAME_TO_CLASS = {
+        attribute: Attribute,
+        href: Href,
+        html: Html,
+        static: Static,
+        text: Text
+      }.freeze
+      ##
+      # Maps the extractor class to its corresponding options class.
+      ITEM_OPTION_CLASSES = Hash.new do |hash, klass|
+        hash[klass] = klass.const_get(:Options)
+      end
+      DEFAULT_EXTRACTOR = :text
+      class << self
+        ##
+        # Retrieves an element from Nokogiri XML based on the selector.
+        #
+        # @param xml [Nokogiri::XML::Document]
+        # @param selector [String, nil]
+        # @return [Nokogiri::XML::ElementSet] selected XML elements
+        def element(xml, selector)
+          selector ? xml.css(selector) : xml
+        end
+        # @param attribute_options [Hash<Symbol, Object>]
+        #   Should contain at least `:extractor` (the name) and required options for that extractor.
+        # @param xml [Nokogiri::XML::Document]
+        # @return [Object] instance of the specified item extractor class
+        def get(attribute_options, xml)
+          extractor_class = NAME_TO_CLASS[attribute_options[:extractor]&.to_sym || DEFAULT_EXTRACTOR]
+          options = ITEM_OPTION_CLASSES[extractor_class].new(attribute_options.slice(*extractor_class::Options.members))
+          extractor_class.new(xml, options).get
+        end
+      end
+    end
+  end
+end