RubyGems - html2rss - Versions diffs - 0.16.0 → 0.18.0 - Mend

html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

checksums.yaml +4 -4
data/README.md +48 -657
data/exe/html2rss +1 -1
data/html2rss.gemspec +7 -4
data/lib/html2rss/articles/deduplicator.rb +49 -0
data/lib/html2rss/auto_source/cleanup.rb +33 -5
data/lib/html2rss/auto_source/scraper/html.rb +118 -43
data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
data/lib/html2rss/auto_source/scraper.rb +142 -8
data/lib/html2rss/auto_source.rb +119 -47
data/lib/html2rss/blocked_surface.rb +64 -0
data/lib/html2rss/category_extractor.rb +82 -0
data/lib/html2rss/cli.rb +170 -23
data/lib/html2rss/config/class_methods.rb +189 -0
data/lib/html2rss/config/dynamic_params.rb +68 -0
data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
data/lib/html2rss/config/request_headers.rb +130 -0
data/lib/html2rss/config/schema.rb +208 -0
data/lib/html2rss/config/validator.rb +108 -0
data/lib/html2rss/config.rb +112 -61
data/lib/html2rss/error.rb +6 -0
data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
data/lib/html2rss/html_extractor.rb +136 -0
data/lib/html2rss/html_navigator.rb +46 -0
data/lib/html2rss/json_feed_builder/item.rb +94 -0
data/lib/html2rss/json_feed_builder.rb +58 -0
data/lib/html2rss/rendering/audio_renderer.rb +31 -0
data/lib/html2rss/rendering/description_builder.rb +88 -0
data/lib/html2rss/rendering/image_renderer.rb +31 -0
data/lib/html2rss/rendering/media_renderer.rb +33 -0
data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
data/lib/html2rss/rendering/video_renderer.rb +31 -0
data/lib/html2rss/rendering.rb +14 -0
data/lib/html2rss/request_controls.rb +128 -0
data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
data/lib/html2rss/request_service/budget.rb +39 -0
data/lib/html2rss/request_service/context.rb +64 -20
data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
data/lib/html2rss/request_service/policy.rb +248 -0
data/lib/html2rss/request_service/puppet_commander.rb +212 -13
data/lib/html2rss/request_service/response.rb +42 -2
data/lib/html2rss/request_service/response_guard.rb +62 -0
data/lib/html2rss/request_service.rb +31 -15
data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
data/lib/html2rss/request_session/runtime_input.rb +57 -0
data/lib/html2rss/request_session/runtime_policy.rb +76 -0
data/lib/html2rss/request_session.rb +118 -0
data/lib/html2rss/rss_builder/article.rb +166 -0
data/lib/html2rss/rss_builder/channel.rb +96 -11
data/lib/html2rss/rss_builder/enclosure.rb +48 -0
data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
data/lib/html2rss/rss_builder.rb +72 -71
data/lib/html2rss/selectors/config.rb +122 -0
data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
data/lib/html2rss/selectors/extractors/href.rb +53 -0
data/lib/html2rss/selectors/extractors/html.rb +48 -0
data/lib/html2rss/selectors/extractors/static.rb +41 -0
data/lib/html2rss/selectors/extractors/text.rb +46 -0
data/lib/html2rss/selectors/extractors.rb +52 -0
data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
data/lib/html2rss/selectors/post_processors/base.rb +74 -0
data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
data/lib/html2rss/selectors/post_processors/template.rb +73 -0
data/lib/html2rss/selectors/post_processors.rb +43 -0
data/lib/html2rss/selectors.rb +294 -0
data/lib/html2rss/url.rb +262 -0
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +129 -70
data/lib/tasks/config_schema.rake +17 -0
data/schema/html2rss-config.schema.json +469 -0
metadata +120 -46
data/lib/html2rss/attribute_post_processors/base.rb +0 -74
data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
data/lib/html2rss/attribute_post_processors/template.rb +0 -101
data/lib/html2rss/attribute_post_processors.rb +0 -44
data/lib/html2rss/auto_source/article.rb +0 -127
data/lib/html2rss/auto_source/channel.rb +0 -78
data/lib/html2rss/auto_source/reducer.rb +0 -48
data/lib/html2rss/auto_source/rss_builder.rb +0 -70
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
data/lib/html2rss/config/channel.rb +0 -125
data/lib/html2rss/config/selectors.rb +0 -103
data/lib/html2rss/item.rb +0 -186
data/lib/html2rss/item_extractors/attribute.rb +0 -50
data/lib/html2rss/item_extractors/href.rb +0 -52
data/lib/html2rss/item_extractors/html.rb +0 -46
data/lib/html2rss/item_extractors/static.rb +0 -39
data/lib/html2rss/item_extractors/text.rb +0 -44
data/lib/html2rss/item_extractors.rb +0 -88
data/lib/html2rss/object_to_xml_converter.rb +0 -56
data/lib/html2rss/rss_builder/item.rb +0 -83
data/lib/html2rss/utils.rb +0 -113

data/lib/html2rss/selectors/post_processors/substring.rb ADDED Viewed

@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+module Html2rss
+  class Selectors
+    module PostProcessors
+      ##
+      # Returns a defined part of a String.
+      #
+      # Both parameters must be an Integer and they can be negative.
+      # The +end+ parameter can be omitted, in that case it will not cut the
+      # String at the end.
+      #
+      # A Regexp or a MatchString is not supported.
+      #
+      # See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
+      # documentation for more information.
+      #
+      # Imagine this HTML:
+      #    <h1>Foo bar and baz<h1>
+      #
+      # YAML usage example:
+      #    selectors:
+      #      title:
+      #        selector: h1
+      #        post_process:
+      #          name: substring
+      #          start: 4
+      #          end: 6
+      #
+      # Would return:
+      #    'bar'
+      class Substring < Base
+        def self.validate_args!(value, context)
+          assert_type value, String, :value, context:
+          options = context[:options]
+          assert_type options[:start], Integer, :start, context:
+          end_index = options[:end]
+          assert_type(end_index, Integer, :end, context:) if end_index
+        end
+        ##
+        # Extracts the substring from the original string based on the provided start and end indices.
+        #
+        # @return [String] The extracted substring.
+        def get
+          value[range]
+        end
+        ##
+        # Determines the range for the substring extraction based on the provided start and end indices.
+        #
+        # @return [Range] The range object representing the start and end/Infinity (integers).
+        def range
+          return (start_index..) unless end_index?
+          if start_index == end_index
+            raise ArgumentError,
+                  'The `start` value must be unequal to the `end` value.'
+          end
+          (start_index..end_index)
+        end
+        private
+        def end_index?  = !context[:options][:end].to_s.empty?
+        def end_index   = context[:options][:end].to_i
+        def start_index = context[:options][:start].to_i
+      end
+    end
+  end
+end

data/lib/html2rss/selectors/post_processors/template.rb ADDED Viewed

@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+module Html2rss
+  class Selectors
+    module PostProcessors
+      ##
+      # Returns a formatted String according to the string pattern.
+      # It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format)
+      #
+      # It supports the format pattern `%<key>s` and `%{key}`, where `key` is the key of the selector.
+      # If `%{self}` is used, the selectors extracted value will be used.
+      #
+      # Imagine this HTML:
+      #
+      #    <li>
+      #      <h1>Product</h1>
+      #      <span class="price">23,42€</span>
+      #    </li>
+      #
+      #
+      # YAML usage example:
+      #
+      #    selectors:
+      #      items:
+      #        selector: 'li'
+      #      price:
+      #        selector: '.price'
+      #      title:
+      #        selector: h1
+      #        post_process:
+      #          name: template
+      #          string: '%{self} (%{price})'
+      #
+      # Would return:
+      #    'Product (23,42€)'
+      class Template < Base
+        def self.validate_args!(value, context)
+          assert_type value, String, :value, context:
+          string = context[:options]&.dig(:string).to_s
+          raise InvalidType, 'The `string` template is absent.' if string.empty?
+        end
+        ##
+        # @param value [String]
+        # @param context [Selectors::Context]
+        def initialize(value, context)
+          super
+          @options = context[:options] || {}
+          @scraper = context[:scraper]
+          @item = context[:item]
+          @string = @options[:string].to_s
+        end
+        ##
+        # @return [String]
+        def get
+          Html2rss::Config::DynamicParams.call(@string, {}, getter: method(:item_value), replace_missing_with: '')
+        end
+        private
+        # @param key [String, Symbol]
+        # @return [String]
+        def item_value(key)
+          key = key.to_sym
+          key == :self ? value : @scraper.select(key, @item)
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/selectors/post_processors.rb ADDED Viewed

@@ -0,0 +1,43 @@
+# frozen_string_literal: true
+module Html2rss
+  class Selectors
+    ##
+    # Provides a namespace for attribute post processors.
+    module PostProcessors
+      ##
+      # Error raised when an unknown post processor name is requested.
+      class UnknownPostProcessorName < Html2rss::Error; end
+      ##
+      # Error raised when a required option is missing.
+      class MissingOption < Html2rss::Error; end
+      ##
+      # Error raised when an invalid type is provided.
+      class InvalidType < Html2rss::Error; end
+      ##
+      # Maps the post processor name to the class implementing the post processor.
+      #
+      # The key is the name to use in the feed config.
+      NAME_TO_CLASS = {
+        gsub: Gsub,
+        html_to_markdown: HtmlToMarkdown,
+        markdown_to_html: MarkdownToHtml,
+        parse_time: ParseTime,
+        parse_uri: ParseUri,
+        sanitize_html: SanitizeHtml,
+        substring: Substring,
+        template: Template
+      }.freeze
+      ##
+      # Shorthand method to instantiate the post processor and call `#get` on it
+      def self.get(name, value, context)
+        klass = NAME_TO_CLASS[name.to_sym] || raise(UnknownPostProcessorName, "Unknown name '#{name}'")
+        klass.new(value, context).get
+      end
+    end
+  end
+end

data/lib/html2rss/selectors.rb ADDED Viewed

@@ -0,0 +1,294 @@
+# frozen_string_literal: true
+require 'nokogiri'
+module Html2rss
+  ##
+  # This scraper is designed to scrape articles from a given HTML page using CSS
+  # selectors defined in the feed config.
+  #
+  # It supports the traditional feed configs that html2rss originally provided,
+  # ensuring compatibility with existing setups.
+  #
+  # Additionally, it uniquely offers the capability to convert JSON into XML,
+  # extending its versatility for diverse data processing workflows.
+  class Selectors # rubocop:disable Metrics/ClassLength
+    class InvalidSelectorName < Html2rss::Error; end
+    include Enumerable
+    # A context instance passed to item extractors and post-processors.
+    Context = Struct.new('Context', :options, :item, :config, :scraper, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
+    DEFAULT_CONFIG = { items: { enhance: true } }.freeze
+    ITEMS_SELECTOR_KEY = :items
+    ITEM_TAGS = %i[title url description author comments published_at guid enclosure categories].freeze
+    SPECIAL_ATTRIBUTES = Set[:guid, :enclosure, :categories].freeze
+    # Mapping of new attribute names to their legacy names for backward compatibility.
+    RENAMED_ATTRIBUTES = { published_at: %i[updated pubDate] }.freeze
+    ##
+    # Initializes a new Selectors instance.
+    #
+    # @param response [RequestService::Response] The response object.
+    # @param selectors [Hash] A hash of CSS selectors.
+    # @param time_zone [String] Time zone string used for date parsing.
+    def initialize(response, selectors:, time_zone:)
+      @response = response
+      @url = response.url
+      @selectors = selectors
+      @time_zone = time_zone
+      prepare_selectors!
+      @rss_item_attributes = @selectors.keys & Html2rss::RssBuilder::Article::PROVIDED_KEYS
+    end
+    ##
+    # Returns articles extracted from the response.
+    # Reverses order if config specifies reverse ordering.
+    #
+    # @return [Array<Html2rss::RssBuilder::Article>]
+    def articles
+      @articles ||= @selectors.dig(ITEMS_SELECTOR_KEY, :order) == 'reverse' ? to_a.tap(&:reverse!) : to_a
+    end
+    ##
+    # Iterates over each scraped article.
+    #
+    # @yield [article] Gives each article as an Html2rss::RssBuilder::Article.
+    # @return [Enumerator] An enumerator if no block is given.
+    def each(&)
+      return enum_for(:each) unless block_given?
+      enhance = enhance?
+      parsed_body.css(items_selector).each do |item|
+        article_hash = extract_article(item, response)
+        enhance_article_hash(article_hash, item, response.url) if enhance
+        yield Html2rss::RssBuilder::Article.new(**article_hash, scraper: self.class)
+      end
+    end
+    ##
+    # Returns the CSS selector for the items.
+    # @return [String] the CSS selector for the items
+    def items_selector = @selectors.dig(ITEMS_SELECTOR_KEY, :selector)
+    ## @return [Boolean] whether to enhance the article hash with auto_source's semantic HTML extraction.
+    def enhance? = !!@selectors.dig(ITEMS_SELECTOR_KEY, :enhance)
+    ##
+    # Extracts an article hash for a given item element.
+    #
+    # @param item [Nokogiri::XML::Element] The element to extract from.
+    # @return [Hash] Hash of attributes for the article.
+    def extract_article(item, page_response = response)
+      @rss_item_attributes.to_h { |key| [key, select(key, item, base_url: page_response.url)] }.compact
+    end
+    ##
+    # Enhances the article hash using semantic HTML extraction.
+    # Only adds keys that are missing from the original hash.
+    #
+    # @param article_hash [Hash] The original article hash.
+    # @param article_tag [Nokogiri::XML::Element] HTML element to extract additional info from.
+    # @return [Hash] The enhanced article hash.
+    def enhance_article_hash(article_hash, article_tag, base_url = @url)
+      selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
+      return article_hash unless selected_anchor
+      extracted = HtmlExtractor.new(article_tag, base_url:, selected_anchor:).call
+      return article_hash unless extracted
+      extracted.each_with_object(article_hash) do |(key, value), hash|
+        next if value.nil? || (hash.key?(key) && hash[key])
+        hash[key] = value
+      end
+    end
+    ##
+    # Selects the value for a given attribute from an HTML element.
+    #
+    # @param name [Symbol, String] Name of the attribute.
+    # @param item [Nokogiri::XML::Element] The HTML element to process.
+    # @return [Object, Array<Object>] The selected value(s).
+    # @raise [InvalidSelectorName] If the attribute name is invalid or not defined.
+    def select(name, item, base_url: @url)
+      name = name.to_sym
+      raise InvalidSelectorName, "Attribute selector '#{name}' is reserved for items." if name == ITEMS_SELECTOR_KEY
+      selector_key, config = selector_config_for(name)
+      if SPECIAL_ATTRIBUTES.member?(selector_key)
+        select_special(selector_key, item:, config:, base_url:)
+      else
+        select_regular(selector_key, item:, config:, base_url:)
+      end
+    end
+    private
+    attr_reader :response
+    def prepare_selectors!
+      validate_url_and_link_exclusivity!
+      fix_url_and_link!
+      handle_renamed_attributes!
+    end
+    def validate_url_and_link_exclusivity!
+      return unless @selectors.key?(:url) && @selectors.key?(:link)
+      raise InvalidSelectorName, 'You must either use "url" or "link" your selectors. Using both is not supported.'
+    end
+    def fix_url_and_link!
+      return if @selectors[:url] || !@selectors.key?(:link)
+      @selectors = @selectors.dup
+      @selectors[:url] = @selectors[:link]
+    end
+    def handle_renamed_attributes!
+      RENAMED_ATTRIBUTES.each_pair do |new_name, old_names|
+        old_names.each do |old_name|
+          next unless @selectors.key?(old_name)
+          Html2rss::Log.warn("Selector '#{old_name}' is deprecated. Please rename to '#{new_name}'.")
+          @selectors[new_name] ||= @selectors.delete(old_name)
+        end
+      end
+    end
+    def parsed_body
+      parsed_body_for(response)
+    end
+    def parsed_body_for(page_response)
+      @parsed_bodies ||= {}
+      @parsed_bodies[page_response.url] ||= if page_response.json_response?
+                                              fragment = ObjectToXmlConverter.new(page_response.parsed_body).call
+                                              Nokogiri::HTML5.fragment(fragment)
+                                            else
+                                              page_response.parsed_body
+                                            end
+    end
+    def select_special(name, item:, config:, base_url:)
+      case name
+      when :enclosure
+        enclosure(item:, config:, base_url:)
+      when :guid
+        Array(config).map { |selector_name| select(selector_name, item, base_url:) }
+      when :categories
+        select_categories(category_selectors: config, item:, base_url:)
+      end
+    end
+    def select_regular(_name, item:, config:, base_url:)
+      value = Extractors.get(config.merge(channel: channel_context(base_url)), item)
+      if value && (post_process_steps = config[:post_process])
+        steps = post_process_steps.is_a?(Array) ? post_process_steps : [post_process_steps]
+        value = post_process(item, value, steps, base_url:)
+      end
+      value
+    end
+    def post_process(item, value, post_process_steps, base_url:)
+      post_process_steps.each do |options|
+        context = Context.new(config: { channel: { url: base_url, time_zone: @time_zone } },
+                              item:, scraper: self, options:)
+        value = PostProcessors.get(options[:name], value, context)
+      end
+      value
+    end
+    def select_categories(category_selectors:, item:, base_url:)
+      Array(category_selectors).flat_map do |selector_name|
+        extract_category_values(selector_name, item:, base_url:)
+      end
+    end
+    def extract_category_values(selector_name, item:, base_url:)
+      selector_key, config = selector_config_for(selector_name, allow_nil: true)
+      return [] unless config
+      nodes = extract_nodes(item:, config:)
+      unless node_set_with_multiple_elements?(nodes)
+        return Array(select_regular(selector_key, item:, config:, base_url:))
+      end
+      Array(nodes).flat_map { |node| extract_categories_from_node(node, item:, config:, base_url:) }
+    end
+    def extract_categories_from_node(node, item:, config:, base_url:)
+      values = Extractors.get(category_node_options(config, base_url:), node)
+      values = apply_post_process_steps(item:, value: values, post_process_steps: config[:post_process], base_url:)
+      Array(values).filter_map { |category| extract_category_text(category) }
+    end
+    def extract_category_text(category)
+      text = case category
+             when Nokogiri::XML::Node, Nokogiri::XML::NodeSet
+               HtmlExtractor.extract_visible_text(category)
+             else
+               category&.to_s
+             end
+      stripped = text&.strip
+      stripped unless stripped.nil? || stripped.empty?
+    end
+    def node_set_with_multiple_elements?(nodes)
+      nodes.is_a?(Nokogiri::XML::NodeSet) && nodes.length > 1
+    end
+    def category_node_options(selector_config, base_url:)
+      selector_config.merge(channel: channel_context(base_url), selector: nil)
+    end
+    def apply_post_process_steps(item:, value:, post_process_steps:, base_url:)
+      return value unless value && post_process_steps
+      steps = post_process_steps.is_a?(Array) ? post_process_steps : [post_process_steps]
+      post_process(item, value, steps, base_url:)
+    end
+    def selector_config_for(name, allow_nil: false)
+      selector_key = name.to_sym
+      return [selector_key, @selectors[selector_key]] if @selectors.key?(selector_key)
+      return [selector_key, nil] if allow_nil
+      raise InvalidSelectorName, "Selector for '#{selector_key}' is not defined."
+    end
+    def extract_nodes(item:, config:)
+      return unless config.respond_to?(:[]) && config[:selector]
+      Extractors.element(item, config[:selector])
+    end
+    def channel_context(base_url)
+      { url: base_url, time_zone: @time_zone }
+    end
+    # @return [Hash] enclosure details.
+    def enclosure(item:, config:, base_url:)
+      url = Url.from_relative(select_regular(:enclosure, item:, config:, base_url:), base_url)
+      { url:, type: config[:content_type] }
+    end
+  end
+end