html2rss 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. checksums.yaml +4 -4
  2. data/README.md +48 -656
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +115 -38
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
data/exe/html2rss CHANGED
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # frozen_string_literal: true
3
3
 
4
- require 'html2rss/cli'
4
+ require 'html2rss'
5
5
 
6
6
  Html2rss::CLI.start(ARGV)
data/html2rss.gemspec CHANGED
@@ -26,15 +26,18 @@ Gem::Specification.new do |spec|
26
26
  end
27
27
 
28
28
  spec.files = `git ls-files -z`.split("\x0").select do |f|
29
- f.match(%r{^(lib/|exe/|README.md|LICENSE|html2rss.gemspec)})
29
+ f.match(%r{^(lib/|exe/|schema/|README.md|LICENSE|html2rss.gemspec)})
30
30
  end
31
31
  spec.bindir = 'exe'
32
32
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
33
33
  spec.require_paths = ['lib']
34
34
 
35
35
  spec.add_dependency 'addressable', '~> 2.7'
36
+ spec.add_dependency 'brotli'
37
+ spec.add_dependency 'dry-validation'
36
38
  spec.add_dependency 'faraday', '> 2.0.1', '< 3.0'
37
39
  spec.add_dependency 'faraday-follow_redirects'
40
+ spec.add_dependency 'faraday-gzip', '~> 3'
38
41
  spec.add_dependency 'kramdown'
39
42
  spec.add_dependency 'mime-types', '> 3.0'
40
43
  spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
@@ -43,7 +46,7 @@ Gem::Specification.new do |spec|
43
46
  spec.add_dependency 'regexp_parser'
44
47
  spec.add_dependency 'reverse_markdown', '~> 3.0'
45
48
  spec.add_dependency 'rss'
46
- spec.add_dependency 'sanitize', '~> 6.0'
49
+ spec.add_dependency 'sanitize'
47
50
  spec.add_dependency 'thor'
48
51
  spec.add_dependency 'tzinfo'
49
52
  spec.add_dependency 'zeitwerk'
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'set' # rubocop:disable Lint/RedundantRequireStatement
4
+
5
+ module Html2rss
6
+ module Articles
7
+ ##
8
+ # Deduplicates a list of articles while preserving their original order.
9
+ #
10
+ # The deduplicator prefers each article's URL (combined with its ID when
11
+ # available) to determine uniqueness. When no URL is present, it falls
12
+ # back to the article ID, then to the GUID enriched with title and
13
+ # description metadata. If none of these identifiers are available it
14
+ # defaults to the article object's hash to preserve the original entry.
15
+ class Deduplicator
16
+ ##
17
+ # @param articles [Array<Html2rss::RssBuilder::Article>]
18
+ # @raise [ArgumentError] if articles are not provided
19
+ def initialize(articles)
20
+ raise ArgumentError, 'articles must be provided' unless articles
21
+
22
+ @articles = articles
23
+ end
24
+
25
+ ##
26
+ # Returns the list of unique articles, preserving the order of the
27
+ # original collection and keeping the first occurrence of a duplicate.
28
+ # @return [Array<Html2rss::RssBuilder::Article>]
29
+ def call
30
+ seen = Set.new
31
+
32
+ articles.filter do |article|
33
+ fingerprint = deduplication_fingerprint_for(article) || article.hash
34
+ seen.add?(fingerprint)
35
+ end
36
+ end
37
+
38
+ private
39
+
40
+ attr_reader :articles
41
+
42
+ def deduplication_fingerprint_for(article)
43
+ return unless article.respond_to?(:deduplication_fingerprint)
44
+
45
+ article.deduplication_fingerprint
46
+ end
47
+ end
48
+ end
49
+ end
@@ -7,8 +7,15 @@ module Html2rss
7
7
  # :reek:MissingSafeMethod { enabled: false }
8
8
  # It applies various strategies to filter and refine the article list.
9
9
  class Cleanup
10
+ DEFAULT_CONFIG = {
11
+ keep_different_domain: false,
12
+ min_words_title: 3
13
+ }.freeze
14
+
15
+ VALID_SCHEMES = %w[http https].to_set.freeze
16
+
10
17
  class << self
11
- def call(articles, url:, keep_different_domain: false)
18
+ def call(articles, url:, keep_different_domain:, min_words_title:)
12
19
  Log.debug "Cleanup: start with #{articles.size} articles"
13
20
 
14
21
  articles.select!(&:valid?)
@@ -17,13 +24,12 @@ module Html2rss
17
24
 
18
25
  keep_only_http_urls!(articles)
19
26
  reject_different_domain!(articles, url) unless keep_different_domain
27
+ keep_only_with_min_words_title!(articles, min_words_title:)
20
28
 
21
29
  Log.debug "Cleanup: end with #{articles.size} articles"
22
30
  articles
23
31
  end
24
32
 
25
- private
26
-
27
33
  ##
28
34
  # Deduplicates articles by a given key.
29
35
  #
@@ -42,18 +48,40 @@ module Html2rss
42
48
  #
43
49
  # @param articles [Array<Article>] The list of articles to process.
44
50
  def keep_only_http_urls!(articles)
45
- articles.select! { |article| %w[http https].include?(article.url&.scheme) }
51
+ articles.select! { |article| VALID_SCHEMES.include?(article.url&.scheme) }
46
52
  end
47
53
 
48
54
  ##
49
55
  # Rejects articles that have a URL not on the same domain as the source.
50
56
  #
51
57
  # @param articles [Array<Article>] The list of articles to process.
52
- # @param base_url [Addressable::URI] The source URL to compare against.
58
+ # @param base_url [Html2rss::Url] The source URL to compare against.
53
59
  def reject_different_domain!(articles, base_url)
54
60
  base_host = base_url.host
55
61
  articles.select! { |article| article.url&.host == base_host }
56
62
  end
63
+
64
+ ##
65
+ # Keeps only articles with a title that is present and has at least `min_words_title` words.
66
+ #
67
+ # @param articles [Array<Article>] The list of articles to process.
68
+ # @param min_words_title [Integer] The minimum number of words in the title.
69
+ def keep_only_with_min_words_title!(articles, min_words_title:)
70
+ articles.select! do |article|
71
+ article.title ? word_count_at_least?(article.title, min_words_title) : true
72
+ end
73
+ end
74
+
75
+ private
76
+
77
+ def word_count_at_least?(str, min_words)
78
+ count = 0
79
+ str.to_s.scan(/\p{Alnum}+/) do
80
+ count += 1
81
+ return true if count >= min_words
82
+ end
83
+ false
84
+ end
57
85
  end
58
86
  end
59
87
  end
@@ -6,34 +6,58 @@ module Html2rss
6
6
  class AutoSource
7
7
  module Scraper
8
8
  ##
9
- # Scrapes articles from HTML pages by
10
- # finding similar structures around anchor tags in the parsed_body.
9
+ # Scrapes article-like blocks from plain HTML by looking for repeated link
10
+ # structures when richer structured data is unavailable.
11
+ #
12
+ # The approach is intentionally heuristic:
13
+ # 1. collect repeated anchor paths
14
+ # 2. walk upward to a shared container shape
15
+ # 3. extract the best anchor found inside each container
16
+ #
17
+ # This scraper is broader and noisier than `SemanticHtml`, so it acts as a
18
+ # fallback for pages without stronger semantic signals.
11
19
  class Html
12
20
  include Enumerable
13
21
 
14
- TAGS_TO_IGNORE = /(nav|footer|header)/i
22
+ TAGS_TO_IGNORE = /(nav|footer|header|svg|script|style)/i
15
23
 
16
- def self.articles?(parsed_body)
17
- new(parsed_body, url: '').any?
18
- end
24
+ DEFAULT_MINIMUM_SELECTOR_FREQUENCY = 2
25
+ DEFAULT_USE_TOP_SELECTORS = 5
19
26
 
20
- def self.parent_until_condition(node, condition)
21
- return nil if !node || node.document? || node.parent.name == 'html'
22
- return node if condition.call(node)
27
+ ##
28
+ # @return [Symbol] config key used to enable or configure this scraper
29
+ def self.options_key = :html
23
30
 
24
- parent_until_condition(node.parent, condition)
31
+ ##
32
+ # Probes whether the document appears to contain repeated anchor
33
+ # structures that this fallback scraper can cluster into article-like
34
+ # containers.
35
+ #
36
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
37
+ # @return [Boolean] true when the scraper can likely extract articles
38
+ def self.articles?(parsed_body)
39
+ new(parsed_body, url: '').any?
25
40
  end
26
41
 
27
42
  ##
28
43
  # Simplify an XPath selector by removing the index notation.
44
+ # This keeps repeated anchor paths comparable across sibling blocks.
45
+ #
46
+ # @param xpath [String] original XPath
47
+ # @return [String] XPath without positional indexes
29
48
  def self.simplify_xpath(xpath)
30
49
  xpath.gsub(/\[\d+\]/, '')
31
50
  end
32
51
 
33
- def initialize(parsed_body, url:)
52
+ # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
53
+ # @param url [String] The base URL.
54
+ # @param extractor [Class] The extractor class to handle article extraction.
55
+ # @param opts [Hash] Additional options.
56
+ def initialize(parsed_body, url:, extractor: HtmlExtractor, **opts)
34
57
  @parsed_body = parsed_body
35
58
  @url = url
36
- @selectors = Hash.new(0)
59
+ @extractor = extractor
60
+ @opts = opts
37
61
  end
38
62
 
39
63
  attr_reader :parsed_body
@@ -44,51 +68,102 @@ module Html2rss
44
68
  def each
45
69
  return enum_for(:each) unless block_given?
46
70
 
47
- return if frequent_selectors.empty?
71
+ each_article_tag do |article_tag|
72
+ article_hash = extract_article(article_tag)
73
+ yield article_hash if article_hash
74
+ end
75
+ end
48
76
 
49
- frequent_selectors.each do |selector|
50
- parsed_body.xpath(selector).each do |selected_tag|
51
- article_tag = self.class.parent_until_condition(selected_tag, method(:article_condition))
77
+ ##
78
+ # Decides whether a traversed node has reached a useful article-like
79
+ # boundary for the generic HTML scraper.
80
+ #
81
+ # The predicate prefers containers that add surrounding link context,
82
+ # which helps the scraper move from a leaf anchor toward a repeated
83
+ # teaser/card wrapper.
84
+ #
85
+ # @param node [Nokogiri::XML::Node] candidate boundary node
86
+ # @return [Boolean] true when the node is a good extraction boundary
87
+ def article_tag_condition?(node)
88
+ # Ignore tags that are below a tag which is in TAGS_TO_IGNORE.
89
+ return false if node.path.match?(TAGS_TO_IGNORE)
90
+ return true if %w[body html].include?(node.name)
91
+ return false unless (parent = node.parent)
52
92
 
53
- if article_tag && (article_hash = SemanticHtml::Extractor.new(article_tag, url: @url).call)
54
- yield article_hash
55
- end
56
- end
93
+ anchor_count(parent) > anchor_count(node)
94
+ end
95
+
96
+ private
97
+
98
+ ##
99
+ # Find relevant anchors in root.
100
+ # @return [Set<String>] The set of XPath selectors
101
+ def selectors
102
+ @selectors ||= Hash.new(0).tap do |selectors|
103
+ each_relevant_anchor { |node| increment_selector_count(selectors, node) }
57
104
  end
58
105
  end
59
106
 
60
107
  ##
61
- # Find all the anchors in root.
62
- # @param root [Nokogiri::XML::Node] The root node to search for anchors
63
- # @return [Set<String>] The set of XPath selectors which exist at least min_frequency times
64
- def frequent_selectors(root = @parsed_body.at_css('body'), min_frequency: 2)
65
- @frequent_selectors ||= begin
66
- root.traverse do |node|
67
- next if !node.element? || node.name != 'a'
68
-
69
- @selectors[self.class.simplify_xpath(node.path)] += 1
70
- end
108
+ # Filter the frequent selectors by the minimum_selector_frequency and use_top_selectors.
109
+ # @return [Array<String>] The filtered selectors
110
+ def filtered_selectors
111
+ selectors.select { |_selector, count| count >= minimum_selector_frequency }
112
+ .max_by(use_top_selectors, &:last)
113
+ .map(&:first)
114
+ end
115
+
116
+ def minimum_selector_frequency = @opts[:minimum_selector_frequency] || DEFAULT_MINIMUM_SELECTOR_FREQUENCY
117
+ def use_top_selectors = @opts[:use_top_selectors] || DEFAULT_USE_TOP_SELECTORS
118
+
119
+ def anchor_count(node)
120
+ @anchor_counts ||= {}
121
+ @anchor_counts[node.path] ||= node.name == 'a' ? 1 : node.css('a').size
122
+ end
123
+
124
+ def each_relevant_anchor
125
+ return enum_for(:each_relevant_anchor) unless block_given?
71
126
 
72
- @selectors.keys
73
- .select { |selector| (@selectors[selector]).to_i >= min_frequency }
74
- .to_set
127
+ traversal_root&.traverse do |node|
128
+ yield node if relevant_anchor?(node)
75
129
  end
76
130
  end
77
131
 
78
- def article_condition(node)
79
- # Ignore tags that are below a tag which is in TAGS_TO_IGNORE.
80
- return false if node.path.match?(TAGS_TO_IGNORE)
132
+ def relevant_anchor?(node)
133
+ node.element? && node.name == 'a' && !String(node['href']).empty?
134
+ end
81
135
 
82
- # Ignore tags that are below a tag which has a class which matches TAGS_TO_IGNORE.
83
- return false if self.class.parent_until_condition(node, proc do |current_node|
84
- current_node.classes.any? { |klass| klass.match?(TAGS_TO_IGNORE) }
85
- end)
136
+ def increment_selector_count(selectors, node)
137
+ path = self.class.simplify_xpath(node.path)
138
+ selectors[path] += 1 unless path.match?(TAGS_TO_IGNORE)
139
+ end
86
140
 
87
- return true if %w[body html].include?(node.name)
141
+ def traversal_root
142
+ parsed_body.at_css('body, html') || parsed_body.root
143
+ end
144
+
145
+ def each_article_tag
146
+ return enum_for(:each_article_tag) unless block_given?
147
+
148
+ filtered_selectors.each do |selector|
149
+ parsed_body.xpath(selector).each do |selected_tag|
150
+ article_tag = article_tag_for(selected_tag)
151
+ yield article_tag if article_tag
152
+ end
153
+ end
154
+ end
155
+
156
+ def article_tag_for(selected_tag)
157
+ return if selected_tag.path.match?(Html::TAGS_TO_IGNORE)
158
+
159
+ HtmlNavigator.parent_until_condition(selected_tag, method(:article_tag_condition?))
160
+ end
88
161
 
89
- return true if node.parent.css('a').size > 1
162
+ def extract_article(article_tag)
163
+ selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
164
+ return unless selected_anchor
90
165
 
91
- false
166
+ @extractor.new(article_tag, base_url: @url, selected_anchor:).call
92
167
  end
93
168
  end
94
169
  end