RubyGems - html2rss - Versions diffs - 0.21.0 → 0.22.0 - Mend

html2rss 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

checksums.yaml +4 -4
data/lib/html2rss/auto_source/scraper/html/class_clustering.rb +185 -0
data/lib/html2rss/auto_source/scraper/html.rb +14 -2
data/lib/html2rss/auto_source/scraper/json_state.rb +1 -1
data/lib/html2rss/auto_source/scraper/microdata.rb +2 -2
data/lib/html2rss/auto_source.rb +3 -1
data/lib/html2rss/cli.rb +61 -10
data/lib/html2rss/config/class_methods.rb +3 -3
data/lib/html2rss/config/validator.rb +1 -0
data/lib/html2rss/config.rb +2 -2
data/lib/html2rss/html_extractor/heading_extractor.rb +50 -0
data/lib/html2rss/html_extractor/id_generator.rb +67 -0
data/lib/html2rss/html_extractor/text_extractor.rb +77 -0
data/lib/html2rss/html_extractor.rb +50 -52
data/lib/html2rss/rendering.rb +2 -2
data/lib/html2rss/request_service/local_file_strategy.rb +29 -0
data/lib/html2rss/request_service/puppet_commander.rb +4 -0
data/lib/html2rss/request_service.rb +2 -1
data/lib/html2rss/selectors/extractors/attribute.rb +1 -1
data/lib/html2rss/selectors/extractors/href.rb +1 -1
data/lib/html2rss/selectors/extractors/html.rb +1 -1
data/lib/html2rss/selectors/extractors/static.rb +1 -1
data/lib/html2rss/selectors/extractors/text.rb +1 -1
data/lib/html2rss/selectors/post_processors/sanitize_html.rb +8 -1
data/lib/html2rss/selectors.rb +1 -1
data/lib/html2rss/url.rb +26 -13
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +28 -6
data/schema/html2rss-config.schema.json +12 -1
metadata +7 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 8168109d2cc60920d8a18b6b99970a5558e43163ad5cd11cb3d3f0d944d46943
-  data.tar.gz: 833a936f89f9ce31c0b4fb0036020c7962a4ac77e0dfa72f1134a0bae8bea4c4
+  metadata.gz: 9ff7cdc4e25f3abc6da000e4f672d6832fb6027e49885aecc0cad38329c5e6ae
+  data.tar.gz: e42c216e328bb2c56971dd58871f023f15e672563e27741c56c6cf7fe4cb322a
 SHA512:
-  metadata.gz: 734f286a486d49c86ab7baf48d157cdee9d988fdc8b693ac7d79bf3c64c661fcd54538d5e94dc19bdc8a6f3021168c1ecac2d8e34417f56879392d71600c7340
-  data.tar.gz: f008a767b452557cff1b45b1abb0eccb26f38d839417e95d21b8cf74f4546f9143b067e2d11f9fc00f5955c3f145f8933a1b1d4912ad25756c890280d4bb1a37
+  metadata.gz: e52812f947561b9a52537f1b28c530f63e116194642d13ff526fac1ad32f02d7ea6ff8ca9b5ee16e2d7f686e1babec1908ca51399ba42ef9461ed3dbe0d02117
+  data.tar.gz: 4754495a5947aca6de71846d1c88128d9fc1826e1014fd00806f58e7cd1e1575dc3bd2380ced3aeecd0cd6d51e0366ddee7a1f7db0220750e13f66645de2edde

data/lib/html2rss/auto_source/scraper/html/class_clustering.rb ADDED Viewed

@@ -0,0 +1,185 @@
+# frozen_string_literal: true
+module Html2rss
+  class AutoSource
+    module Scraper
+      class Html
+        ##
+        # ClassClustering clusters DOM elements on anchorless pages by class lists and scores
+        # candidate groups to find the best list of content cards/articles.
+        # rubocop:disable Metrics/ClassLength
+        class ClassClustering
+          # Node tags considered layout containers
+          LAYOUT_TAG_NAMES = Set['div', 'section', 'article'].freeze
+          # HTML/layout tags excluded from candidate nodes
+          EXCLUDED_TAGS = Set['html', 'body', 'nav', 'footer', 'header', 'svg', 'script', 'style'].freeze
+          class << self
+            ##
+            # Clusters elements in parsed_body and returns the best set of content card nodes.
+            #
+            # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
+            # @param minimum_selector_frequency [Integer] minimum frequency for class groups
+            # @return [Array<Nokogiri::XML::Node>] candidate nodes of the top-scoring class group
+            def call(parsed_body, minimum_selector_frequency:)
+              new(parsed_body, minimum_selector_frequency:).call
+            end
+          end
+          # @param parsed_body [Nokogiri::HTML::Document]
+          # @param minimum_selector_frequency [Integer]
+          def initialize(parsed_body, minimum_selector_frequency:)
+            @parsed_body = parsed_body
+            @minimum_frequency = minimum_selector_frequency
+            @text_words = {}.compare_by_identity
+            @has_date = {}.compare_by_identity
+          end
+          # @return [Array<Nokogiri::XML::Node>]
+          def call
+            candidate_groups = collect_candidate_groups
+            return [] if candidate_groups.empty?
+            non_containers = filter_containers(candidate_groups)
+            final_groups = filter_1_to_1_overlap(non_containers)
+            select_best_group(final_groups)
+          end
+          private
+          def collect_candidate_groups
+            class_groups = Hash.new { |h, k| h[k] = [] }
+            cache = {}.compare_by_identity
+            @parsed_body.css('[class]').each { |node| add_node_to_groups(node, class_groups, cache) }
+            class_groups.select { |_, nodes| nodes.size >= @minimum_frequency }
+          end
+          def add_node_to_groups(node, class_groups, cache)
+            return if EXCLUDED_TAGS.include?(node.name) || HtmlExtractor.ignored_container_path?(node, cache)
+            cls = normalize_class(node['class'])
+            class_groups[cls] << node unless cls.empty?
+          end
+          def normalize_class(class_attr)
+            class_str = class_attr.to_s.strip
+            return '' if class_str.empty?
+            # Bypass split/sort/join allocation for single-class lists
+            if class_str.include?(' ')
+              class_str.split(/\s+/).sort.join(' ')
+            else
+              class_str
+            end
+          end
+          # Discard group A if any node of A contains > 1 node of another group B
+          def filter_containers(groups)
+            groups.reject do |cls_a, nodes_a|
+              groups.any? { |cls_b, nodes_b| cls_a != cls_b && container_of?(nodes_a, nodes_b) }
+            end
+          end
+          def container_of?(nodes_a, nodes_b)
+            return false unless LAYOUT_TAG_NAMES.include?(nodes_b.first.name)
+            nodes_a.any? do |node_a|
+              nodes_b.count { |node_b| node_a != node_b && node_b.ancestors.include?(node_a) } > 1
+            end
+          end
+          # If group A contains group B, and they have the same size:
+          # - If B (the descendant) contains >= 80% of A's words, AND B's tag is div/section/article,
+          #   B is the actual content card. Discard A.
+          # - Otherwise, B is a sub-element (header, metadata line, button). Discard B.
+          def filter_1_to_1_overlap(groups)
+            discarded = Set.new
+            groups.each_key do |cls_a|
+              groups.each_key do |cls_b|
+                next if cls_a == cls_b || discarded.include?(cls_a) || discarded.include?(cls_b)
+                resolve_1_to_1_overlap(cls_a, cls_b, groups, discarded)
+              end
+            end
+            groups.except(*discarded)
+          end
+          def resolve_1_to_1_overlap(cls_a, cls_b, groups, discarded)
+            nodes_a = groups[cls_a]
+            nodes_b = groups[cls_b]
+            return if nodes_a.size != nodes_b.size
+            return unless nodes_a.zip(nodes_b).all? { |a, b| a != b && b.ancestors.include?(a) }
+            discarded << (keep_descendant?(nodes_a, nodes_b) ? cls_a : cls_b)
+          end
+          def keep_descendant?(nodes_a, nodes_b)
+            avg_words(nodes_b) >= 0.8 * avg_words(nodes_a) &&
+              LAYOUT_TAG_NAMES.include?(nodes_b.first.name)
+          end
+          def select_best_group(groups)
+            best_nodes = []
+            best_score = -1
+            groups.each_value do |nodes|
+              score = score_group(nodes)
+              next if score.negative?
+              (best_nodes = nodes) && (best_score = score) if score > best_score
+            end
+            best_nodes
+          end
+          def score_group(nodes)
+            avg_w = avg_words(nodes)
+            return -1 if avg_w < 5
+            score = nodes.size + (avg_w / 5.0)
+            score += 20 if nodes_heading?(nodes)
+            score += 20 if nodes_time?(nodes)
+            score += 40 if nodes_date?(nodes)
+            score
+          end
+          def nodes_heading?(nodes)
+            nodes.any? do |n|
+              n.at_css(HtmlExtractor::HEADING_TAGS.join(',')) ||
+                n.at_css('.font-bold, .font-semibold')
+            end
+          end
+          def nodes_time?(nodes)
+            nodes.any? { |n| n.at_css('time, [datetime]') }
+          end
+          def nodes_date?(nodes)
+            nodes.any? { |n| date?(n) }
+          end
+          def avg_words(nodes)
+            nodes.sum { |n| text_words(n) } / nodes.size.to_f
+          end
+          def text_words(node)
+            @text_words[node] ||= HtmlExtractor.extract_visible_text(node).to_s.scan(/\p{Alnum}+/).size
+          end
+          def date?(node)
+            @has_date[node] ||= begin
+              text = HtmlExtractor.extract_visible_text(node).to_s
+              text.match?(%r{\b\d{4}[-/]\d{2}[-/]\d{2}\b}) ||
+                text.match?(/\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b/i)
+            end
+          end
+          # rubocop:enable Metrics/ClassLength
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/auto_source/scraper/html.rb CHANGED Viewed

@@ -62,6 +62,7 @@ module Html2rss
           @url = url
           @extractor = extractor
           @opts = opts
+          @fallback_anchorless = opts.fetch(:fallback_anchorless, false)
           @link_heuristics = LinkHeuristics.new(url)
           @ignored_cache = {}.compare_by_identity
         end
@@ -105,8 +106,19 @@ module Html2rss
         private
         def articles
-          @articles ||= each_article_tag.filter_map do |article_tag, selected_anchor|
-            extract_article(article_tag, selected_anchor:)
+          @articles ||= begin
+            extracted = each_article_tag.filter_map do |article_tag, selected_anchor|
+              extract_article(article_tag, selected_anchor:)
+            end
+            extracted += find_anchorless_articles if @fallback_anchorless
+            extracted
+          end
+        end
+        def find_anchorless_articles
+          ClassClustering.call(parsed_body, minimum_selector_frequency:).map do |node|
+            @extractor.new(node, base_url: @url, selected_anchor: nil, fallback_anchorless: true).call
           end
         end

data/lib/html2rss/auto_source/scraper/json_state.rb CHANGED Viewed

@@ -305,7 +305,7 @@ module Html2rss
           # rubocop:disable Metrics/MethodLength
           # @param entry [Hash] raw article entry candidate
           # @param base_url [String, Html2rss::Url] base URL for relative link resolution
-          # @return [Hash{Symbol => Object}, nil] normalized article hash for downstream extraction
+          # @return [Hash{Symbol => Object}, nil] normalized article hash for downstream extraction
           def normalise(entry, base_url:)
             return unless entry.is_a?(Hash)

data/lib/html2rss/auto_source/scraper/microdata.rb CHANGED Viewed

@@ -92,7 +92,7 @@ module Html2rss
         attr_reader :parsed_body, :url
         # @param root [Nokogiri::XML::Element] supported Microdata root node
-        # @return [Hash{Symbol => Object}, nil] normalized article hash
+        # @return [Hash{Symbol => Object}, nil] normalized article hash
         def article_from(root)
           schema_object = SchemaObjectBuilder.call(root)
           return unless schema_object
@@ -378,7 +378,7 @@ module Html2rss
           extend ValueNormalizer
           # @param root [Nokogiri::XML::Element] supported microdata root node
-          # @return [Hash{Symbol => Object}, nil] compact schema-like object
+          # @return [Hash{Symbol => Object}, nil] compact schema-like object
           def call(root)
             type = Microdata.supported_type_name(root)
             return unless type

data/lib/html2rss/auto_source.rb CHANGED Viewed

@@ -37,7 +37,8 @@ module Html2rss
         html: {
           enabled: true,
           minimum_selector_frequency: Scraper::Html::DEFAULT_MINIMUM_SELECTOR_FREQUENCY,
-          use_top_selectors: Scraper::Html::DEFAULT_USE_TOP_SELECTORS
+          use_top_selectors: Scraper::Html::DEFAULT_USE_TOP_SELECTORS,
+          fallback_anchorless: true
         }
       },
       cleanup: Cleanup::DEFAULT_CONFIG
@@ -63,6 +64,7 @@ module Html2rss
         optional(:enabled).filled(:bool)
         optional(:minimum_selector_frequency).filled(:integer, gt?: 0)
         optional(:use_top_selectors).filled(:integer, gt?: 0)
+        optional(:fallback_anchorless).filled(:bool)
       end
     end.freeze
     private_constant :SCRAPER_CONFIG

data/lib/html2rss/cli.rb CHANGED Viewed

@@ -48,6 +48,9 @@ module Html2rss
     method_option :max_requests,
                   type: :numeric,
                   desc: 'Maximum requests to allow for this feed build'
+    method_option :input,
+                  type: :string,
+                  desc: 'Local HTML file path to read input from'
     # @param yaml_file [String] path to YAML config
     # @param feed_name [String, nil] optional named feed in multi-feed config
     # @return [void]
@@ -55,6 +58,7 @@ module Html2rss
       config = Html2rss.config_from_yaml_file(yaml_file, feed_name)
       config[:params] = options[:params] || {}
       apply_runtime_request_overrides!(config)
+      apply_local_file_input!(config, options[:input]) if options[:input]
       puts(execute_feed { Html2rss.feed(config) })
     end
@@ -76,20 +80,17 @@ module Html2rss
     method_option :max_requests,
                   type: :numeric,
                   desc: 'Maximum requests to allow for this feed build'
-    # @param url [String] source page URL for auto discovery
+    method_option :input,
+                  type: :string,
+                  desc: 'Local HTML file path to read input from'
+    # @param url [String, nil] source page URL for auto discovery
     # @return [void]
-    def auto(url) # rubocop:disable Metrics/MethodLength
+    def auto(url = nil)
       format = options.fetch(:format, 'rss')
-      source_method = format == 'jsonfeed' ? Html2rss.method(:auto_json_feed) : Html2rss.method(:auto_source)
+      strategy, local_file_path, url = prepare_auto_inputs(url, options[:input])
       result = execute_feed do
-        source_method.call(
-          url,
-          strategy: current_strategy,
-          items_selector: options[:items_selector],
-          max_redirects: options[:max_redirects],
-          max_requests: options[:max_requests]
-        )
+        source_call(url, strategy, local_file_path, format == 'jsonfeed')
       end
       puts(format == 'jsonfeed' ? JSON.pretty_generate(result) : result)
@@ -159,6 +160,33 @@ module Html2rss
       config.delete(:request) if request_config.empty?
     end
+    def apply_local_file_input!(config, input_path)
+      file_path = check_file_exists!(input_path)
+      config[:strategy] = :local_file
+      config[:request] = (config[:request] || {}).merge(local_file_path: file_path)
+      return unless config.dig(:channel, :url).to_s.empty?
+      config[:channel] = (config[:channel] || {}).merge(
+        url: detect_base_url!(file_path, 'Please specify a channel.url in the config.')
+      )
+    end
+    # Resolves the inputs for the auto command.
+    #
+    # @param url [String, nil] URL argument given on the command line
+    # @param input_option [String, nil] value of the --input option
+    # @return [Array] strategy, local file path (nil without --input) and the URL to use
+    # @raise [Thor::Error] when neither a URL nor --input is given
+    def prepare_auto_inputs(url, input_option)
+      unless input_option.nil?
+        local_path = check_file_exists!(input_option)
+        # An explicit URL wins; otherwise the base URL is detected from the file itself.
+        base_url = url || detect_base_url!(
+          local_path, 'Please specify a URL: html2rss auto [URL] --input <file>'
+        )
+        return [:local_file, local_path, base_url]
+      end
+      raise Thor::Error, 'A URL is required unless --input is specified' unless url
+      [current_strategy, nil, url]
+    end
     def request_controls
       Html2rss::RequestControls.new(
         strategy: options[:strategy]&.to_sym,
@@ -213,5 +241,28 @@ module Html2rss
            Html2rss::NoFeedItemsExtracted => error
       raise Thor::Error, error.message
     end
+    def source_call(url, strategy, local_file_path, is_json)
+      method = is_json ? Html2rss.method(:auto_json_feed) : Html2rss.method(:auto_source)
+      method.call(
+        url,
+        strategy:,
+        items_selector: options[:items_selector],
+        max_redirects: options[:max_redirects],
+        max_requests: options[:max_requests],
+        local_file_path:
+      )
+    end
+    def check_file_exists!(path)
+      File.expand_path(path).tap do |file_path|
+        raise Thor::Error, "Input file does not exist: #{path}" unless File.exist?(file_path)
+      end
+    end
+    def detect_base_url!(file_path, error_hint)
+      Html2rss::Url.extract_from_html(File.read(file_path))&.to_s ||
+        raise(Thor::Error, "Could not auto-detect a base URL from HTML metadata. #{error_hint}")
+    end
   end
 end

data/lib/html2rss/config/class_methods.rb CHANGED Viewed

@@ -29,7 +29,7 @@ module Html2rss
       # Validates a configuration hash with the runtime validator.
       #
       # @param config [Hash{Symbol => Object}] the configuration hash
-      # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
+      # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
       # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
       def validate(config, params: UNSET)
         prepared_config = prepare_for_validation(resolve_effective_config(config, params:))
@@ -56,7 +56,7 @@ module Html2rss
       # @param file [String] the YAML file to load
       # @param feed_name [String, nil] optional feed name for multi-feed files
       # @param multiple_feeds_key [Symbol] key under which multiple feeds are defined
-      # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
+      # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
       # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
       def validate_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS, params: UNSET)
         validate(load_yaml(file, feed_name, multiple_feeds_key:), params:)
@@ -99,7 +99,7 @@ module Html2rss
       # and returns a new configuration object.
       #
       # @param config [Hash{Symbol => Object}] the configuration hash.
-      # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting.
+      # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting.
       # @return [Html2rss::Config] the configuration object.
       def from_hash(config, params: UNSET)
         new(resolve_effective_config(config, params:))

data/lib/html2rss/config/validator.rb CHANGED Viewed

@@ -83,6 +83,7 @@ module Html2rss
         optional(:total_timeout_seconds).filled(:integer, gt?: 0)
         optional(:browserless).hash(BrowserlessRequestConfig)
         optional(:botasaurus).hash(BotasaurusRequestConfig)
+        optional(:local_file_path).filled(:string)
       end
       params do

data/lib/html2rss/config.rb CHANGED Viewed

@@ -69,9 +69,9 @@ module Html2rss
     # @return [Hash{Symbol => Object}] request envelope configuration
     def request = config[:request]
-    # @return [Hash{Symbol => Object}, nil] selectors configuration
+    # @return [Hash{Symbol => Object}, nil] selectors configuration
     def selectors = config[:selectors]
-    # @return [Hash{Symbol => Object}, nil] auto-source configuration
+    # @return [Hash{Symbol => Object}, nil] auto-source configuration
     def auto_source = config[:auto_source]
     private

data/lib/html2rss/html_extractor/heading_extractor.rb ADDED Viewed

@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+module Html2rss
+  class HtmlExtractor
+    ##
+    # HeadingExtractor identifies and returns the best heading element within a container.
+    class HeadingExtractor
+      # Heading tags used to prioritize title extraction.
+      HEADING_TAGS = HtmlExtractor::HEADING_TAGS
+      class << self
+        ##
+        # @param article_tag [Nokogiri::XML::Element] container node
+        # @param fallback_anchorless [Boolean] whether to use fallback search
+        # @param selected_anchor [Nokogiri::XML::Node, nil] anchor element
+        # @return [Nokogiri::XML::Node, nil] the heading node, if found
+        def call(article_tag, fallback_anchorless:, selected_anchor:)
+          tags = article_tag.css(HEADING_TAGS.join(','))
+          if tags.any?
+            select_best_heading(tags)
+          elsif fallback_anchorless && selected_anchor.nil?
+            fallback_heading(article_tag)
+          end
+        end
+        private
+        def select_best_heading(tags)
+          min_tag_name = tags.map(&:name).min
+          best_tag = nil
+          max_size = -1
+          tags.each do |tag|
+            next if tag.name != min_tag_name
+            size = TextExtractor.call(tag)&.size.to_i
+            (best_tag = tag) && (max_size = size) if size > max_size
+          end
+          best_tag
+        end
+        def fallback_heading(article_tag)
+          fallback_tags = article_tag.css('strong, b, [class*="title"], [class*="font-bold"], [class*="font-semibold"]')
+          fallback_tags.find { |t| !TextExtractor.call(t).to_s.strip.empty? }
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/html_extractor/id_generator.rb ADDED Viewed

@@ -0,0 +1,67 @@
+# frozen_string_literal: true
+require 'zlib'
+module Html2rss
+  class HtmlExtractor
+    ##
+    # IdGenerator determines the unique ID for an article container node.
+    class IdGenerator
+      class << self
+        ##
+        # @param article_tag [Nokogiri::XML::Element] container node
+        # @param heading [Nokogiri::XML::Node, nil] heading node
+        # @param url [Html2rss::Url, nil] absolute article URL
+        # @param selected_anchor [Nokogiri::XML::Node, nil] anchor element
+        # @param fallback_anchorless [Boolean] whether to use fallback hashing
+        # @return [String, nil] the generated ID, if any
+        def call(article_tag, heading:, url:, selected_anchor:, fallback_anchorless:)
+          id_from_dom = parse_id_from_dom(article_tag, url, selected_anchor)
+          return id_from_dom if id_from_dom
+          heading_text = resolve_heading_text(article_tag, heading, fallback_anchorless)
+          if heading_text && !heading_text.strip.empty?
+            generate_slug(heading_text)
+          elsif fallback_anchorless
+            generate_content_hash(article_tag)
+          end
+        end
+        private
+        def parse_id_from_dom(article_tag, url, selected_anchor)
+          candidates = [article_tag['id'], article_tag.at_css('[id]')&.attr('id')]
+          candidates += [url&.path, url&.query] if selected_anchor
+          candidates.compact.reject(&:empty?).first
+        end
+        def resolve_heading_text(article_tag, heading, fallback_anchorless)
+          text = heading ? TextExtractor.call(heading) : nil
+          if text.nil? || text.strip.empty?
+            fallback_text_node_content(article_tag, fallback_anchorless)
+          else
+            text
+          end
+        end
+        def fallback_text_node_content(article_tag, fallback_anchorless)
+          return unless fallback_anchorless
+          article_tag.xpath('.//text()').find { |t| !t.text.strip.empty? }&.text&.strip
+        end
+        def generate_slug(text)
+          slug = text.downcase.gsub(/[^a-z0-9]+/, '-')
+          slug = slug[1..] if slug.start_with?('-')
+          slug = slug[0..-2] if slug.end_with?('-')
+          slug unless slug.empty?
+        end
+        def generate_content_hash(article_tag)
+          text = TextExtractor.call(article_tag).to_s.strip
+          Zlib.crc32(text).to_s(36) unless text.empty?
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/html_extractor/text_extractor.rb ADDED Viewed

@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+module Html2rss
+  class HtmlExtractor
+    ##
+    # TextExtractor extracts visible text from DOM elements, preserving lists
+    # and block spacing while sanitizing white spaces.
+    class TextExtractor
+      # HTML block elements that trigger line breaks or special formatting.
+      BLOCK_TAGS = %w[p div li ul ol h1 h2 h3 h4 h5 h6 tr br].to_set.freeze
+      # Tags ignored when extracting visible text content.
+      INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
+      class << self
+        ##
+        # @param tag [Nokogiri::XML::Node] the node from which to extract visible text
+        # @param separator [String] separator used to join text fragments (default is a space)
+        # @param exclude_nodes [Array<Nokogiri::XML::Node>, nil] nodes to exclude from extraction
+        # @return [String, nil] the concatenated visible text, or nil if none is found
+        def call(tag, separator: ' ', exclude_nodes: nil)
+          return tag.text.gsub(/\s+/, ' ').strip if tag.respond_to?(:text?) && tag.text?
+          parts = iterate_children(tag, separator, exclude_nodes)
+          return if parts.empty?
+          parts.join.squeeze(' ').strip
+        end
+        private
+        def iterate_children(tag, separator, exclude_nodes)
+          last = false
+          tag.children.each_with_object([]) do |c, p|
+            next if exclude_nodes&.include?(c) || !visible_child?(c)
+            text, block = process_child_node(c, separator, exclude_nodes)
+            next if text.empty?
+            append_separator!(p, separator, block, last)
+            (p << text) && (last = block)
+          end
+        end
+        def process_child_node(child, separator, exclude_nodes)
+          child_text = get_child_text(child, separator, exclude_nodes)
+          return ['', false] if child_text.empty?
+          child_text = "- #{child_text}" if child.name == 'li'
+          [child_text, BLOCK_TAGS.include?(child.name)]
+        end
+        def get_child_text(child, separator, exclude_nodes)
+          if child.children.empty?
+            child.text.to_s.gsub(/\s+/, ' ').strip
+          else
+            call(child, separator:, exclude_nodes:).to_s.strip
+          end
+        end
+        def append_separator!(parts, separator, is_block, last_was_block)
+          return if parts.empty?
+          parts << if is_block || last_was_block
+                     (separator == ' ' ? "\n" : separator)
+                   else
+                     ' '
+                   end
+        end
+        def visible_child?(node)
+          !INVISIBLE_CONTENT_TAGS.include?(node.name) &&
+            !(node.name == 'a' && node['href']&.start_with?('#'))
+        end
+      end
+    end
+  end
+end

data/lib/html2rss/html_extractor.rb CHANGED Viewed

@@ -4,13 +4,11 @@ module Html2rss
   ##
   # HtmlExtractor is responsible for extracting details (headline, url, images, etc.)
   # from an article_tag.
-  class HtmlExtractor # rubocop:disable Metrics/ClassLength
-    # Tags ignored when extracting visible text content from article containers.
-    INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
+  # rubocop:disable Metrics/ClassLength
+  class HtmlExtractor
     # Heading tags used to prioritize title extraction.
     HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
-    # Selector used to derive non-headline description nodes.
-    NON_HEADLINE_SELECTOR = (HEADING_TAGS.map { |tag| ":not(#{tag})" } + INVISIBLE_CONTENT_TAGS.to_a).freeze
     # Element tags that indicate ignored DOM chrome when found in a container path.
     IGNORED_CONTAINER_TAGS = %w[nav footer header svg script style].to_set.freeze
@@ -26,20 +24,14 @@ module Html2rss
     class << self
       ##
       # Extracts visible text from a given node and its children.
+      # Delegates to TextExtractor.
       #
       # @param tag [Nokogiri::XML::Node] the node from which to extract visible text
       # @param separator [String] separator used to join text fragments (default is a space)
+      # @param exclude_nodes [Array<Nokogiri::XML::Node>, nil] nodes to exclude from extraction
       # @return [String, nil] the concatenated visible text, or nil if none is found
-      def extract_visible_text(tag, separator: ' ')
-        parts = tag.children.filter_map do |child|
-          next unless visible_child?(child)
-          raw_text = child.children.empty? ? child.text : extract_visible_text(child)
-          text = raw_text&.strip
-          text unless text.to_s.empty?
-        end
-        parts.join(separator).squeeze(' ').strip unless parts.empty?
+      def extract_visible_text(tag, separator: ' ', exclude_nodes: nil)
+        TextExtractor.call(tag, separator:, exclude_nodes:)
       end
       ##
@@ -74,23 +66,20 @@ module Html2rss
         end
         false
       end
-      def visible_child?(node)
-        !INVISIBLE_CONTENT_TAGS.include?(node.name) &&
-          !(node.name == 'a' && node['href']&.start_with?('#'))
-      end
     end
     ##
     # @param article_tag [Nokogiri::XML::Node] article-like container to extract from
     # @param base_url [String, Html2rss::Url] base url used to resolve relative links
     # @param selected_anchor [Nokogiri::XML::Node, nil] explicit primary anchor for the container
-    def initialize(article_tag, base_url:, selected_anchor:)
+    # @param fallback_anchorless [Boolean] whether to fall back to anchorless extraction
+    def initialize(article_tag, base_url:, selected_anchor:, fallback_anchorless: false)
       raise ArgumentError, 'article_tag is required' unless article_tag
       @article_tag = article_tag
       @base_url = base_url
       @selected_anchor = selected_anchor
+      @fallback_anchorless = fallback_anchorless
     end
     # @return [Hash{Symbol => Object}] extracted article attributes
@@ -115,54 +104,62 @@ module Html2rss
       @extract_url ||= begin
         href = selected_anchor&.[]('href').to_s
-        Url.from_relative(href.split('#').first.strip, base_url) unless href.empty?
+        if href.empty?
+          anchorless_url_fallback
+        else
+          Url.from_relative(href.split('#').first.strip, base_url)
+        end
       end
     end
-    def extract_title
-      title_source = heading || selected_anchor
-      self.class.extract_visible_text(title_source) if title_source
+    def anchorless_url_fallback
+      return unless @fallback_anchorless
+      id = generate_id
+      Url.from_relative("##{id}", base_url) if id
     end
-    def heading
-      @heading ||= begin
-        tags = article_tag.css(HEADING_TAGS.join(','))
-        tags.any? ? select_best_heading(tags) : nil
+    def extract_title
+      title_source = heading || selected_anchor
+      if title_source
+        self.class.extract_visible_text(title_source)
+      else
+        fallback_anchorless_title
       end
     end
-    def select_best_heading(tags)
-      min_tag_name = tags.map(&:name).min
-      best_tag = nil
-      max_size = -1
-      tags.each do |tag|
-        next if tag.name != min_tag_name
+    def fallback_anchorless_title
+      return unless @fallback_anchorless && selected_anchor.nil?
-        size = self.class.extract_visible_text(tag)&.size.to_i
-        (best_tag = tag) && (max_size = size) if size > max_size
-      end
+      text_node = article_tag.xpath('.//text()').find { |t| !t.text.strip.empty? }
+      text_node&.text&.strip
+    end
-      best_tag
+    def heading
+      @heading ||= HeadingExtractor.call(
+        article_tag,
+        fallback_anchorless: @fallback_anchorless,
+        selected_anchor:
+      )
     end
     def extract_description
-      text = self.class.extract_visible_text(article_tag.css(NON_HEADLINE_SELECTOR), separator: '<br>')
-      return text if text && !text.empty?
-      description = self.class.extract_visible_text(article_tag)
-      return nil if description.nil? || description.strip.empty?
+      exclude = [heading, selected_anchor].compact.to_set
+      description = self.class.extract_visible_text(article_tag, exclude_nodes: exclude)
+      return if description.nil?
-      description.strip
+      desc = description.strip
+      desc.empty? ? nil : desc
     end
     def generate_id
-      [
-        article_tag['id'],
-        article_tag.at_css('[id]')&.attr('id'),
-        extract_url&.path,
-        extract_url&.query
-      ].compact.reject(&:empty?).first
+      @generate_id ||= IdGenerator.call(
+        article_tag,
+        heading:,
+        url: (selected_anchor ? extract_url : nil),
+        selected_anchor:,
+        fallback_anchorless: @fallback_anchorless
+      )
     end
     def extract_image = ImageExtractor.call(article_tag, base_url:)
@@ -170,4 +167,5 @@ module Html2rss
     def extract_enclosures = EnclosureExtractor.call(article_tag, base_url)
     def extract_categories = CategoryExtractor.call(article_tag)
   end
+  # rubocop:enable Metrics/ClassLength
 end

data/lib/html2rss/rendering.rb CHANGED Viewed

@@ -4,6 +4,8 @@ module Html2rss
   # Namespace for HTML rendering logic, used to generate rich content such as
   # images, audio, video, or embedded documents for feed descriptions.
   #
+  # @see Html2rss::Rendering::DescriptionBuilder
+  #
   # @example
   #   Html2rss::Rendering::ImageRenderer.new(
   #     url: "https://example.com/image.jpg",
@@ -16,8 +18,6 @@ module Html2rss
   #     image: "https://example.com/image.jpg",
   #     title: "Example"
   #   )
-  #
-  # @see Html2rss::Rendering::DescriptionBuilder
   module Rendering
   end
 end

data/lib/html2rss/request_service/local_file_strategy.rb ADDED Viewed

@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+module Html2rss
+  class RequestService
+    ##
+    # Strategy to read a local HTML file.
+    class LocalFileStrategy < Strategy
+      ##
+      # Executes the local file read.
+      #
+      # @return [Response] the mock response wrapped around the file contents
+      # @raise [ArgumentError] if the local file path is missing
+      # @raise [Errno::ENOENT] if the file does not exist
+      def execute
+        file_path = ctx.request[:local_file_path]
+        raise ArgumentError, 'Local file path is required for local_file strategy' unless file_path
+        raise Errno::ENOENT, "File not found: #{file_path}" unless File.exist?(file_path)
+        body = File.read(file_path)
+        Response.new(
+          body:,
+          headers: { 'content-type' => 'text/html; charset=utf-8' },
+          url: ctx.url,
+          status: 200
+        )
+      end
+    end
+  end
+end

data/lib/html2rss/request_service/puppet_commander.rb CHANGED Viewed

@@ -97,6 +97,10 @@ module Html2rss
       attr_reader :ctx, :browser, :skip_request_resources, :referer, :latest_navigation_response, :main_frame
+      ##
+      # Re-raises a deferred navigation error when one was captured.
+      #
+      # @raise [Html2rss::Error] when a navigation request or response validation failed
       def raise_navigation_error_if_any
         raise @navigation_error if @navigation_error
       end

data/lib/html2rss/request_service.rb CHANGED Viewed

@@ -57,7 +57,8 @@ module Html2rss
       @strategies = {
         faraday: FaradayStrategy,
         botasaurus: BotasaurusStrategy,
-        browserless: BrowserlessStrategy
+        browserless: BrowserlessStrategy,
+        local_file: LocalFileStrategy
       }
       @default_strategy_name = :faraday
     end

data/lib/html2rss/selectors/extractors/attribute.rb CHANGED Viewed

@@ -25,7 +25,7 @@ module Html2rss
       # during post processing with {PostProcessors::ParseTime}.
       class Attribute
         # The available options for the attribute extractor.
-        Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
+        Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true)
         ##
         # Initializes the Attribute extractor.

data/lib/html2rss/selectors/extractors/href.rb CHANGED Viewed

@@ -25,7 +25,7 @@ module Html2rss
       #    'http://blog-without-a-feed.example.com/posts/latest-findings'
       class Href
         # The available options for the href (attribute) extractor.
-        Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
+        Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true)
         ##
         # Initializes the Href extractor.

data/lib/html2rss/selectors/extractors/html.rb CHANGED Viewed

@@ -24,7 +24,7 @@ module Html2rss
       # {PostProcessors::SanitizeHtml}.
       class Html
         # The available options for the html extractor.
-        Options = Struct.new('HtmlOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
+        Options = Struct.new('HtmlOptions', :selector, keyword_init: true)
         ##
         # Initializes the Html extractor.

data/lib/html2rss/selectors/extractors/static.rb CHANGED Viewed

@@ -17,7 +17,7 @@ module Html2rss
       #    'Foobar'
       class Static
         # The available option for the static extractor.
-        Options = Struct.new('StaticOptions', :static, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
+        Options = Struct.new('StaticOptions', :static, keyword_init: true)
         ##
         # Initializes the Static extractor.

data/lib/html2rss/selectors/extractors/text.rb CHANGED Viewed

@@ -22,7 +22,7 @@ module Html2rss
       #    'Lorem ipsum dolor ...'
       class Text
         # The available options for the text extractor.
-        Options = Struct.new('TextOptions', :selector, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
+        Options = Struct.new('TextOptions', :selector, keyword_init: true)
         ##
         # Initializes the Text extractor.

data/lib/html2rss/selectors/post_processors/sanitize_html.rb CHANGED Viewed

@@ -128,8 +128,15 @@ module Html2rss
         ##
         # @return [String, nil]
         def get
-          sanitized_html = Sanitize.fragment(value, self.class.sanitize_config(channel_url)).to_s
+          # Temporarily replace newlines with a placeholder to preserve them during space collapsing
+          temp_value = value.to_s.gsub("\n", ' __NEWLINE_PLACEHOLDER__ ')
+          sanitized_html = Sanitize.fragment(temp_value, self.class.sanitize_config(channel_url)).to_s
           sanitized_html.gsub!(/\s+/, ' ')
+          # Restore newlines and clean up surrounding whitespace
+          sanitized_html.gsub!(/[ \t\r]*__NEWLINE_PLACEHOLDER__[ \t\r]*/, "\n")
+          sanitized_html.gsub!(/\n{3,}/, "\n\n")
           sanitized_html.strip!
           sanitized_html.empty? ? nil : sanitized_html
         end

data/lib/html2rss/selectors.rb CHANGED Viewed

@@ -18,7 +18,7 @@ module Html2rss
     include Enumerable
     # A context instance passed to item extractors and post-processors.
-    Context = Struct.new('Context', :options, :item, :config, :scraper, keyword_init: true) # rubocop:disable Style/RedundantStructKeywordInit
+    Context = Struct.new('Context', :options, :item, :config, :scraper, keyword_init: true)
     # Default selectors options merged into user configuration.
     DEFAULT_CONFIG = { items: { enhance: true } }.freeze

data/lib/html2rss/url.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 require 'addressable/uri'
 require 'cgi'
+require 'nokogiri'
 module Html2rss
   ##
@@ -55,10 +56,9 @@ module Html2rss
     # @return [Url, nil] the sanitized URL, or nil if no valid URL found
     def self.sanitize(raw_url)
       match = raw_url.to_s.match(%r{(?:(?:https?|ftp|mailto)://|mailto:)[^\s<>"]+})
-      url = match ? match[0].strip : ''
-      return nil if url.empty?
+      return unless match
-      new(Addressable::URI.parse(url).normalize)
+      new(Addressable::URI.parse(match[0].strip).normalize)
     end
     ##
@@ -70,10 +70,9 @@ module Html2rss
     def self.from_absolute(url_string)
       return url_string if url_string.is_a?(self)
-      url = new(Addressable::URI.parse(url_string.to_s.strip).normalize)
-      raise ArgumentError, 'URL must be absolute' unless url.absolute?
-      url
+      new(Addressable::URI.parse(url_string.to_s.strip).normalize).tap do |url|
+        raise ArgumentError, 'URL must be absolute' unless url.absolute?
+      end
     rescue Addressable::URI::InvalidURIError
       raise ArgumentError, 'URL must be absolute'
     end
@@ -92,14 +91,28 @@ module Html2rss
     #   Url.for_channel('/relative/path')
     #   # => raises ArgumentError: "URL must be absolute"
     def self.for_channel(url_string)
-      return nil if url_string.nil? || url_string.empty?
-      stripped = url_string.strip
+      stripped = url_string.to_s.strip
       return nil if stripped.empty?
-      url = from_absolute(stripped)
-      validate_channel_url(url)
-      url
+      from_absolute(stripped).tap { validate_channel_url(_1) }
+    end
+    ##
+    # Extracts a base URL from HTML metadata tags.
+    #
+    # @param html [String] raw HTML content
+    # @return [Url, nil] the extracted absolute URL, or nil if none is found
+    def self.extract_from_html(html)
+      doc = Nokogiri::HTML(html)
+      tags = { 'link[rel="canonical"]' => 'href', 'meta[property="og:url"]' => 'content',
+               'meta[name="twitter:url"]' => 'content', 'base[href]' => 'href' }
+      tags.each do |sel, attr|
+        val = doc.at_css(sel)&.[](attr).to_s.strip
+        return from_absolute(val) unless val.empty?
+      rescue ArgumentError
+        next
+      end
+      nil
     end
     ##

data/lib/html2rss/version.rb CHANGED Viewed

@@ -4,6 +4,6 @@
 # The Html2rss namespace.
 module Html2rss
   # Current application version.
-  VERSION = '0.21.0'
+  VERSION = '0.22.0'
   public_constant :VERSION
 end

data/lib/html2rss.rb CHANGED Viewed

@@ -52,6 +52,8 @@ module Html2rss
     FeedPipeline.new(raw_config).to_json_feed
   end
+  # rubocop:disable Metrics/ParameterLists
   ##
   # Scrapes the provided URL and returns an RSS object.
   #
@@ -60,9 +62,15 @@ module Html2rss
   # @param items_selector [String, nil] optional selector hint for item extraction
   # @param max_redirects [Integer, nil] optional redirect limit override
   # @param max_requests [Integer, nil] optional request budget override
+  # @param local_file_path [String, nil] optional local HTML file path
   # @return [RSS::Rss] generated RSS feed
-  def self.auto_source(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil)
-    feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:))
+  def self.auto_source(url,
+                       strategy: :auto,
+                       items_selector: nil,
+                       max_redirects: nil,
+                       max_requests: nil,
+                       local_file_path: nil)
+    feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:, local_file_path:))
   end
   ##
@@ -73,11 +81,20 @@ module Html2rss
   # @param items_selector [String, nil] optional selector hint for item extraction
   # @param max_redirects [Integer, nil] optional redirect limit override
   # @param max_requests [Integer, nil] optional request budget override
+  # @param local_file_path [String, nil] optional local HTML file path
   # @return [Hash] JSONFeed-compliant hash
-  def self.auto_json_feed(url, strategy: :auto, items_selector: nil, max_redirects: nil, max_requests: nil)
-    json_feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:))
+  def self.auto_json_feed(url,
+                          strategy: :auto,
+                          items_selector: nil,
+                          max_redirects: nil,
+                          max_requests: nil,
+                          local_file_path: nil)
+    json_feed(build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:,
+                                       local_file_path:))
   end
+  # rubocop:enable Metrics/ParameterLists
   # rubocop:disable ThreadSafety/ClassInstanceVariable
   class << self
     ##
@@ -125,12 +142,17 @@ module Html2rss
   class << self
     private
-    def build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
-      Config.auto_source_config(
+    def build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:, local_file_path: nil) # rubocop:disable Metrics/ParameterLists
+      config = Config.auto_source_config(
         url:,
         items_selector:,
         request_controls: shortcut_request_controls(strategy:, max_redirects:, max_requests:)
       )
+      if local_file_path
+        config[:request] ||= {}
+        config[:request][:local_file_path] = local_file_path
+      end
+      config
     end
     def shortcut_request_controls(strategy:, max_redirects:, max_requests:)

data/schema/html2rss-config.schema.json CHANGED Viewed

@@ -179,6 +179,12 @@
                     "type": "null"
                   },
                   "exclusiveMinimum": 0
+                },
+                "fallback_anchorless": {
+                  "type": "boolean",
+                  "not": {
+                    "type": "null"
+                  }
                 }
               },
               "required": []
@@ -227,7 +233,8 @@
           "html": {
             "enabled": true,
             "minimum_selector_frequency": 2,
-            "use_top_selectors": 5
+            "use_top_selectors": 5,
+            "fallback_anchorless": true
           }
         },
         "cleanup": {
@@ -535,6 +542,10 @@
             }
           },
           "required": []
+        },
+        "local_file_path": {
+          "type": "string",
+          "minLength": 1
         }
       },
       "required": []

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: html2rss
 version: !ruby/object:Gem::Version
-  version: 0.21.0
+  version: 0.22.0
 platform: ruby
 authors:
 - Gil Desmarais
@@ -278,6 +278,7 @@ files:
 - lib/html2rss/auto_source/cleanup.rb
 - lib/html2rss/auto_source/scraper.rb
 - lib/html2rss/auto_source/scraper/html.rb
+- lib/html2rss/auto_source/scraper/html/class_clustering.rb
 - lib/html2rss/auto_source/scraper/json_state.rb
 - lib/html2rss/auto_source/scraper/link_heuristics.rb
 - lib/html2rss/auto_source/scraper/microdata.rb
@@ -310,10 +311,13 @@ files:
 - lib/html2rss/html_extractor.rb
 - lib/html2rss/html_extractor/date_extractor.rb
 - lib/html2rss/html_extractor/enclosure_extractor.rb
+- lib/html2rss/html_extractor/heading_extractor.rb
+- lib/html2rss/html_extractor/id_generator.rb
 - lib/html2rss/html_extractor/image_extractor.rb
 - lib/html2rss/html_extractor/list_candidates.rb
 - lib/html2rss/html_extractor/semantic_anchor_candidates.rb
 - lib/html2rss/html_extractor/semantic_containers.rb
+- lib/html2rss/html_extractor/text_extractor.rb
 - lib/html2rss/html_navigator.rb
 - lib/html2rss/json_feed_builder.rb
 - lib/html2rss/json_feed_builder/item.rb
@@ -332,6 +336,7 @@ files:
 - lib/html2rss/request_service/budget.rb
 - lib/html2rss/request_service/context.rb
 - lib/html2rss/request_service/faraday_strategy.rb
+- lib/html2rss/request_service/local_file_strategy.rb
 - lib/html2rss/request_service/policy.rb
 - lib/html2rss/request_service/puppet_commander.rb
 - lib/html2rss/request_service/response.rb
@@ -376,7 +381,7 @@ licenses:
 - MIT
 metadata:
   allowed_push_host: https://rubygems.org
-  changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.21.0
+  changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.22.0
   rubygems_mfa_required: 'true'
 rdoc_options: []
 require_paths: