html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,127 @@
1
# frozen_string_literal: true

##
# The Html2rss namespace.
module Html2rss
  ##
  # Coordinates feed generation pipeline stages.
  class FeedPipeline
    # Retries feed extraction across concrete request strategies for :auto mode.
    #
    # Each strategy in the chain is tried in order; a strategy "wins" as soon
    # as it yields at least one extracted article. Errors that indicate a
    # configuration or policy problem abort the chain immediately.
    class AutoFallback
      # Ordered list of concrete request strategies attempted by auto mode.
      CHAIN = %i[faraday botasaurus browserless].freeze

      # Error classes that should abort auto fallback immediately.
      NON_FALLBACK_ERRORS = [
        RequestService::UnknownStrategy,
        RequestService::InvalidUrl,
        RequestService::UnsupportedUrlScheme,
        RequestService::UnsupportedResponseContentType,
        RequestService::RequestBudgetExceeded,
        RequestService::PrivateNetworkDenied,
        RequestService::CrossOriginFollowUpDenied,
        RequestService::ResponseTooLarge,
        RequestService::BrowserlessConfigurationError
      ].freeze

      ##
      # @param strategies [Array<Symbol>] ordered concrete strategies for fallback
      # @param budget [RequestService::Budget] shared request budget across retries
      # @param session_for [Proc] request session factory proc
      # @param articles_for [Proc] article extraction proc
      # @return [void]
      def initialize(strategies:, budget:, session_for:, articles_for:)
        @strategies = strategies
        @budget = budget
        @session_for = session_for
        @articles_for = articles_for
      end

      ##
      # Tries each strategy in order and returns the first successful state.
      #
      # @return [Hash{Symbol => Object}] pipeline state containing :response and :articles
      # @raise [NoFeedItemsExtracted] when every strategy fails or yields zero items
      def call
        attempts = []

        strategies.each_with_index do |strategy, index|
          outcome = try_strategy(strategy:, fallback: strategies[index + 1], attempts:)
          return outcome if outcome
        end

        raise NoFeedItemsExtracted.new(attempts:)
      end

      private

      attr_reader :strategies, :budget, :session_for, :articles_for

      # Runs one strategy end-to-end; returns the success state or nil.
      def try_strategy(strategy:, fallback:, attempts:)
        session = session_for.call(strategy:, budget:)
        response = fetch(session:, strategy:, fallback:, attempts:)
        return nil unless response

        evaluate(response:, session:, strategy:, fallback:, attempts:)
      end

      # Fetches the initial response, recording (and surviving) recoverable errors.
      # Non-fallback errors are re-raised to abort the whole chain.
      def fetch(session:, strategy:, fallback:, attempts:)
        session.fetch_initial_response
      rescue *NON_FALLBACK_ERRORS
        raise
      rescue StandardError => error
        attempts << { strategy:, items_count: nil, error_class: error.class.name }
        Log.warn("#{self.class}: auto fallback #{strategy} -> #{fallback} after error=#{error.class}") if fallback
        Log.debug("#{self.class}: strategy=#{strategy} error=#{error.class}: #{error.message}")
        nil
      end

      # Extracts articles from the response; returns the success state when at
      # least one item was found, otherwise nil (triggering the next strategy).
      def evaluate(response:, session:, strategy:, fallback:, attempts:)
        articles = articles_for.call(response:, request_session: session)
        attempts << { strategy:, items_count: articles.size, error_class: nil }
        Log.debug("#{self.class}: strategy=#{strategy} items=#{articles.size}")

        if articles.size.positive?
          if attempts.size > 1
            Log.info("#{self.class}: auto selected strategy=#{strategy} after attempts=#{attempts.size}")
          end
          { response:, articles: }
        else
          Log.info("#{self.class}: auto fallback #{strategy} -> #{fallback} after zero extracted items") if fallback
          nil
        end
      end
    end
  end
end
@@ -0,0 +1,127 @@
1
# frozen_string_literal: true

module Html2rss
  ##
  # Builds feeds from validated config through request, extraction, and rendering stages.
  class FeedPipeline
    ##
    # @param raw_config [Hash{Symbol => Object}] user-provided feed config
    def initialize(raw_config)
      @raw_config = raw_config
    end

    ##
    # @return [RSS::Rss] generated RSS feed
    def to_rss
      run do |response:, config:, articles:|
        RssBuilder.new(channel: build_channel(response, config),
                       articles:,
                       stylesheets: config.stylesheets).call
      end
    end

    ##
    # @return [Hash] generated JSONFeed 1.1 payload
    def to_json_feed
      run do |response:, config:, articles:|
        JsonFeedBuilder.new(channel: build_channel(response, config), articles:).call
      end
    end

    private

    attr_reader :raw_config

    # Builds the channel metadata object shared by RSS and JSON feed rendering.
    def build_channel(response, config)
      RssBuilder::Channel.new(response, overrides: config.channel)
    end

    # Validates the raw config, runs the pipeline, and yields its final state.
    def run
      config = Config.from_hash(raw_config, params: raw_config[:params])
      state = pipeline_state_for(config)
      yield response: state.fetch(:response), config:, articles: state.fetch(:articles)
    end

    # Dispatches to the auto-fallback pipeline or to one concrete strategy.
    def pipeline_state_for(config)
      return run_auto_pipeline(config) if config.strategy == :auto

      run_pipeline_for_strategy(config, strategy: config.strategy)
    end

    def run_pipeline_for_strategy(config, strategy:, budget: nil)
      session = request_session_for(config, strategy:, budget:)
      response = session.fetch_initial_response
      { response:, articles: deduplicated_articles(response:, config:, request_session: session) }
    end

    def request_session_for(config, strategy:, budget: nil)
      RequestSession.from_runtime_input(runtime_input_for(config, strategy:), budget:)
    end

    def runtime_input_for(config, strategy:)
      RequestSession::RuntimeInput.new(
        url: config.url,
        headers: config.headers,
        request: config.request,
        strategy:,
        request_policy: RequestSession::RuntimePolicy.from_config(config)
      )
    end

    # Removes duplicate articles gathered from selectors and auto-source scrapers.
    def deduplicated_articles(response:, config:, request_session:)
      articles = collect_articles(response:, config:, request_session:)
      Articles::Deduplicator.new(articles).call
    end

    def run_auto_pipeline(config)
      auto_fallback_for(config).call
    end

    # Wires the strategy chain, shared budget, and factory procs into AutoFallback.
    def auto_fallback_for(config)
      AutoFallback.new(
        strategies: AutoFallback::CHAIN,
        budget: auto_pipeline_budget(config),
        session_for: ->(strategy:, budget:) { request_session_for(config, strategy:, budget:) },
        articles_for: ->(response:, request_session:) { deduplicated_articles(response:, config:, request_session:) }
      )
    end

    def auto_pipeline_budget(config)
      RequestService::Budget.new(max_requests: RequestSession::RuntimePolicy.from_config(config).max_requests)
    end

    def collect_articles(response:, config:, request_session:)
      selector_articles(response:, config:, request_session:) +
        auto_source_articles(response:, config:, request_session:)
    end

    # Extracts articles via configured CSS selectors, following rel=next
    # pagination when a max_pages limit is configured.
    def selector_articles(response:, config:, request_session:)
      selectors = config.selectors
      return [] unless selectors

      paged_responses(response:, selectors:, request_session:).flat_map do |page|
        Selectors.new(page, selectors:, time_zone: config.time_zone).articles
      end
    end

    def paged_responses(response:, selectors:, request_session:)
      max_pages = selectors.dig(:items, :pagination, :max_pages)
      return [response] unless max_pages

      RequestSession::RelNextPager.new(session: request_session, initial_response: response, max_pages:).to_a
    end

    def auto_source_articles(response:, config:, request_session:)
      auto_source = config.auto_source
      return [] unless auto_source

      AutoSource.new(response, auto_source, request_session:).articles
    end
  end
end
@@ -0,0 +1,101 @@
1
# frozen_string_literal: true

module Html2rss
  # Shared helpers for hash normalization and structural operations.
  module HashUtil
    module_function

    # Deeply duplicates nested arrays and hashes.
    #
    # @param object [Object] nested value from configuration or runtime state
    # @return [Object] deep duplicated object
    def deep_dup(object)
      case object
      in Hash
        object.transform_values { deep_dup(_1) }
      in Array
        object.map { deep_dup(_1) }
      else
        begin
          object.dup
        rescue StandardError
          # Un-dup-able objects (e.g. singleton classes raise TypeError) are
          # returned as-is. BUGFIX: the previous `object.dup rescue StandardError`
          # modifier returned the StandardError *class* instead of the object.
          object
        end
      end
    end

    # Deeply merges nested hashes while replacing non-hash values from override.
    #
    # @param base [Hash] base hash
    # @param override [Hash] override hash
    # @return [Hash] merged hash
    def deep_merge(base, override)
      base.merge(override) do |_key, old_val, new_val|
        case [old_val, new_val]
        in [Hash, Hash]
          deep_merge(old_val, new_val)
        else
          new_val
        end
      end
    end

    # Converts string-keyed hashes to symbol-keyed hashes recursively.
    #
    # @param object [Object] value to normalize
    # @param context [String] error context used in raised messages
    # @return [Object] normalized value
    # @raise [ArgumentError] when a key is neither a String nor a Symbol
    def deep_symbolize_keys(object, context: 'hash')
      case object
      in Hash
        object.each_with_object({}) do |(k, v), memo|
          memo[symbol_key(k, context:)] = deep_symbolize_keys(v, context:)
        end
      in Array
        object.map { deep_symbolize_keys(_1, context:) }
      else
        object
      end
    end

    # Validates that hash keys are symbols.
    #
    # @param value [Object] candidate hash container whose keys must be symbols
    # @param context [String] error context
    # @param deep [Boolean] whether nested hashes should also be validated
    # @return [void]
    # @raise [ArgumentError] when a non-symbol key is found
    def assert_symbol_keys!(value, context: 'hash', deep: true)
      return unless value in Hash

      unless value.each_key.all?(Symbol)
        invalid_key = value.keys.find { _1.class != Symbol }
        raise ArgumentError, "#{context} must use symbol keys (found #{invalid_key.inspect})"
      end

      value.each_value { assert_symbol_keys!(_1, context:, deep:) } if deep
    end

    # Validates that hash keys are strings.
    #
    # @param value [Object] candidate hash container whose keys must be strings
    # @param context [String] error context
    # @param deep [Boolean] whether nested hashes should also be validated
    # @return [void]
    # @raise [ArgumentError] when a non-string key is found
    def assert_string_keys!(value, context: 'hash', deep: true)
      return unless value in Hash

      unless value.each_key.all?(String)
        invalid_key = value.keys.find { _1.class != String }
        raise ArgumentError, "#{context} must use string keys (found #{invalid_key.inspect})"
      end

      value.each_value { assert_string_keys!(_1, context:, deep:) } if deep
    end

    # Normalizes a single hash key to a Symbol.
    #
    # @param key [String, Symbol] key to normalize
    # @param context [String] error context
    # @return [Symbol]
    # @raise [ArgumentError] for any other key class
    def symbol_key(key, context:)
      case key
      in Symbol then key
      in String then key.to_sym
      else
        raise ArgumentError, "#{context} must use string or symbol keys (found #{key.inspect})"
      end
    end
    private_class_method :symbol_key
  end
end
@@ -0,0 +1,20 @@
1
# frozen_string_literal: true

module Html2rss
  class HtmlExtractor
    # Extracts the earliest date from an article_tag.
    class DateExtractor
      # Scans all descendants carrying a [datetime] attribute and returns the
      # earliest value that parses; unparseable values are ignored.
      #
      # @param article_tag [Nokogiri::XML::Element] article container node
      # @return [DateTime, nil]
      def self.call(article_tag)
        article_tag
          .css('[datetime]')
          .filter_map { |tag| parse_datetime(tag['datetime']) }
          .min
      end

      # @param value [String, nil] raw datetime attribute value
      # @return [DateTime, nil] parsed datetime, or nil on bad/missing input
      def self.parse_datetime(value)
        DateTime.parse(value)
      rescue ArgumentError, TypeError
        nil
      end
      private_class_method :parse_datetime
    end
  end
end
@@ -0,0 +1,120 @@
1
# frozen_string_literal: true

module Html2rss
  class HtmlExtractor
    ##
    # Extracts enclosures from HTML tags using various strategies.
    class EnclosureExtractor
      # Runs every extraction strategy and concatenates their results.
      #
      # @param article_tag [Nokogiri::XML::Element] article container node
      # @param base_url [String, Html2rss::Url] base URL for relative enclosure links
      # @return [Array<Hash{Symbol => Object}>] normalized enclosure hashes
      def self.call(article_tag, base_url)
        [
          Extractors::Image,
          Extractors::Media,
          Extractors::Pdf,
          Extractors::Iframe,
          Extractors::Archive
        ].flat_map { |strategy| strategy.call(article_tag, base_url:) }
      end
    end

    # Extraction strategies for enclosure-like media/link tags.
    module Extractors
      # Extracts image enclosures from HTML tags.
      # Finds all image sources and returns them in a format suitable for RSS.
      class Image
        # @param article_tag [Nokogiri::XML::Element] article container node
        # @param base_url [String, Html2rss::Url] base URL for relative image sources
        # @return [Array<Hash{Symbol => Object}>] image enclosure hashes
        def self.call(article_tag, base_url:)
          # Skips data: URIs via the CSS selector; empty src values are dropped below.
          article_tag.css('img[src]:not([src^="data"])').filter_map do |img|
            src = img['src'].to_s
            next if src.empty?

            abs_url = Url.from_relative(src, base_url)
            {
              url: abs_url,
              type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'image/jpeg')
            }
          end
        end
      end

      # Extracts media enclosures (video/audio) from HTML tags.
      class Media
        # @param article_tag [Nokogiri::XML::Element] article container node
        # @param base_url [String, Html2rss::Url] base URL for relative media sources
        # @return [Array<Hash{Symbol => Object}>] media enclosure hashes
        def self.call(article_tag, base_url:)
          article_tag.css('video source[src], audio source[src], audio[src]').filter_map do |element|
            src = element['src'].to_s
            next if src.empty?

            {
              url: Url.from_relative(src, base_url),
              # NOTE(review): :type is nil when the tag lacks a type attribute —
              # confirm downstream enclosure building tolerates nil types.
              type: element['type']
            }
          end
        end
      end

      # Extracts PDF enclosures from HTML tags.
      class Pdf
        # @param article_tag [Nokogiri::XML::Element] article container node
        # @param base_url [String, Html2rss::Url] base URL for relative PDF links
        # @return [Array<Hash{Symbol => Object}>] PDF enclosure hashes
        def self.call(article_tag, base_url:)
          article_tag.css('a[href$=".pdf"]').filter_map do |link|
            href = link['href'].to_s
            next if href.empty?

            abs_url = Url.from_relative(href, base_url)
            {
              url: abs_url,
              type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url)
            }
          end
        end
      end

      # Extracts iframe enclosures from HTML tags.
      class Iframe
        # @param article_tag [Nokogiri::XML::Element] article container node
        # @param base_url [String, Html2rss::Url] base URL for relative iframe links
        # @return [Array<Hash{Symbol => Object}>] iframe enclosure hashes
        def self.call(article_tag, base_url:)
          article_tag.css('iframe[src]').filter_map do |iframe|
            src = iframe['src']
            next if src.nil? || src.empty?

            abs_url = Url.from_relative(src, base_url)
            {
              url: abs_url,
              type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'text/html')
            }
          end
        end
      end

      # Extracts archive enclosures (zip, tar.gz, tgz) from HTML tags.
      class Archive
        # Maps archive link suffixes to their IANA media types.
        # BUGFIX: previously every archive (including .tar.gz/.tgz) was labeled
        # 'application/zip'; gzip tarballs are 'application/gzip' (RFC 6713).
        # Longer suffixes are listed first so '.tar.gz' wins over a bare match.
        CONTENT_TYPES = {
          '.tar.gz' => 'application/gzip',
          '.tgz' => 'application/gzip',
          '.zip' => 'application/zip'
        }.freeze

        # @param article_tag [Nokogiri::XML::Element] article container node
        # @param base_url [String, Html2rss::Url] base URL for relative archive links
        # @return [Array<Hash{Symbol => Object}>] archive enclosure hashes
        def self.call(article_tag, base_url:)
          article_tag.css('a[href$=".zip"], a[href$=".tar.gz"], a[href$=".tgz"]').filter_map do |link|
            href = link['href'].to_s
            next if href.empty?

            {
              url: Url.from_relative(href, base_url),
              type: content_type_for(href)
            }
          end
        end

        # @param href [String] anchor href ending in a known archive suffix
        # @return [String] media type for the archive link
        def self.content_type_for(href)
          suffix = CONTENT_TYPES.keys.find { |ext| href.end_with?(ext) }
          CONTENT_TYPES.fetch(suffix, 'application/zip')
        end
        private_class_method :content_type_for
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
# frozen_string_literal: true

module Html2rss
  class HtmlExtractor
    ##
    # Image is responsible for extracting image URLs the article_tag.
    class ImageExtractor
      # Picks the best candidate image for the article container, preferring
      # srcset candidates, then plain <img src>, then CSS background images.
      #
      # @param article_tag [Nokogiri::XML::Element] article container node
      # @param base_url [String, Html2rss::Url] base URL for relative image URLs
      # @return [Html2rss::Url, nil] best candidate image URL
      def self.call(article_tag, base_url:)
        candidate = from_source(article_tag) || from_img(article_tag) || from_style(article_tag)
        return unless candidate

        Url.from_relative(candidate, base_url)
      end

      # @param article_tag [Nokogiri::XML::Element] article container node
      # @return [String, nil] src attribute from first matching image tag
      def self.from_img(article_tag)
        article_tag.at_css('img[src]:not([src^="data"])')&.[]('src')
      end

      ##
      # Extracts the largest image source from the srcset attribute
      # of an img tag or a source tag inside a picture tag.
      #
      # @param article_tag [Nokogiri::XML::Element] article container node
      # @return [String, nil] largest srcset URL candidate
      # @see <https://developer.mozilla.org/en-US/docs/Learn/HTML/Multimedia_and_embedding/Responsive_images>
      # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/img#srcset>
      # @see <https://developer.mozilla.org/en-US/docs/Web/HTML/Element/picture>
      def self.from_source(article_tag)
        by_width = article_tag.css('img[srcset], picture > source[srcset]').flat_map do |node|
          node['srcset'].to_s.scan(/(\S+)\s+(\d+w|\d+h)[\s,]?/).filter_map do |url, descriptor|
            next if url.nil? || url.start_with?('data:')

            # descriptor is e.g. "480w"; String#to_i reads its leading digits.
            [descriptor.to_i, url.strip]
          end
        end.to_h

        # Duplicate widths keep the last occurrence; empty hash yields nil.
        by_width[by_width.keys.max]
      end

      # @param article_tag [Nokogiri::XML::Element] article container node
      # @return [String, nil] best style-based background image URL
      def self.from_style(article_tag)
        urls = article_tag.css('[style*="url"]').filter_map do |tag|
          tag['style'][/url\(['"]?(.*?)['"]?\)/, 1]
        end
        urls.reject { |src| src.start_with?('data:') }.max_by(&:size)
      end
    end
  end
end
@@ -0,0 +1,141 @@
1
# frozen_string_literal: true

module Html2rss
  ##
  # HtmlExtractor is responsible for extracting details (headline, url, images, etc.)
  # from an article_tag.
  class HtmlExtractor
    # Tags ignored when extracting visible text content from article containers.
    INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
    # Heading tags used to prioritize title extraction.
    HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
    # Selector used to derive non-headline description nodes.
    # NOTE(review): this is a frozen Array (":not(h1)"… plus bare tag names),
    # passed directly to Nokogiri's #css in extract_description below —
    # confirm Nokogiri accepts an Array argument there; the mix of :not()
    # pseudo-selectors and bare invisible-tag names also looks intentional
    # only if css treats multiple rules as a union.
    NON_HEADLINE_SELECTOR = (HEADING_TAGS.map { |tag| ":not(#{tag})" } + INVISIBLE_CONTENT_TAGS.to_a).freeze

    # Anchor selector used to identify the canonical article link element.
    # Built once at load time: matches a[href] while excluding empty hrefs and
    # fragment/scheme prefixes that cannot be article links.
    MAIN_ANCHOR_SELECTOR = begin
      buf = +'a[href]:not([href=""])'
      %w[# javascript: mailto: tel: file:// sms: data:].each do |prefix|
        buf << %[:not([href^="#{prefix}"])]
      end
      buf.freeze
    end

    class << self
      ##
      # Extracts visible text from a given node and its children.
      #
      # Recurses depth-first; leaf nodes contribute their stripped #text,
      # non-leaf nodes their recursively extracted text. Blank fragments are
      # dropped, and the joined result has runs of spaces squeezed.
      #
      # @param tag [Nokogiri::XML::Node] the node from which to extract visible text
      # @param separator [String] separator used to join text fragments (default is a space)
      # @return [String, nil] the concatenated visible text, or nil if none is found
      def extract_visible_text(tag, separator: ' ')
        parts = tag.children.filter_map do |child|
          next unless visible_child?(child)

          raw_text = child.children.empty? ? child.text : extract_visible_text(child)
          text = raw_text&.strip
          text unless text.to_s.empty?
        end

        parts.join(separator).squeeze(' ').strip unless parts.empty?
      end

      private

      # A child is visible unless it is a non-rendered tag (svg/script/…) or a
      # pure fragment link (<a href="#…">), which is skipped as navigation chrome.
      def visible_child?(node)
        !INVISIBLE_CONTENT_TAGS.include?(node.name) &&
          !(node.name == 'a' && node['href']&.start_with?('#'))
      end
    end

    ##
    # @param article_tag [Nokogiri::XML::Node] article-like container to extract from
    # @param base_url [String, Html2rss::Url] base url used to resolve relative links
    # @param selected_anchor [Nokogiri::XML::Node, nil] explicit primary anchor for the container
    # @raise [ArgumentError] when article_tag is nil
    def initialize(article_tag, base_url:, selected_anchor:)
      raise ArgumentError, 'article_tag is required' unless article_tag

      @article_tag = article_tag
      @base_url = base_url
      @selected_anchor = selected_anchor
    end

    # Runs every extractor and collects the results into one attributes hash.
    #
    # @return [Hash{Symbol => Object}] extracted article attributes
    def call
      {
        title: extract_title,
        url: extract_url,
        image: extract_image,
        description: extract_description,
        id: generate_id,
        published_at: extract_published_at,
        enclosures: extract_enclosures,
        categories: extract_categories
      }
    end

    private

    attr_reader :article_tag, :base_url, :selected_anchor

    # NOTE(review): this second `class << self` block sits after `private`, but
    # `private` only affects instance methods — main_anchor_for remains a
    # public singleton method. Confirm that visibility is intended.
    class << self
      ##
      # @param article_tag [Nokogiri::XML::Node] article-like container to search within
      # @return [Nokogiri::XML::Node, nil] first eligible descendant anchor
      def main_anchor_for(article_tag)
        # The container itself may be the anchor.
        return article_tag if article_tag.name == 'a' && article_tag.matches?(MAIN_ANCHOR_SELECTOR)

        article_tag.at_css(MAIN_ANCHOR_SELECTOR)
      end
    end

    # Resolves the selected anchor's href (fragment stripped) against base_url.
    # NOTE(review): `||=` does not cache a nil result — when the href is empty
    # this recomputes on every access; harmless but worth confirming.
    def extract_url
      @extract_url ||= begin
        href = selected_anchor&.[]('href').to_s

        Url.from_relative(href.split('#').first.strip, base_url) unless href.empty?
      end
    end

    # Prefers a heading's visible text; falls back to the anchor's own text.
    def extract_title
      title_source = heading || selected_anchor
      self.class.extract_visible_text(title_source) if title_source
    end

    # Picks the highest-ranked heading level present (h1 < h2 < … by string
    # comparison of tag names), then the longest-text heading at that level.
    def heading
      @heading ||= begin
        heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
        smallest_heading = heading_tags.keys.min
        if smallest_heading
          heading_tags[smallest_heading]&.max_by do |tag|
            # nil text ranks lowest via nil.to_i == 0.
            self.class.extract_visible_text(tag)&.size.to_i
          end
        end
      end
    end

    # Prefers text from non-headline nodes (joined with <br>); falls back to
    # the whole container's visible text.
    def extract_description
      text = self.class.extract_visible_text(article_tag.css(NON_HEADLINE_SELECTOR), separator: '<br>')
      return text if text && !text.empty?

      description = self.class.extract_visible_text(article_tag)
      return nil if description.nil? || description.strip.empty?

      description.strip
    end

    # First non-empty candidate among: the container's own id, the first
    # descendant id, the URL path, then the URL query string.
    def generate_id
      [
        article_tag['id'],
        article_tag.at_css('[id]')&.attr('id'),
        extract_url&.path,
        extract_url&.query
      ].compact.reject(&:empty?).first
    end

    def extract_image = ImageExtractor.call(article_tag, base_url:)
    def extract_published_at = DateExtractor.call(article_tag)
    def extract_enclosures = EnclosureExtractor.call(article_tag, base_url)
    def extract_categories = CategoryExtractor.call(article_tag)
  end
end