RubyGems - html2rss - Versions diffs - 0.16.0 → 0.18.0 - Mend

html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122)

checksums.yaml +4 -4
data/README.md +48 -657
data/exe/html2rss +1 -1
data/html2rss.gemspec +7 -4
data/lib/html2rss/articles/deduplicator.rb +49 -0
data/lib/html2rss/auto_source/cleanup.rb +33 -5
data/lib/html2rss/auto_source/scraper/html.rb +118 -43
data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
data/lib/html2rss/auto_source/scraper.rb +142 -8
data/lib/html2rss/auto_source.rb +119 -47
data/lib/html2rss/blocked_surface.rb +64 -0
data/lib/html2rss/category_extractor.rb +82 -0
data/lib/html2rss/cli.rb +170 -23
data/lib/html2rss/config/class_methods.rb +189 -0
data/lib/html2rss/config/dynamic_params.rb +68 -0
data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
data/lib/html2rss/config/request_headers.rb +130 -0
data/lib/html2rss/config/schema.rb +208 -0
data/lib/html2rss/config/validator.rb +108 -0
data/lib/html2rss/config.rb +112 -61
data/lib/html2rss/error.rb +6 -0
data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
data/lib/html2rss/html_extractor.rb +136 -0
data/lib/html2rss/html_navigator.rb +46 -0
data/lib/html2rss/json_feed_builder/item.rb +94 -0
data/lib/html2rss/json_feed_builder.rb +58 -0
data/lib/html2rss/rendering/audio_renderer.rb +31 -0
data/lib/html2rss/rendering/description_builder.rb +88 -0
data/lib/html2rss/rendering/image_renderer.rb +31 -0
data/lib/html2rss/rendering/media_renderer.rb +33 -0
data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
data/lib/html2rss/rendering/video_renderer.rb +31 -0
data/lib/html2rss/rendering.rb +14 -0
data/lib/html2rss/request_controls.rb +128 -0
data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
data/lib/html2rss/request_service/budget.rb +39 -0
data/lib/html2rss/request_service/context.rb +64 -20
data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
data/lib/html2rss/request_service/policy.rb +248 -0
data/lib/html2rss/request_service/puppet_commander.rb +212 -13
data/lib/html2rss/request_service/response.rb +42 -2
data/lib/html2rss/request_service/response_guard.rb +62 -0
data/lib/html2rss/request_service.rb +31 -15
data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
data/lib/html2rss/request_session/runtime_input.rb +57 -0
data/lib/html2rss/request_session/runtime_policy.rb +76 -0
data/lib/html2rss/request_session.rb +118 -0
data/lib/html2rss/rss_builder/article.rb +166 -0
data/lib/html2rss/rss_builder/channel.rb +96 -11
data/lib/html2rss/rss_builder/enclosure.rb +48 -0
data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
data/lib/html2rss/rss_builder.rb +72 -71
data/lib/html2rss/selectors/config.rb +122 -0
data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
data/lib/html2rss/selectors/extractors/href.rb +53 -0
data/lib/html2rss/selectors/extractors/html.rb +48 -0
data/lib/html2rss/selectors/extractors/static.rb +41 -0
data/lib/html2rss/selectors/extractors/text.rb +46 -0
data/lib/html2rss/selectors/extractors.rb +52 -0
data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
data/lib/html2rss/selectors/post_processors/base.rb +74 -0
data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
data/lib/html2rss/selectors/post_processors/template.rb +73 -0
data/lib/html2rss/selectors/post_processors.rb +43 -0
data/lib/html2rss/selectors.rb +294 -0
data/lib/html2rss/url.rb +262 -0
data/lib/html2rss/version.rb +1 -1
data/lib/html2rss.rb +129 -70
data/lib/tasks/config_schema.rake +17 -0
data/schema/html2rss-config.schema.json +469 -0
metadata +120 -46
data/lib/html2rss/attribute_post_processors/base.rb +0 -74
data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
data/lib/html2rss/attribute_post_processors/template.rb +0 -101
data/lib/html2rss/attribute_post_processors.rb +0 -44
data/lib/html2rss/auto_source/article.rb +0 -127
data/lib/html2rss/auto_source/channel.rb +0 -78
data/lib/html2rss/auto_source/reducer.rb +0 -48
data/lib/html2rss/auto_source/rss_builder.rb +0 -70
data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
data/lib/html2rss/config/channel.rb +0 -125
data/lib/html2rss/config/selectors.rb +0 -103
data/lib/html2rss/item.rb +0 -186
data/lib/html2rss/item_extractors/attribute.rb +0 -50
data/lib/html2rss/item_extractors/href.rb +0 -52
data/lib/html2rss/item_extractors/html.rb +0 -46
data/lib/html2rss/item_extractors/static.rb +0 -39
data/lib/html2rss/item_extractors/text.rb +0 -44
data/lib/html2rss/item_extractors.rb +0 -88
data/lib/html2rss/object_to_xml_converter.rb +0 -56
data/lib/html2rss/rss_builder/item.rb +0 -83
data/lib/html2rss/utils.rb +0 -113

data/lib/html2rss/request_service/policy.rb ADDED Viewed

@@ -0,0 +1,248 @@
+# frozen_string_literal: true
+require 'ipaddr'
+require 'resolv'
+require 'socket'
+module Html2rss
+  class RequestService
+    ##
+    # Describes the runtime request envelope for a single feed build.
+    class Policy # rubocop:disable Metrics/ClassLength
+      # Hard upper bound applied to max_requests in #initialize, whatever the caller passes.
+      MAX_REQUESTS_CEILING = 10
+      # Hostnames denied by name (compared lowercased) before any address lookup happens.
+      LOCAL_HOSTS = %w[localhost localhost.localdomain metadata.google.internal].to_set.freeze
+      # Address ranges treated as non-public. A target inside any of them is
+      # denied unless the policy allows private networks.
+      BLOCKED_IP_RANGES = [
+        IPAddr.new('0.0.0.0/8'),      # IPv4 "this network"
+        IPAddr.new('10.0.0.0/8'),     # IPv4 private
+        IPAddr.new('127.0.0.0/8'),    # IPv4 loopback
+        IPAddr.new('169.254.0.0/16'), # IPv4 link-local
+        IPAddr.new('172.16.0.0/12'),  # IPv4 private
+        IPAddr.new('192.168.0.0/16'), # IPv4 private
+        IPAddr.new('224.0.0.0/4'),    # IPv4 multicast
+        IPAddr.new('::/128'),         # IPv6 unspecified address
+        IPAddr.new('::1/128'),        # IPv6 loopback
+        IPAddr.new('fe80::/10'),      # IPv6 link-local
+        IPAddr.new('fc00::/7'),       # IPv6 unique local
+        IPAddr.new('ff00::/8')        # IPv6 multicast
+      ].freeze
+      DEFAULTS = {
+        connect_timeout_seconds: 5,
+        read_timeout_seconds: 10,
+        total_timeout_seconds: 30,
+        max_redirects: 3,
+        max_response_bytes: 5_242_880,
+        max_decompressed_bytes: 10_485_760,
+        max_requests: 1,
+        allow_private_networks: false,
+        allow_cross_origin_followups: false
+      }.freeze
+      ##
+      # @param connect_timeout_seconds [Integer] maximum connection setup time
+      # @param read_timeout_seconds [Integer] maximum read stall time
+      # @param total_timeout_seconds [Integer] maximum total request time
+      # @param max_redirects [Integer] maximum redirect count
+      # @param max_response_bytes [Integer] maximum streamed response bytes
+      # @param max_decompressed_bytes [Integer] maximum final body size
+      # @param max_requests [Integer] maximum requests per feed build
+      # @param allow_private_networks [Boolean] whether private network targets are allowed
+      # @param allow_cross_origin_followups [Boolean] whether follow-up requests may leave the origin host
+      # @param resolver [#each_address] DNS resolver used for hostname classification
+      def initialize(connect_timeout_seconds: DEFAULTS[:connect_timeout_seconds], # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
+                     read_timeout_seconds: DEFAULTS[:read_timeout_seconds],
+                     total_timeout_seconds: DEFAULTS[:total_timeout_seconds],
+                     max_redirects: DEFAULTS[:max_redirects],
+                     max_response_bytes: DEFAULTS[:max_response_bytes],
+                     max_decompressed_bytes: DEFAULTS[:max_decompressed_bytes],
+                     max_requests: DEFAULTS[:max_requests],
+                     allow_private_networks: DEFAULTS[:allow_private_networks],
+                     allow_cross_origin_followups: DEFAULTS[:allow_cross_origin_followups],
+                     resolver: Socket)
+        # Every numeric option must be an Integer; invalid values raise ArgumentError.
+        @connect_timeout_seconds = validate_positive_integer!(:connect_timeout_seconds, connect_timeout_seconds)
+        @read_timeout_seconds = validate_positive_integer!(:read_timeout_seconds, read_timeout_seconds)
+        @total_timeout_seconds = validate_positive_integer!(:total_timeout_seconds, total_timeout_seconds)
+        # Zero redirects is a valid setting, hence the non-negative check.
+        @max_redirects = validate_non_negative_integer!(:max_redirects, max_redirects)
+        @max_response_bytes = validate_positive_integer!(:max_response_bytes, max_response_bytes)
+        @max_decompressed_bytes = validate_positive_integer!(:max_decompressed_bytes, max_decompressed_bytes)
+        # Values above MAX_REQUESTS_CEILING are silently clamped, not rejected.
+        @max_requests = [validate_positive_integer!(:max_requests, max_requests), MAX_REQUESTS_CEILING].min
+        # Coerce any truthy/falsy input to a strict true/false.
+        @allow_private_networks = allow_private_networks ? true : false
+        @allow_cross_origin_followups = allow_cross_origin_followups ? true : false
+        @resolver = resolver
+        # The instance is frozen once built.
+        freeze
+      end
+      attr_reader :connect_timeout_seconds,
+                  :read_timeout_seconds,
+                  :total_timeout_seconds,
+                  :max_redirects,
+                  :max_response_bytes,
+                  :max_decompressed_bytes,
+                  :max_requests
+      ##
+      # @return [Boolean] whether private network targets may be requested
+      def allow_private_networks?
+        @allow_private_networks
+      end
+      ##
+      # @return [Boolean] whether follow-up requests may leave the initial origin
+      def allow_cross_origin_followups?
+        @allow_cross_origin_followups
+      end
+      ##
+      # Returns the default request policy.
+      #
+      # @return [Policy] a default, frozen policy instance
+      # rubocop:disable Layout/ClassStructure
+      def self.default
+        new
+      end
+      # rubocop:enable Layout/ClassStructure
+      ##
+      # Validates whether a request target is permitted for the given context.
+      #
+      # @param url [Html2rss::Url] destination URL
+      # @param origin_url [Html2rss::Url] initial URL of the feed build
+      # @param relation [Symbol] logical reason for the request
+      # @return [void]
+      # @raise [CrossOriginFollowUpDenied] if a follow-up leaves the origin host
+      # @raise [PrivateNetworkDenied] if the target resolves to a private address
+      def validate_request!(url:, origin_url:, relation:)
+        enforce_same_origin!(url, origin_url, relation)
+        enforce_public_network!(url)
+      end
+      ##
+      # Validates a redirect hop before it is followed.
+      #
+      # @param from_url [Html2rss::Url] URL that produced the redirect
+      # @param to_url [Html2rss::Url] redirect destination
+      # @param origin_url [Html2rss::Url] initial URL of the feed build
+      # @param relation [Symbol] logical reason for the request
+      # @return [void]
+      # @raise [UnsupportedUrlScheme] if the redirect downgrades from HTTPS to HTTP
+      def validate_redirect!(from_url:, to_url:, origin_url:, relation:)
+        if from_url.scheme == 'https' && to_url.scheme == 'http'
+          raise UnsupportedUrlScheme, 'Redirect downgraded from https to http'
+        end
+        validate_request!(url: to_url, origin_url:, relation:)
+      end
+      ##
+      # Validates the resolved remote IP for a completed request.
+      #
+      # @param ip [String, nil] remote IP address reported by the client
+      # @param url [Html2rss::Url] URL associated with the response
+      # @return [void]
+      # @raise [PrivateNetworkDenied] if the response came from a blocked address
+      def validate_remote_ip!(ip:, url:)
+        return if allow_private_networks?
+        # No IP reported by the client: there is nothing to check, so the response is accepted.
+        return if ip.nil? || ip.empty?
+        parsed_ip = parse_ip(ip)
+        # An IP was reported but cannot be parsed: deny rather than guess.
+        raise PrivateNetworkDenied, "Remote IP could not be validated for #{url}" unless parsed_ip
+        return unless blocked_ip?(parsed_ip)
+        raise PrivateNetworkDenied, "Private network target denied for #{url}"
+      end
+      private
+      attr_reader :resolver
+      # Returns +value+ unchanged when it is an Integer greater than zero.
+      #
+      # @param name [Symbol] option name used in the error message
+      # @param value [Object] candidate value
+      # @return [Integer] the validated value
+      # @raise [ArgumentError] if the value is not a positive Integer
+      def validate_positive_integer!(name, value)
+        raise ArgumentError, "#{name} must be positive" unless value.is_a?(Integer) && value.positive?
+        value
+      end
+      # Returns +value+ unchanged when it is an Integer that is zero or greater.
+      #
+      # @param name [Symbol] option name used in the error message
+      # @param value [Object] candidate value
+      # @return [Integer] the validated value
+      # @raise [ArgumentError] if the value is not a non-negative Integer
+      def validate_non_negative_integer!(name, value)
+        raise ArgumentError, "#{name} must be non-negative" unless value.is_a?(Integer) && !value.negative?
+        value
+      end
+      # Rejects follow-up requests whose host/port pair differs from the origin's.
+      # The :initial request, and policies that allow cross-origin follow-ups,
+      # are exempt from both checks below.
+      def enforce_same_origin!(url, origin_url, relation)
+        return if relation == :initial || allow_cross_origin_followups?
+        # The scheme downgrade check runs first, so it also applies to same-host targets.
+        enforce_follow_up_scheme!(url, origin_url)
+        return if comparable_origin(url) == comparable_origin(origin_url)
+        raise CrossOriginFollowUpDenied, "Cross-origin follow-up denied for #{url}"
+      end
+      # Refuses a follow-up that moves from an https origin to an http target.
+      # Any other scheme combination passes.
+      def enforce_follow_up_scheme!(url, origin_url)
+        return unless origin_url.scheme == 'https' && url.scheme == 'http'
+        raise UnsupportedUrlScheme, "Follow-up downgraded from https to http for #{url}"
+      end
+      # Identity used for same-origin comparison: host plus effective port.
+      # The scheme itself is not part of the pair.
+      def comparable_origin(url)
+        [url.host, normalized_port(url)]
+      end
+      # Returns the explicit port when the URL carries one; otherwise 443 for
+      # https and 80 for every other scheme.
+      def normalized_port(url)
+        return url.port if url.port
+        url.scheme == 'https' ? 443 : 80
+      end
+      # Denies targets that are local by name or whose addresses fall inside a
+      # blocked range, unless the policy allows private networks.
+      #
+      # @raise [PrivateNetworkDenied] if the host is considered non-public
+      def enforce_public_network!(url)
+        host = url.host
+        return if allow_private_networks?
+        # Name check first; addresses are only resolved when the name is not already blocked.
+        return unless blocked_host?(host) || resolved_ip_addresses(host).any? { |address| blocked_ip?(address) }
+        raise PrivateNetworkDenied, "Private network target denied for #{url}"
+      end
+      # Case-insensitive membership test against LOCAL_HOSTS.
+      # A nil host is converted to an empty string and therefore never matches.
+      def blocked_host?(host)
+        LOCAL_HOSTS.include?(host.to_s.downcase)
+      end
+      # Returns the IPAddr objects a host maps to.
+      #
+      # A host that already parses as an IP is returned as a single literal and
+      # no lookup is made. Otherwise the injected resolver is queried, through
+      # #each_address when it offers that method and #getaddrinfo when not.
+      #
+      # @return [Array<IPAddr>] resolved addresses; empty when the lookup raises
+      def resolved_ip_addresses(host)
+        literal = parse_ip(host)
+        return [literal] if literal
+        if resolver.respond_to?(:each_address)
+          addresses_from_each_address(host)
+        else
+          addresses_from_getaddrinfo(host)
+        end
+      # Lookup failures yield an empty list, so an unresolvable host is not
+      # reported as blocked by the caller.
+      rescue Resolv::ResolvError, SocketError, SystemCallError
+        []
+      end
+      # Collects the addresses yielded by resolver.each_address as IPAddr
+      # objects; values that do not parse are dropped.
+      def addresses_from_each_address(host)
+        [].tap do |addresses|
+          resolver.each_address(host) do |address|
+            parsed = parse_ip(address)
+            addresses << parsed if parsed
+          end
+        end
+      end
+      # Collects addresses from resolver.getaddrinfo as IPAddr objects.
+      # The address is read from index 3 of each entry (the numeric address
+      # field in Socket.getaddrinfo results); unparseable values are dropped.
+      def addresses_from_getaddrinfo(host)
+        resolver.getaddrinfo(host, nil).filter_map do |entry|
+          parse_ip(entry[3])
+        end
+      end
+      # Parses +value+ into an IPAddr.
+      #
+      # @return [IPAddr, nil] nil when IPAddr rejects the value as an invalid
+      #   address or address family (which is how plain hostnames end up here)
+      def parse_ip(value)
+        IPAddr.new(value)
+      rescue IPAddr::AddressFamilyError, IPAddr::InvalidAddressError
+        nil
+      end
+      def blocked_ip?(address)
+        BLOCKED_IP_RANGES.any? { |range| range.include?(address) }
+      end
+    end
+    Policy::DEFAULT_POLICY = Policy.new
+  end
+end

data/lib/html2rss/request_service/puppet_commander.rb CHANGED Viewed

@@ -4,7 +4,13 @@ module Html2rss
   class RequestService
     ##
     # Commands the Puppeteer Browser to the website and builds the Response.
-    class PuppetCommander
+    class PuppetCommander # rubocop:disable Metrics/ClassLength
+      BROWSER_UNSAFE_HEADERS = %w[
+        host connection content-length transfer-encoding
+        sec-fetch-dest sec-fetch-mode sec-fetch-site sec-fetch-user
+        upgrade-insecure-requests
+      ].to_set.freeze
       # @param ctx [Context]
       # @param browser [Puppeteer::Browser]
       # @param skip_request_resources [Set<String>] the resource types not to request
@@ -19,13 +25,18 @@ module Html2rss
         @referer = referer
       end
-      # @return [Response]
+      ##
+      # Visits the request URL and normalizes the page into a response object.
+      #
+      # @return [Response] rendered page response
       def call
         page = new_page
-        response = navigate_to_destination(page, ctx.url)
-        Response.new(body: body(page), headers: response.headers)
+        navigation_response = navigate_to_destination(page, ctx.url)
+        perform_preload(page)
+        raise_navigation_error_if_any
+        final_navigation_response = latest_navigation_response || navigation_response
+        validate_navigation_response!(final_navigation_response)
+        build_response(page, final_navigation_response)
       ensure
         page&.close
       end
@@ -35,27 +46,215 @@ module Html2rss
       # @see https://yusukeiwaki.github.io/puppeteer-ruby-docs/Puppeteer/Page.html
       def new_page
         page = browser.new_page
-        page.extra_http_headers = ctx.headers
+        @main_frame = page.main_frame if page.respond_to?(:main_frame)
+        configure_page(page)
+        configure_navigation_guards(page)
+        page
+      end
-        return page if skip_request_resources.empty?
+      ##
+      # @param page [Puppeteer::Page]
+      # @return [void]
+      def configure_page(page)
+        page.extra_http_headers = browser_headers
+        page.default_navigation_timeout = navigation_timeout_ms
+        page.default_timeout = navigation_timeout_ms
+      end
+      ##
+      # @param page [Puppeteer::Page]
+      # @return [void]
+      def configure_navigation_guards(page)
         page.request_interception = true
         page.on('request') do |request|
-          skip_request_resources.member?(request.resource_type) ? request.abort : request.continue
+          handle_request(request)
         end
-        page
+        page.on('response') { |response| handle_response(response) }
       end
+      ##
+      # @param page [Puppeteer::Page] browser page
+      # @param url [Html2rss::Url] target URL
+      # @return [Puppeteer::HTTPResponse, nil] the navigation response if one was produced
       def navigate_to_destination(page, url)
-        page.goto(url, wait_until: 'networkidle0', referer:)
+        @navigation_error = nil
+        @latest_navigation_response = nil
+        page.goto(url, wait_until: 'networkidle0', referer:, timeout: navigation_timeout_ms).tap do
+          raise_navigation_error_if_any
+        end
+      rescue StandardError
+        raise_navigation_error_if_any
+        raise
       end
+      ##
+      # @param page [Puppeteer::Page] browser page
+      # @return [String] rendered HTML content
       def body(page) = page.content
       private
-      attr_reader :ctx, :browser, :skip_request_resources, :referer
+      attr_reader :ctx, :browser, :skip_request_resources, :referer, :latest_navigation_response, :main_frame
+      # Raises the error recorded by the request/response handlers, if any.
+      # Only the first such error is kept (see #store_navigation_error).
+      def raise_navigation_error_if_any
+        raise @navigation_error if @navigation_error
+      end
+      # Policy total timeout converted from seconds to milliseconds.
+      def navigation_timeout_ms
+        ctx.policy.total_timeout_seconds * 1000
+      end
+      # ctx.headers without the entries named in BROWSER_UNSAFE_HEADERS.
+      # Keys are matched case-insensitively and may be strings or symbols.
+      def browser_headers
+        ctx.headers.reject { |key, _| BROWSER_UNSAFE_HEADERS.include?(key.to_s.downcase) }
+      end
+      # Request-interception callback. Validates the request against the policy,
+      # then aborts resource types listed in skip_request_resources and lets
+      # everything else continue.
+      def handle_request(request)
+        validate_request!(request)
+        skip_request_resources.member?(request.resource_type) ? request.abort : request.continue
+      rescue Html2rss::Error => error
+        # A denied request is always aborted; the error itself is only recorded
+        # when the request is a navigation request.
+        store_navigation_error(error, navigation_request: request.navigation_request?)
+        request.abort
+      end
+      # Response callback. Remembers the most recent main-frame navigation
+      # response and validates the remote IP of every response.
+      def handle_response(response)
+        # Recorded before validation, so it is kept even when validation fails.
+        @latest_navigation_response = response if main_frame_navigation_response?(response)
+        validate_response!(response)
+      rescue Html2rss::Error => error
+        store_navigation_error(error, navigation_request: response.request.navigation_request?)
+      end
+      # Validates the request's redirect hops first, then the request target itself.
+      def validate_request!(request)
+        validate_navigation_redirect_chain!(request)
+        validate_navigation_target!(request)
+      end
+      # Decides whether +response+ belongs to a navigation of the main frame.
+      # Answers true whenever the frame cannot be determined.
+      def main_frame_navigation_response?(response)
+        request = response.request
+        return false unless request.navigation_request?
+        return true unless request.respond_to?(:frame)
+        frame = request.frame
+        return true if frame.nil?
+        # Prefer comparing against the main frame captured in #new_page.
+        return frame == main_frame unless main_frame.nil?
+        return true unless frame.respond_to?(:parent_frame)
+        # Without a captured main frame, a frame with no parent is taken to be the main one.
+        frame.parent_frame.nil?
+      end
+      # Builds the Response from the rendered page content.
+      # The body is passed through ResponseGuard#inspect_body! before the
+      # Response is created. With no navigation response, headers default to an
+      # empty hash, status to nil, and the URL falls back to ctx.url.
+      def build_response(page, navigation_response)
+        page_body = body(page)
+        ResponseGuard.new(policy: ctx.policy).inspect_body!(page_body)
+        Response.new(
+          body: page_body,
+          headers: navigation_response&.headers || {},
+          url: response_url(navigation_response, ctx.url),
+          status: navigation_response&.status
+        )
+      end
+      # Checks the remote IP reported for +navigation_response+ against the
+      # policy, using the response URL (or ctx.url as fallback) in error messages.
+      def validate_navigation_response!(navigation_response)
+        final_url = response_url(navigation_response, ctx.url)
+        ctx.policy.validate_remote_ip!(ip: remote_ip(navigation_response), url: final_url)
+      end
+      # Validation applied to every response seen by #handle_response;
+      # delegates to the navigation-response check unchanged.
+      def validate_response!(response)
+        validate_navigation_response!(response)
+      end
+      # URL of the navigation response as an Html2rss::Url, or +fallback_url+
+      # when there is no response or it reports no URL.
+      def response_url(navigation_response, fallback_url)
+        raw_url = navigation_response&.url || fallback_url.to_s
+        Html2rss::Url.from_absolute(raw_url)
+      end
+      def remote_ip(navigation_response)
+        navigation_response.remote_address&.ip
+      end
+      # URLs of every request in the redirect chain followed by the request
+      # itself, in that order.
+      def request_chain(request)
+        (request.redirect_chain + [request]).map { |entry| request_url(entry) }
+      end
+      # The request's URL wrapped as an Html2rss::Url.
+      def request_url(request)
+        Html2rss::Url.from_absolute(request.url)
+      end
+      # Validates each consecutive hop (from → to) of the request's redirect
+      # chain against the policy. A request without redirects has no hops.
+      def validate_navigation_redirect_chain!(request)
+        request_chain(request).each_cons(2) do |from_url, to_url|
+          ctx.policy.validate_redirect!(from_url:, to_url:, origin_url: ctx.origin_url, relation: ctx.relation)
+        end
+      end
+      # Validates the request's own URL against the policy for the current
+      # origin and relation.
+      def validate_navigation_target!(request)
+        ctx.policy.validate_request!(url: request_url(request), origin_url: ctx.origin_url, relation: ctx.relation)
+      end
+      # Records +error+ for later re-raising. Errors from non-navigation
+      # requests are ignored, and an already recorded error is never overwritten.
+      def store_navigation_error(error, navigation_request:)
+        return unless navigation_request
+        @navigation_error = error if @navigation_error.nil?
+      end
+      # Runs the optional preload steps from ctx.browserless_preload in order:
+      # wait, click selectors, scroll down, wait. Does nothing without a config.
+      def perform_preload(page)
+        preload_config = ctx.browserless_preload
+        return unless preload_config
+        # The same top-level wait_after_ms is applied before and after the interactions.
+        wait_after(page, preload_config[:wait_after_ms])
+        click_selectors(page, preload_config[:click_selectors]) if preload_config[:click_selectors]
+        scroll_down(page, preload_config[:scroll_down]) if preload_config[:scroll_down]
+        wait_after(page, preload_config[:wait_after_ms])
+      end
+      # Waits +timeout_ms+ milliseconds on the page; a nil timeout is a no-op.
+      # ctx.budget.consume! is called before each wait.
+      def wait_after(page, timeout_ms)
+        return unless timeout_ms
+        ctx.budget.consume!
+        page.wait_for_timeout(timeout_ms)
+      end
+      # Applies #click_selector to each selector config, in the given order.
+      def click_selectors(page, selectors)
+        selectors.each { |selector_config| click_selector(page, selector_config) }
+      end
+      # Scrolls to the bottom of the page up to config[:iterations] times
+      # (default 1), stopping early once an iteration reports no height growth.
+      def scroll_down(page, config)
+        iterations = config.fetch(:iterations, 1)
+        wait_after_ms = config[:wait_after_ms]
+        previous_height = nil
+        iterations.times do
+          updated_height = perform_scroll_iteration(page, wait_after_ms, previous_height)
+          # nil means the page height did not grow since the previous iteration.
+          break unless updated_height
+          previous_height = updated_height
+        end
+      end
+      # Clicks the element matching config[:selector] up to config[:max_clicks]
+      # times (default 1). The element is looked up again before every click and
+      # the loop stops as soon as nothing matches.
+      #
+      # @raise [KeyError] if the config has no :selector
+      def click_selector(page, config)
+        selector = config.fetch(:selector)
+        max_clicks = config.fetch(:max_clicks, 1)
+        wait_after_ms = config[:wait_after_ms]
+        max_clicks.times do
+          break unless (element = page.query_selector(selector))
+          # ctx.budget.consume! is called once per click, before clicking.
+          ctx.budget.consume!
+          element.click
+          wait_after(page, wait_after_ms)
+        end
+      end
+      # Scrolls to the bottom once, waits, and reports the resulting page height.
+      #
+      # @return [Object, nil] the new document.body.scrollHeight, or nil when a
+      #   previous height exists and the page did not grow beyond it
+      def perform_scroll_iteration(page, wait_after_ms, previous_height)
+        ctx.budget.consume!
+        page.evaluate('() => window.scrollTo(0, document.body.scrollHeight)')
+        wait_after(page, wait_after_ms)
+        current_height = page.evaluate('() => document.body.scrollHeight')
+        return if previous_height && current_height <= previous_height
+        current_height
+      end
     end
   end
 end

data/lib/html2rss/request_service/response.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 # frozen_string_literal: true
+require 'nokogiri'
 module Html2rss
   class RequestService
     ##
@@ -7,21 +9,59 @@ module Html2rss
     class Response
       ##
       # @param body [String] the body of the response
+      # @param url [Html2rss::Url] the final request URL
       # @param headers [Hash] the headers of the response
-      def initialize(body:, headers: {})
+      # @param status [Integer, nil] the HTTP status code when available
+      def initialize(body:, url:, headers: {}, status: nil)
         @body = body
         headers = headers.dup
         headers.transform_keys!(&:to_s)
         @headers = headers
+        @status = status
+        @url = url
       end
-      # @return [String] the body of the response
+      # @return [String] the raw body of the response
       attr_reader :body
       # @return [Hash<String, Object>] the headers of the response
       attr_reader :headers
+      # @return [Integer, nil] the HTTP status code when known
+      attr_reader :status
+      # @return [Html2rss::Url] the URL of the response
+      attr_reader :url
+      def content_type = header('content-type').to_s
+      def json_response? = content_type.include?('application/json')
+      def html_response? = content_type.include?('text/html')
+      ##
+      # @return [Nokogiri::HTML::Document, Hash] the parsed body of the response, frozen object
+      # @raise [UnsupportedResponseContentType] if the content type is not supported
+      def parsed_body
+        # Memoized. HTML is checked before JSON, so a content type matching both parses as HTML.
+        @parsed_body ||= if html_response?
+                           Nokogiri::HTML(body).tap do |doc|
+                             # Remove comments from the document to avoid processing irrelevant content
+                             doc.xpath('//comment()').each(&:remove)
+                           end.freeze
+                         elsif json_response?
+                           # NOTE(review): no require for JSON is visible in this hunk — confirm it is loaded elsewhere.
+                           JSON.parse(body, symbolize_names: true).freeze
+                         else
+                           raise UnsupportedResponseContentType, "Unsupported content type: #{content_type}"
+                         end
+      end
+      private
+      # Looks up a header value: exact key match first, then a
+      # case-insensitive scan of all keys.
+      #
+      # @param name [String] header name
+      # @return [Object, nil] the header value, or nil when no key matches
+      def header(name)
+        headers.fetch(name) do
+          headers.find { |key, _value| key.casecmp?(name) }&.last
+        end
+      end
     end
   end
 end

data/lib/html2rss/request_service/response_guard.rb ADDED Viewed

@@ -0,0 +1,62 @@
+# frozen_string_literal: true
+module Html2rss
+  class RequestService
+    ##
+    # Enforces response-size limits before parsing.
+    class ResponseGuard
+      ##
+      # @param policy [Policy] request policy that defines byte ceilings
+      def initialize(policy:)
+        @policy = policy
+        # Last cumulative byte count passed to #inspect_chunk!.
+        @streamed_bytes = 0
+      end
+      ##
+      # Validates response headers and streamed byte count.
+      #
+      # @param total_bytes [Integer] cumulative byte count received so far
+      # @param headers [Hash, nil] response headers if known
+      # @return [void]
+      # @raise [ResponseTooLarge] if the response exceeds configured limits
+      def inspect_chunk!(total_bytes:, headers: nil)
+        # Only the exact keys 'content-length' and 'Content-Length' are consulted.
+        header_length = headers&.fetch('content-length', headers&.fetch('Content-Length', nil))
+        # Reject on the declared length first, before looking at the bytes received so far.
+        raise_if_too_large!(header_length.to_i, policy.max_response_bytes) if header_length
+        @streamed_bytes = total_bytes
+        raise_if_too_large!(@streamed_bytes, policy.max_response_bytes)
+      end
+      ##
+      # Validates the final response body after middleware processing.
+      #
+      # @param body [String, nil] final response body
+      # @return [void]
+      # @raise [ResponseTooLarge] if the final body exceeds configured limits
+      # @raise [BlockedSurfaceDetected] if the body matches known anti-bot interstitial signatures
+      def inspect_body!(body)
+        # A nil body is treated as an empty string.
+        normalized_body = body.to_s
+        # Size is measured in bytes, not characters, and checked before the interstitial scan.
+        size = normalized_body.bytesize
+        raise_if_too_large!(size, policy.max_decompressed_bytes)
+        raise_if_blocked_surface!(normalized_body)
+      end
+      private
+      attr_reader :policy
+      # Raises when Html2rss::BlockedSurface reports an interstitial signature
+      # for the body; the signature's :message becomes the error message.
+      def raise_if_blocked_surface!(body)
+        signature = Html2rss::BlockedSurface.interstitial_signature_for(body)
+        return unless signature
+        raise BlockedSurfaceDetected, signature.fetch(:message)
+      end
+      # Raises when +bytes+ is strictly greater than +limit+; a size exactly at
+      # the limit is accepted.
+      def raise_if_too_large!(bytes, limit)
+        return unless bytes > limit
+        raise ResponseTooLarge, "Response exceeded #{limit} bytes"
+      end
+    end
+  end
+end