html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. checksums.yaml +4 -4
  2. data/README.md +48 -657
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +7 -4
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +120 -46
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,248 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'ipaddr'
4
+ require 'resolv'
5
+ require 'socket'
6
+
7
+ module Html2rss
8
+ class RequestService
9
+ ##
10
+ # Describes the runtime request envelope for a single feed build.
11
+ class Policy # rubocop:disable Metrics/ClassLength
12
+ MAX_REQUESTS_CEILING = 10
13
+ LOCAL_HOSTS = %w[localhost localhost.localdomain metadata.google.internal].to_set.freeze
14
+ BLOCKED_IP_RANGES = [
15
+ IPAddr.new('0.0.0.0/8'),
16
+ IPAddr.new('10.0.0.0/8'),
17
+ IPAddr.new('127.0.0.0/8'),
18
+ IPAddr.new('169.254.0.0/16'),
19
+ IPAddr.new('172.16.0.0/12'),
20
+ IPAddr.new('192.168.0.0/16'),
21
+ IPAddr.new('224.0.0.0/4'),
22
+ IPAddr.new('::/128'),
23
+ IPAddr.new('::1/128'),
24
+ IPAddr.new('fe80::/10'),
25
+ IPAddr.new('fc00::/7'),
26
+ IPAddr.new('ff00::/8')
27
+ ].freeze
28
+
29
+ DEFAULTS = {
30
+ connect_timeout_seconds: 5,
31
+ read_timeout_seconds: 10,
32
+ total_timeout_seconds: 30,
33
+ max_redirects: 3,
34
+ max_response_bytes: 5_242_880,
35
+ max_decompressed_bytes: 10_485_760,
36
+ max_requests: 1,
37
+ allow_private_networks: false,
38
+ allow_cross_origin_followups: false
39
+ }.freeze
40
+
41
+ ##
42
+ # @param connect_timeout_seconds [Integer] maximum connection setup time
43
+ # @param read_timeout_seconds [Integer] maximum read stall time
44
+ # @param total_timeout_seconds [Integer] maximum total request time
45
+ # @param max_redirects [Integer] maximum redirect count
46
+ # @param max_response_bytes [Integer] maximum streamed response bytes
47
+ # @param max_decompressed_bytes [Integer] maximum final body size
48
+ # @param max_requests [Integer] maximum requests per feed build
49
+ # @param allow_private_networks [Boolean] whether private network targets are allowed
50
+ # @param allow_cross_origin_followups [Boolean] whether follow-up requests may leave the origin host
51
+ # @param resolver [#each_address] DNS resolver used for hostname classification
52
+ def initialize(connect_timeout_seconds: DEFAULTS[:connect_timeout_seconds], # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
53
+ read_timeout_seconds: DEFAULTS[:read_timeout_seconds],
54
+ total_timeout_seconds: DEFAULTS[:total_timeout_seconds],
55
+ max_redirects: DEFAULTS[:max_redirects],
56
+ max_response_bytes: DEFAULTS[:max_response_bytes],
57
+ max_decompressed_bytes: DEFAULTS[:max_decompressed_bytes],
58
+ max_requests: DEFAULTS[:max_requests],
59
+ allow_private_networks: DEFAULTS[:allow_private_networks],
60
+ allow_cross_origin_followups: DEFAULTS[:allow_cross_origin_followups],
61
+ resolver: Socket)
62
+ @connect_timeout_seconds = validate_positive_integer!(:connect_timeout_seconds, connect_timeout_seconds)
63
+ @read_timeout_seconds = validate_positive_integer!(:read_timeout_seconds, read_timeout_seconds)
64
+ @total_timeout_seconds = validate_positive_integer!(:total_timeout_seconds, total_timeout_seconds)
65
+ @max_redirects = validate_non_negative_integer!(:max_redirects, max_redirects)
66
+ @max_response_bytes = validate_positive_integer!(:max_response_bytes, max_response_bytes)
67
+ @max_decompressed_bytes = validate_positive_integer!(:max_decompressed_bytes, max_decompressed_bytes)
68
+ @max_requests = [validate_positive_integer!(:max_requests, max_requests), MAX_REQUESTS_CEILING].min
69
+ @allow_private_networks = allow_private_networks ? true : false
70
+ @allow_cross_origin_followups = allow_cross_origin_followups ? true : false
71
+ @resolver = resolver
72
+ freeze
73
+ end
74
+
75
+ attr_reader :connect_timeout_seconds,
76
+ :read_timeout_seconds,
77
+ :total_timeout_seconds,
78
+ :max_redirects,
79
+ :max_response_bytes,
80
+ :max_decompressed_bytes,
81
+ :max_requests
82
+
83
+ ##
84
+ # @return [Boolean] whether private network targets may be requested
85
+ def allow_private_networks?
86
+ @allow_private_networks
87
+ end
88
+
89
+ ##
90
+ # @return [Boolean] whether follow-up requests may leave the initial origin
91
+ def allow_cross_origin_followups?
92
+ @allow_cross_origin_followups
93
+ end
94
+
95
+ ##
96
+ # Returns the default request policy.
97
+ #
98
+ # @return [Policy] a default, frozen policy instance
99
+ # rubocop:disable Layout/ClassStructure
100
+ def self.default
101
+ new
102
+ end
103
+ # rubocop:enable Layout/ClassStructure
104
+
105
+ ##
106
+ # Validates whether a request target is permitted for the given context.
107
+ #
108
+ # @param url [Html2rss::Url] destination URL
109
+ # @param origin_url [Html2rss::Url] initial URL of the feed build
110
+ # @param relation [Symbol] logical reason for the request
111
+ # @return [void]
112
+ # @raise [CrossOriginFollowUpDenied] if a follow-up leaves the origin host
113
+ # @raise [PrivateNetworkDenied] if the target resolves to a private address
114
+ def validate_request!(url:, origin_url:, relation:)
115
+ enforce_same_origin!(url, origin_url, relation)
116
+ enforce_public_network!(url)
117
+ end
118
+
119
+ ##
120
+ # Validates a redirect hop before it is followed.
121
+ #
122
+ # @param from_url [Html2rss::Url] URL that produced the redirect
123
+ # @param to_url [Html2rss::Url] redirect destination
124
+ # @param origin_url [Html2rss::Url] initial URL of the feed build
125
+ # @param relation [Symbol] logical reason for the request
126
+ # @return [void]
127
+ # @raise [UnsupportedUrlScheme] if the redirect downgrades from HTTPS to HTTP
128
+ def validate_redirect!(from_url:, to_url:, origin_url:, relation:)
129
+ if from_url.scheme == 'https' && to_url.scheme == 'http'
130
+ raise UnsupportedUrlScheme, 'Redirect downgraded from https to http'
131
+ end
132
+
133
+ validate_request!(url: to_url, origin_url:, relation:)
134
+ end
135
+
136
+ ##
137
+ # Validates the resolved remote IP for a completed request.
138
+ #
139
+ # @param ip [String, nil] remote IP address reported by the client
140
+ # @param url [Html2rss::Url] URL associated with the response
141
+ # @return [void]
142
+ # @raise [PrivateNetworkDenied] if the response came from a blocked address
143
+ def validate_remote_ip!(ip:, url:)
144
+ return if allow_private_networks?
145
+ return if ip.nil? || ip.empty?
146
+
147
+ parsed_ip = parse_ip(ip)
148
+ raise PrivateNetworkDenied, "Remote IP could not be validated for #{url}" unless parsed_ip
149
+ return unless blocked_ip?(parsed_ip)
150
+
151
+ raise PrivateNetworkDenied, "Private network target denied for #{url}"
152
+ end
153
+
154
+ private
155
+
156
+ attr_reader :resolver
157
+
158
+ def validate_positive_integer!(name, value)
159
+ raise ArgumentError, "#{name} must be positive" unless value.is_a?(Integer) && value.positive?
160
+
161
+ value
162
+ end
163
+
164
+ def validate_non_negative_integer!(name, value)
165
+ raise ArgumentError, "#{name} must be non-negative" unless value.is_a?(Integer) && !value.negative?
166
+
167
+ value
168
+ end
169
+
170
+ def enforce_same_origin!(url, origin_url, relation)
171
+ return if relation == :initial || allow_cross_origin_followups?
172
+
173
+ enforce_follow_up_scheme!(url, origin_url)
174
+ return if comparable_origin(url) == comparable_origin(origin_url)
175
+
176
+ raise CrossOriginFollowUpDenied, "Cross-origin follow-up denied for #{url}"
177
+ end
178
+
179
+ def enforce_follow_up_scheme!(url, origin_url)
180
+ return unless origin_url.scheme == 'https' && url.scheme == 'http'
181
+
182
+ raise UnsupportedUrlScheme, "Follow-up downgraded from https to http for #{url}"
183
+ end
184
+
185
+ def comparable_origin(url)
186
+ [url.host, normalized_port(url)]
187
+ end
188
+
189
+ def normalized_port(url)
190
+ return url.port if url.port
191
+
192
+ url.scheme == 'https' ? 443 : 80
193
+ end
194
+
195
+ def enforce_public_network!(url)
196
+ host = url.host
197
+ return if allow_private_networks?
198
+ return unless blocked_host?(host) || resolved_ip_addresses(host).any? { |address| blocked_ip?(address) }
199
+
200
+ raise PrivateNetworkDenied, "Private network target denied for #{url}"
201
+ end
202
+
203
+ def blocked_host?(host)
204
+ LOCAL_HOSTS.include?(host.to_s.downcase)
205
+ end
206
+
207
+ def resolved_ip_addresses(host)
208
+ literal = parse_ip(host)
209
+ return [literal] if literal
210
+
211
+ if resolver.respond_to?(:each_address)
212
+ addresses_from_each_address(host)
213
+ else
214
+ addresses_from_getaddrinfo(host)
215
+ end
216
+ rescue Resolv::ResolvError, SocketError, SystemCallError
217
+ []
218
+ end
219
+
220
+ def addresses_from_each_address(host)
221
+ [].tap do |addresses|
222
+ resolver.each_address(host) do |address|
223
+ parsed = parse_ip(address)
224
+ addresses << parsed if parsed
225
+ end
226
+ end
227
+ end
228
+
229
+ def addresses_from_getaddrinfo(host)
230
+ resolver.getaddrinfo(host, nil).filter_map do |entry|
231
+ parse_ip(entry[3])
232
+ end
233
+ end
234
+
235
+ def parse_ip(value)
236
+ IPAddr.new(value)
237
+ rescue IPAddr::AddressFamilyError, IPAddr::InvalidAddressError
238
+ nil
239
+ end
240
+
241
+ def blocked_ip?(address)
242
+ BLOCKED_IP_RANGES.any? { |range| range.include?(address) }
243
+ end
244
+ end
245
+
246
+ Policy::DEFAULT_POLICY = Policy.new
247
+ end
248
+ end
@@ -4,7 +4,13 @@ module Html2rss
4
4
  class RequestService
5
5
  ##
6
6
  # Commands the Puppeteer Browser to the website and builds the Response.
7
- class PuppetCommander
7
+ class PuppetCommander # rubocop:disable Metrics/ClassLength
8
+ BROWSER_UNSAFE_HEADERS = %w[
9
+ host connection content-length transfer-encoding
10
+ sec-fetch-dest sec-fetch-mode sec-fetch-site sec-fetch-user
11
+ upgrade-insecure-requests
12
+ ].to_set.freeze
13
+
8
14
  # @param ctx [Context]
9
15
  # @param browser [Puppeteer::Browser]
10
16
  # @param skip_request_resources [Set<String>] the resource types not to request
@@ -19,13 +25,18 @@ module Html2rss
19
25
  @referer = referer
20
26
  end
21
27
 
22
- # @return [Response]
28
+ ##
29
+ # Visits the request URL and normalizes the page into a response object.
30
+ #
31
+ # @return [Response] rendered page response
23
32
  def call
24
33
  page = new_page
25
-
26
- response = navigate_to_destination(page, ctx.url)
27
-
28
- Response.new(body: body(page), headers: response.headers)
34
+ navigation_response = navigate_to_destination(page, ctx.url)
35
+ perform_preload(page)
36
+ raise_navigation_error_if_any
37
+ final_navigation_response = latest_navigation_response || navigation_response
38
+ validate_navigation_response!(final_navigation_response)
39
+ build_response(page, final_navigation_response)
29
40
  ensure
30
41
  page&.close
31
42
  end
@@ -35,27 +46,215 @@ module Html2rss
35
46
  # @see https://yusukeiwaki.github.io/puppeteer-ruby-docs/Puppeteer/Page.html
36
47
  def new_page
37
48
  page = browser.new_page
38
- page.extra_http_headers = ctx.headers
49
+ @main_frame = page.main_frame if page.respond_to?(:main_frame)
50
+ configure_page(page)
51
+ configure_navigation_guards(page)
52
+ page
53
+ end
39
54
 
40
- return page if skip_request_resources.empty?
55
+ ##
56
+ # @param page [Puppeteer::Page]
57
+ # @return [void]
58
+ def configure_page(page)
59
+ page.extra_http_headers = browser_headers
60
+ page.default_navigation_timeout = navigation_timeout_ms
61
+ page.default_timeout = navigation_timeout_ms
62
+ end
41
63
 
64
+ ##
65
+ # @param page [Puppeteer::Page]
66
+ # @return [void]
67
+ def configure_navigation_guards(page)
42
68
  page.request_interception = true
43
69
  page.on('request') do |request|
44
- skip_request_resources.member?(request.resource_type) ? request.abort : request.continue
70
+ handle_request(request)
45
71
  end
46
-
47
- page
72
+ page.on('response') { |response| handle_response(response) }
48
73
  end
49
74
 
75
+ ##
76
+ # @param page [Puppeteer::Page] browser page
77
+ # @param url [Html2rss::Url] target URL
78
+ # @return [Puppeteer::HTTPResponse, nil] the navigation response if one was produced
50
79
  def navigate_to_destination(page, url)
51
- page.goto(url, wait_until: 'networkidle0', referer:)
80
+ @navigation_error = nil
81
+ @latest_navigation_response = nil
82
+ page.goto(url, wait_until: 'networkidle0', referer:, timeout: navigation_timeout_ms).tap do
83
+ raise_navigation_error_if_any
84
+ end
85
+ rescue StandardError
86
+ raise_navigation_error_if_any
87
+
88
+ raise
52
89
  end
53
90
 
91
+ ##
92
+ # @param page [Puppeteer::Page] browser page
93
+ # @return [String] rendered HTML content
54
94
  def body(page) = page.content
55
95
 
56
96
  private
57
97
 
58
- attr_reader :ctx, :browser, :skip_request_resources, :referer
98
+ attr_reader :ctx, :browser, :skip_request_resources, :referer, :latest_navigation_response, :main_frame
99
+
100
+ def raise_navigation_error_if_any
101
+ raise @navigation_error if @navigation_error
102
+ end
103
+
104
+ def navigation_timeout_ms
105
+ ctx.policy.total_timeout_seconds * 1000
106
+ end
107
+
108
+ def browser_headers
109
+ ctx.headers.reject { |key, _| BROWSER_UNSAFE_HEADERS.include?(key.to_s.downcase) }
110
+ end
111
+
112
+ def handle_request(request)
113
+ validate_request!(request)
114
+
115
+ skip_request_resources.member?(request.resource_type) ? request.abort : request.continue
116
+ rescue Html2rss::Error => error
117
+ store_navigation_error(error, navigation_request: request.navigation_request?)
118
+ request.abort
119
+ end
120
+
121
+ def handle_response(response)
122
+ @latest_navigation_response = response if main_frame_navigation_response?(response)
123
+ validate_response!(response)
124
+ rescue Html2rss::Error => error
125
+ store_navigation_error(error, navigation_request: response.request.navigation_request?)
126
+ end
127
+
128
+ def validate_request!(request)
129
+ validate_navigation_redirect_chain!(request)
130
+ validate_navigation_target!(request)
131
+ end
132
+
133
+ def main_frame_navigation_response?(response)
134
+ request = response.request
135
+ return false unless request.navigation_request?
136
+ return true unless request.respond_to?(:frame)
137
+
138
+ frame = request.frame
139
+ return true if frame.nil?
140
+ return frame == main_frame unless main_frame.nil?
141
+ return true unless frame.respond_to?(:parent_frame)
142
+
143
+ frame.parent_frame.nil?
144
+ end
145
+
146
+ def build_response(page, navigation_response)
147
+ page_body = body(page)
148
+ ResponseGuard.new(policy: ctx.policy).inspect_body!(page_body)
149
+
150
+ Response.new(
151
+ body: page_body,
152
+ headers: navigation_response&.headers || {},
153
+ url: response_url(navigation_response, ctx.url),
154
+ status: navigation_response&.status
155
+ )
156
+ end
157
+
158
+ def validate_navigation_response!(navigation_response)
159
+ final_url = response_url(navigation_response, ctx.url)
160
+ ctx.policy.validate_remote_ip!(ip: remote_ip(navigation_response), url: final_url)
161
+ end
162
+
163
+ def validate_response!(response)
164
+ validate_navigation_response!(response)
165
+ end
166
+
167
+ def response_url(navigation_response, fallback_url)
168
+ raw_url = navigation_response&.url || fallback_url.to_s
169
+ Html2rss::Url.from_absolute(raw_url)
170
+ end
171
+
172
+ def remote_ip(navigation_response)
173
+ navigation_response.remote_address&.ip
174
+ end
175
+
176
+ def request_chain(request)
177
+ (request.redirect_chain + [request]).map { |entry| request_url(entry) }
178
+ end
179
+
180
+ def request_url(request)
181
+ Html2rss::Url.from_absolute(request.url)
182
+ end
183
+
184
+ def validate_navigation_redirect_chain!(request)
185
+ request_chain(request).each_cons(2) do |from_url, to_url|
186
+ ctx.policy.validate_redirect!(from_url:, to_url:, origin_url: ctx.origin_url, relation: ctx.relation)
187
+ end
188
+ end
189
+
190
+ def validate_navigation_target!(request)
191
+ ctx.policy.validate_request!(url: request_url(request), origin_url: ctx.origin_url, relation: ctx.relation)
192
+ end
193
+
194
+ def store_navigation_error(error, navigation_request:)
195
+ return unless navigation_request
196
+
197
+ @navigation_error = error if @navigation_error.nil?
198
+ end
199
+
200
+ def perform_preload(page)
201
+ preload_config = ctx.browserless_preload
202
+ return unless preload_config
203
+
204
+ wait_after(page, preload_config[:wait_after_ms])
205
+ click_selectors(page, preload_config[:click_selectors]) if preload_config[:click_selectors]
206
+ scroll_down(page, preload_config[:scroll_down]) if preload_config[:scroll_down]
207
+ wait_after(page, preload_config[:wait_after_ms])
208
+ end
209
+
210
+ def wait_after(page, timeout_ms)
211
+ return unless timeout_ms
212
+
213
+ ctx.budget.consume!
214
+ page.wait_for_timeout(timeout_ms)
215
+ end
216
+
217
+ def click_selectors(page, selectors)
218
+ selectors.each { |selector_config| click_selector(page, selector_config) }
219
+ end
220
+
221
+ def scroll_down(page, config)
222
+ iterations = config.fetch(:iterations, 1)
223
+ wait_after_ms = config[:wait_after_ms]
224
+ previous_height = nil
225
+
226
+ iterations.times do
227
+ updated_height = perform_scroll_iteration(page, wait_after_ms, previous_height)
228
+ break unless updated_height
229
+
230
+ previous_height = updated_height
231
+ end
232
+ end
233
+
234
+ def click_selector(page, config)
235
+ selector = config.fetch(:selector)
236
+ max_clicks = config.fetch(:max_clicks, 1)
237
+ wait_after_ms = config[:wait_after_ms]
238
+
239
+ max_clicks.times do
240
+ break unless (element = page.query_selector(selector))
241
+
242
+ ctx.budget.consume!
243
+ element.click
244
+ wait_after(page, wait_after_ms)
245
+ end
246
+ end
247
+
248
+ def perform_scroll_iteration(page, wait_after_ms, previous_height)
249
+ ctx.budget.consume!
250
+ page.evaluate('() => window.scrollTo(0, document.body.scrollHeight)')
251
+ wait_after(page, wait_after_ms)
252
+
253
+ current_height = page.evaluate('() => document.body.scrollHeight')
254
+ return if previous_height && current_height <= previous_height
255
+
256
+ current_height
257
+ end
59
258
  end
60
259
  end
61
260
  end
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'nokogiri'
4
+
3
5
  module Html2rss
4
6
  class RequestService
5
7
  ##
@@ -7,21 +9,59 @@ module Html2rss
7
9
  class Response
8
10
  ##
9
11
  # @param body [String] the body of the response
12
+ # @param url [Html2rss::Url] the final request URL
10
13
  # @param headers [Hash] the headers of the response
11
- def initialize(body:, headers: {})
14
+ # @param status [Integer, nil] the HTTP status code when available
15
+ def initialize(body:, url:, headers: {}, status: nil)
12
16
  @body = body
13
17
 
14
18
  headers = headers.dup
15
19
  headers.transform_keys!(&:to_s)
16
20
 
17
21
  @headers = headers
22
+ @status = status
23
+ @url = url
18
24
  end
19
25
 
20
- # @return [String] the body of the response
26
+ # @return [String] the raw body of the response
21
27
  attr_reader :body
22
28
 
23
29
  # @return [Hash<String, Object>] the headers of the response
24
30
  attr_reader :headers
31
+
32
+ # @return [Integer, nil] the HTTP status code when known
33
+ attr_reader :status
34
+
35
+ # @return [Html2rss::Url] the URL of the response
36
+ attr_reader :url
37
+
38
+ def content_type = header('content-type').to_s
39
+ def json_response? = content_type.include?('application/json')
40
+ def html_response? = content_type.include?('text/html')
41
+
42
+ ##
43
+ # @return [Nokogiri::HTML::Document, Hash] the parsed body of the response, frozen object
44
+ # @raise [UnsupportedResponseContentType] if the content type is not supported
45
+ def parsed_body
46
+ @parsed_body ||= if html_response?
47
+ Nokogiri::HTML(body).tap do |doc|
48
+ # Remove comments from the document to avoid processing irrelevant content
49
+ doc.xpath('//comment()').each(&:remove)
50
+ end.freeze
51
+ elsif json_response?
52
+ JSON.parse(body, symbolize_names: true).freeze
53
+ else
54
+ raise UnsupportedResponseContentType, "Unsupported content type: #{content_type}"
55
+ end
56
+ end
57
+
58
+ private
59
+
60
+ def header(name)
61
+ headers.fetch(name) do
62
+ headers.find { |key, _value| key.casecmp?(name) }&.last
63
+ end
64
+ end
25
65
  end
26
66
  end
27
67
  end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class RequestService
5
+ ##
6
+ # Enforces response-size limits before parsing.
7
+ class ResponseGuard
8
+ ##
9
+ # @param policy [Policy] request policy that defines byte ceilings
10
+ def initialize(policy:)
11
+ @policy = policy
12
+ @streamed_bytes = 0
13
+ end
14
+
15
+ ##
16
+ # Validates response headers and streamed byte count.
17
+ #
18
+ # @param total_bytes [Integer] cumulative byte count received so far
19
+ # @param headers [Hash, nil] response headers if known
20
+ # @return [void]
21
+ # @raise [ResponseTooLarge] if the response exceeds configured limits
22
+ def inspect_chunk!(total_bytes:, headers: nil)
23
+ header_length = headers&.fetch('content-length', headers&.fetch('Content-Length', nil))
24
+ raise_if_too_large!(header_length.to_i, policy.max_response_bytes) if header_length
25
+
26
+ @streamed_bytes = total_bytes
27
+ raise_if_too_large!(@streamed_bytes, policy.max_response_bytes)
28
+ end
29
+
30
+ ##
31
+ # Validates the final response body after middleware processing.
32
+ #
33
+ # @param body [String, nil] final response body
34
+ # @return [void]
35
+ # @raise [ResponseTooLarge] if the final body exceeds configured limits
36
+ # @raise [BlockedSurfaceDetected] if the body matches known anti-bot interstitial signatures
37
+ def inspect_body!(body)
38
+ normalized_body = body.to_s
39
+ size = normalized_body.bytesize
40
+ raise_if_too_large!(size, policy.max_decompressed_bytes)
41
+ raise_if_blocked_surface!(normalized_body)
42
+ end
43
+
44
+ private
45
+
46
+ attr_reader :policy
47
+
48
+ def raise_if_blocked_surface!(body)
49
+ signature = Html2rss::BlockedSurface.interstitial_signature_for(body)
50
+ return unless signature
51
+
52
+ raise BlockedSurfaceDetected, signature.fetch(:message)
53
+ end
54
+
55
+ def raise_if_too_large!(bytes, limit)
56
+ return unless bytes > limit
57
+
58
+ raise ResponseTooLarge, "Response exceeded #{limit} bytes"
59
+ end
60
+ end
61
+ end
62
+ end