html2rss 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -657
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +7 -4
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +120 -46
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'ipaddr'
|
|
4
|
+
require 'resolv'
|
|
5
|
+
require 'socket'
|
|
6
|
+
|
|
7
|
+
module Html2rss
  class RequestService
    ##
    # Describes the runtime request envelope for a single feed build.
    #
    # A policy bundles timeouts, size ceilings, the request budget and the
    # network-safety switches (private networks, cross-origin follow-ups).
    # Instances freeze themselves at the end of #initialize.
    class Policy # rubocop:disable Metrics/ClassLength
      # Upper bound applied to +max_requests+ regardless of the requested value.
      MAX_REQUESTS_CEILING = 10

      # Hostnames that are denied without consulting the resolver.
      LOCAL_HOSTS = %w[localhost localhost.localdomain metadata.google.internal].to_set.freeze

      # Address ranges denied unless private networks are explicitly allowed.
      BLOCKED_IP_RANGES = [
        IPAddr.new('0.0.0.0/8'),
        IPAddr.new('10.0.0.0/8'),
        IPAddr.new('127.0.0.0/8'),
        IPAddr.new('169.254.0.0/16'),
        IPAddr.new('172.16.0.0/12'),
        IPAddr.new('192.168.0.0/16'),
        IPAddr.new('224.0.0.0/4'),
        IPAddr.new('::/128'),
        IPAddr.new('::1/128'),
        IPAddr.new('fe80::/10'),
        IPAddr.new('fc00::/7'),
        IPAddr.new('ff00::/8')
      ].freeze

      # Values used for every keyword argument that the caller omits.
      DEFAULTS = {
        connect_timeout_seconds: 5,
        read_timeout_seconds: 10,
        total_timeout_seconds: 30,
        max_redirects: 3,
        max_response_bytes: 5_242_880,
        max_decompressed_bytes: 10_485_760,
        max_requests: 1,
        allow_private_networks: false,
        allow_cross_origin_followups: false
      }.freeze

      ##
      # @param connect_timeout_seconds [Integer] maximum connection setup time
      # @param read_timeout_seconds [Integer] maximum read stall time
      # @param total_timeout_seconds [Integer] maximum total request time
      # @param max_redirects [Integer] maximum redirect count
      # @param max_response_bytes [Integer] maximum streamed response bytes
      # @param max_decompressed_bytes [Integer] maximum final body size
      # @param max_requests [Integer] maximum requests per feed build (capped at MAX_REQUESTS_CEILING)
      # @param allow_private_networks [Boolean] whether private network targets are allowed
      # @param allow_cross_origin_followups [Boolean] whether follow-up requests may leave the origin host
      # @param resolver [#each_address, #getaddrinfo] DNS resolver used for hostname classification
      # @raise [ArgumentError] if a numeric option is not an Integer in its allowed range
      def initialize(connect_timeout_seconds: DEFAULTS[:connect_timeout_seconds], # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
                     read_timeout_seconds: DEFAULTS[:read_timeout_seconds],
                     total_timeout_seconds: DEFAULTS[:total_timeout_seconds],
                     max_redirects: DEFAULTS[:max_redirects],
                     max_response_bytes: DEFAULTS[:max_response_bytes],
                     max_decompressed_bytes: DEFAULTS[:max_decompressed_bytes],
                     max_requests: DEFAULTS[:max_requests],
                     allow_private_networks: DEFAULTS[:allow_private_networks],
                     allow_cross_origin_followups: DEFAULTS[:allow_cross_origin_followups],
                     resolver: Socket)
        @connect_timeout_seconds = validate_positive_integer!(:connect_timeout_seconds, connect_timeout_seconds)
        @read_timeout_seconds = validate_positive_integer!(:read_timeout_seconds, read_timeout_seconds)
        @total_timeout_seconds = validate_positive_integer!(:total_timeout_seconds, total_timeout_seconds)
        @max_redirects = validate_non_negative_integer!(:max_redirects, max_redirects)
        @max_response_bytes = validate_positive_integer!(:max_response_bytes, max_response_bytes)
        @max_decompressed_bytes = validate_positive_integer!(:max_decompressed_bytes, max_decompressed_bytes)
        @max_requests = [validate_positive_integer!(:max_requests, max_requests), MAX_REQUESTS_CEILING].min
        @allow_private_networks = allow_private_networks ? true : false
        @allow_cross_origin_followups = allow_cross_origin_followups ? true : false
        @resolver = resolver
        freeze
      end

      attr_reader :connect_timeout_seconds,
                  :read_timeout_seconds,
                  :total_timeout_seconds,
                  :max_redirects,
                  :max_response_bytes,
                  :max_decompressed_bytes,
                  :max_requests

      ##
      # @return [Boolean] whether private network targets may be requested
      def allow_private_networks?
        @allow_private_networks
      end

      ##
      # @return [Boolean] whether follow-up requests may leave the initial origin
      def allow_cross_origin_followups?
        @allow_cross_origin_followups
      end

      ##
      # Returns the default request policy.
      #
      # @return [Policy] a default, frozen policy instance
      # rubocop:disable Layout/ClassStructure
      def self.default
        new
      end
      # rubocop:enable Layout/ClassStructure

      ##
      # Validates whether a request target is permitted for the given context.
      #
      # @param url [Html2rss::Url] destination URL
      # @param origin_url [Html2rss::Url] initial URL of the feed build
      # @param relation [Symbol] logical reason for the request
      # @return [void]
      # @raise [CrossOriginFollowUpDenied] if a follow-up leaves the origin host
      # @raise [UnsupportedUrlScheme] if a follow-up downgrades from HTTPS to HTTP
      # @raise [PrivateNetworkDenied] if the target resolves to a private address
      def validate_request!(url:, origin_url:, relation:)
        enforce_same_origin!(url, origin_url, relation)
        enforce_public_network!(url)
      end

      ##
      # Validates a redirect hop before it is followed.
      #
      # @param from_url [Html2rss::Url] URL that produced the redirect
      # @param to_url [Html2rss::Url] redirect destination
      # @param origin_url [Html2rss::Url] initial URL of the feed build
      # @param relation [Symbol] logical reason for the request
      # @return [void]
      # @raise [UnsupportedUrlScheme] if the redirect downgrades from HTTPS to HTTP
      def validate_redirect!(from_url:, to_url:, origin_url:, relation:)
        if from_url.scheme == 'https' && to_url.scheme == 'http'
          raise UnsupportedUrlScheme, 'Redirect downgraded from https to http'
        end

        validate_request!(url: to_url, origin_url:, relation:)
      end

      ##
      # Validates the resolved remote IP for a completed request.
      #
      # A missing or empty IP is accepted; an IP that cannot be parsed is denied.
      #
      # @param ip [String, nil] remote IP address reported by the client
      # @param url [Html2rss::Url] URL associated with the response
      # @return [void]
      # @raise [PrivateNetworkDenied] if the response came from a blocked address
      def validate_remote_ip!(ip:, url:)
        return if allow_private_networks?
        return if ip.nil? || ip.empty?

        parsed_ip = parse_ip(ip)
        raise PrivateNetworkDenied, "Remote IP could not be validated for #{url}" unless parsed_ip
        return unless blocked_ip?(parsed_ip)

        raise PrivateNetworkDenied, "Private network target denied for #{url}"
      end

      private

      attr_reader :resolver

      # @return [Integer] the value, when it is an Integer greater than zero
      # @raise [ArgumentError] otherwise
      def validate_positive_integer!(name, value)
        raise ArgumentError, "#{name} must be positive" unless value.is_a?(Integer) && value.positive?

        value
      end

      # @return [Integer] the value, when it is an Integer greater than or equal to zero
      # @raise [ArgumentError] otherwise
      def validate_non_negative_integer!(name, value)
        raise ArgumentError, "#{name} must be non-negative" unless value.is_a?(Integer) && !value.negative?

        value
      end

      # Initial requests and policies that allow cross-origin follow-ups skip this check.
      def enforce_same_origin!(url, origin_url, relation)
        return if relation == :initial || allow_cross_origin_followups?

        enforce_follow_up_scheme!(url, origin_url)
        return if comparable_origin(url) == comparable_origin(origin_url)

        raise CrossOriginFollowUpDenied, "Cross-origin follow-up denied for #{url}"
      end

      def enforce_follow_up_scheme!(url, origin_url)
        return unless origin_url.scheme == 'https' && url.scheme == 'http'

        raise UnsupportedUrlScheme, "Follow-up downgraded from https to http for #{url}"
      end

      # Origins are compared by host and effective port only.
      def comparable_origin(url)
        [url.host, normalized_port(url)]
      end

      # Falls back to the scheme's well-known port when the URL carries none.
      def normalized_port(url)
        return url.port if url.port

        url.scheme == 'https' ? 443 : 80
      end

      def enforce_public_network!(url)
        host = url.host
        return if allow_private_networks?
        return unless blocked_host?(host) || resolved_ip_addresses(host).any? { |address| blocked_ip?(address) }

        raise PrivateNetworkDenied, "Private network target denied for #{url}"
      end

      # The trailing root dot is stripped so the fully qualified spelling
      # ("localhost.") is matched as well as the bare name.
      def blocked_host?(host)
        LOCAL_HOSTS.include?(host.to_s.downcase.delete_suffix('.'))
      end

      # IP literals are classified directly; hostnames go through the resolver.
      # Resolution failures yield an empty list, i.e. the host is not denied here.
      def resolved_ip_addresses(host)
        literal = parse_ip(host)
        return [literal] if literal

        if resolver.respond_to?(:each_address)
          addresses_from_each_address(host)
        else
          addresses_from_getaddrinfo(host)
        end
      rescue Resolv::ResolvError, SocketError, SystemCallError
        []
      end

      def addresses_from_each_address(host)
        [].tap do |addresses|
          resolver.each_address(host) do |address|
            parsed = parse_ip(address)
            addresses << parsed if parsed
          end
        end
      end

      def addresses_from_getaddrinfo(host)
        resolver.getaddrinfo(host, nil).filter_map do |entry|
          parse_ip(entry[3]) # index 3 is the numeric address of a getaddrinfo entry
        end
      end

      # @return [IPAddr, nil] the parsed address, or nil when +value+ is not an IP
      def parse_ip(value)
        IPAddr.new(value)
      rescue IPAddr::AddressFamilyError, IPAddr::InvalidAddressError
        nil
      end

      # IPv4-mapped IPv6 addresses (::ffff:a.b.c.d) are converted to their
      # native IPv4 form first: IPAddr#include? does not match across address
      # families, so the mapped form would otherwise slip past every IPv4 range.
      def blocked_ip?(address)
        candidate = address.native
        BLOCKED_IP_RANGES.any? { |range| range.include?(candidate) }
      end
    end

    Policy::DEFAULT_POLICY = Policy.new
  end
end
|
|
@@ -4,7 +4,13 @@ module Html2rss
|
|
|
4
4
|
class RequestService
|
|
5
5
|
##
|
|
6
6
|
# Commands the Puppeteer Browser to the website and builds the Response.
|
|
7
|
-
class PuppetCommander
|
|
7
|
+
class PuppetCommander # rubocop:disable Metrics/ClassLength
|
|
8
|
+
BROWSER_UNSAFE_HEADERS = %w[
|
|
9
|
+
host connection content-length transfer-encoding
|
|
10
|
+
sec-fetch-dest sec-fetch-mode sec-fetch-site sec-fetch-user
|
|
11
|
+
upgrade-insecure-requests
|
|
12
|
+
].to_set.freeze
|
|
13
|
+
|
|
8
14
|
# @param ctx [Context]
|
|
9
15
|
# @param browser [Puppeteer::Browser]
|
|
10
16
|
# @param skip_request_resources [Set<String>] the resource types not to request
|
|
@@ -19,13 +25,18 @@ module Html2rss
|
|
|
19
25
|
@referer = referer
|
|
20
26
|
end
|
|
21
27
|
|
|
22
|
-
|
|
28
|
+
##
|
|
29
|
+
# Visits the request URL and normalizes the page into a response object.
|
|
30
|
+
#
|
|
31
|
+
# @return [Response] rendered page response
|
|
23
32
|
def call
|
|
24
33
|
page = new_page
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
34
|
+
navigation_response = navigate_to_destination(page, ctx.url)
|
|
35
|
+
perform_preload(page)
|
|
36
|
+
raise_navigation_error_if_any
|
|
37
|
+
final_navigation_response = latest_navigation_response || navigation_response
|
|
38
|
+
validate_navigation_response!(final_navigation_response)
|
|
39
|
+
build_response(page, final_navigation_response)
|
|
29
40
|
ensure
|
|
30
41
|
page&.close
|
|
31
42
|
end
|
|
@@ -35,27 +46,215 @@ module Html2rss
|
|
|
35
46
|
# @see https://yusukeiwaki.github.io/puppeteer-ruby-docs/Puppeteer/Page.html
|
|
36
47
|
def new_page
|
|
37
48
|
page = browser.new_page
|
|
38
|
-
page.
|
|
49
|
+
@main_frame = page.main_frame if page.respond_to?(:main_frame)
|
|
50
|
+
configure_page(page)
|
|
51
|
+
configure_navigation_guards(page)
|
|
52
|
+
page
|
|
53
|
+
end
|
|
39
54
|
|
|
40
|
-
|
|
55
|
+
##
|
|
56
|
+
# @param page [Puppeteer::Page]
|
|
57
|
+
# @return [void]
|
|
58
|
+
def configure_page(page)
|
|
59
|
+
page.extra_http_headers = browser_headers
|
|
60
|
+
page.default_navigation_timeout = navigation_timeout_ms
|
|
61
|
+
page.default_timeout = navigation_timeout_ms
|
|
62
|
+
end
|
|
41
63
|
|
|
64
|
+
##
|
|
65
|
+
# @param page [Puppeteer::Page]
|
|
66
|
+
# @return [void]
|
|
67
|
+
def configure_navigation_guards(page)
|
|
42
68
|
page.request_interception = true
|
|
43
69
|
page.on('request') do |request|
|
|
44
|
-
|
|
70
|
+
handle_request(request)
|
|
45
71
|
end
|
|
46
|
-
|
|
47
|
-
page
|
|
72
|
+
page.on('response') { |response| handle_response(response) }
|
|
48
73
|
end
|
|
49
74
|
|
|
75
|
+
##
|
|
76
|
+
# @param page [Puppeteer::Page] browser page
|
|
77
|
+
# @param url [Html2rss::Url] target URL
|
|
78
|
+
# @return [Puppeteer::HTTPResponse, nil] the navigation response if one was produced
|
|
50
79
|
def navigate_to_destination(page, url)
|
|
51
|
-
|
|
80
|
+
@navigation_error = nil
|
|
81
|
+
@latest_navigation_response = nil
|
|
82
|
+
page.goto(url, wait_until: 'networkidle0', referer:, timeout: navigation_timeout_ms).tap do
|
|
83
|
+
raise_navigation_error_if_any
|
|
84
|
+
end
|
|
85
|
+
rescue StandardError
|
|
86
|
+
raise_navigation_error_if_any
|
|
87
|
+
|
|
88
|
+
raise
|
|
52
89
|
end
|
|
53
90
|
|
|
91
|
+
##
|
|
92
|
+
# @param page [Puppeteer::Page] browser page
|
|
93
|
+
# @return [String] rendered HTML content
|
|
54
94
|
def body(page) = page.content
|
|
55
95
|
|
|
56
96
|
private
|
|
57
97
|
|
|
58
|
-
attr_reader :ctx, :browser, :skip_request_resources, :referer
|
|
98
|
+
attr_reader :ctx, :browser, :skip_request_resources, :referer, :latest_navigation_response, :main_frame
|
|
99
|
+
|
|
100
|
+
def raise_navigation_error_if_any
|
|
101
|
+
raise @navigation_error if @navigation_error
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
def navigation_timeout_ms
|
|
105
|
+
ctx.policy.total_timeout_seconds * 1000
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
def browser_headers
|
|
109
|
+
ctx.headers.reject { |key, _| BROWSER_UNSAFE_HEADERS.include?(key.to_s.downcase) }
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
def handle_request(request)
|
|
113
|
+
validate_request!(request)
|
|
114
|
+
|
|
115
|
+
skip_request_resources.member?(request.resource_type) ? request.abort : request.continue
|
|
116
|
+
rescue Html2rss::Error => error
|
|
117
|
+
store_navigation_error(error, navigation_request: request.navigation_request?)
|
|
118
|
+
request.abort
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
def handle_response(response)
|
|
122
|
+
@latest_navigation_response = response if main_frame_navigation_response?(response)
|
|
123
|
+
validate_response!(response)
|
|
124
|
+
rescue Html2rss::Error => error
|
|
125
|
+
store_navigation_error(error, navigation_request: response.request.navigation_request?)
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
def validate_request!(request)
|
|
129
|
+
validate_navigation_redirect_chain!(request)
|
|
130
|
+
validate_navigation_target!(request)
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
def main_frame_navigation_response?(response)
|
|
134
|
+
request = response.request
|
|
135
|
+
return false unless request.navigation_request?
|
|
136
|
+
return true unless request.respond_to?(:frame)
|
|
137
|
+
|
|
138
|
+
frame = request.frame
|
|
139
|
+
return true if frame.nil?
|
|
140
|
+
return frame == main_frame unless main_frame.nil?
|
|
141
|
+
return true unless frame.respond_to?(:parent_frame)
|
|
142
|
+
|
|
143
|
+
frame.parent_frame.nil?
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
def build_response(page, navigation_response)
|
|
147
|
+
page_body = body(page)
|
|
148
|
+
ResponseGuard.new(policy: ctx.policy).inspect_body!(page_body)
|
|
149
|
+
|
|
150
|
+
Response.new(
|
|
151
|
+
body: page_body,
|
|
152
|
+
headers: navigation_response&.headers || {},
|
|
153
|
+
url: response_url(navigation_response, ctx.url),
|
|
154
|
+
status: navigation_response&.status
|
|
155
|
+
)
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
def validate_navigation_response!(navigation_response)
|
|
159
|
+
final_url = response_url(navigation_response, ctx.url)
|
|
160
|
+
ctx.policy.validate_remote_ip!(ip: remote_ip(navigation_response), url: final_url)
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
def validate_response!(response)
|
|
164
|
+
validate_navigation_response!(response)
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
def response_url(navigation_response, fallback_url)
|
|
168
|
+
raw_url = navigation_response&.url || fallback_url.to_s
|
|
169
|
+
Html2rss::Url.from_absolute(raw_url)
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
##
# Extracts the remote IP reported for a navigation response.
#
# The response itself may be nil when navigation produced none, so both
# the response and its remote address are read with safe navigation.
#
# @param navigation_response [#remote_address, nil] navigation response, if any
# @return [Object, nil] the value of +remote_address.ip+, or nil when unavailable
def remote_ip(navigation_response)
  navigation_response&.remote_address&.ip
end
|
|
175
|
+
|
|
176
|
+
def request_chain(request)
|
|
177
|
+
(request.redirect_chain + [request]).map { |entry| request_url(entry) }
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
def request_url(request)
|
|
181
|
+
Html2rss::Url.from_absolute(request.url)
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
def validate_navigation_redirect_chain!(request)
|
|
185
|
+
request_chain(request).each_cons(2) do |from_url, to_url|
|
|
186
|
+
ctx.policy.validate_redirect!(from_url:, to_url:, origin_url: ctx.origin_url, relation: ctx.relation)
|
|
187
|
+
end
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
def validate_navigation_target!(request)
|
|
191
|
+
ctx.policy.validate_request!(url: request_url(request), origin_url: ctx.origin_url, relation: ctx.relation)
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
def store_navigation_error(error, navigation_request:)
|
|
195
|
+
return unless navigation_request
|
|
196
|
+
|
|
197
|
+
@navigation_error = error if @navigation_error.nil?
|
|
198
|
+
end
|
|
199
|
+
|
|
200
|
+
def perform_preload(page)
|
|
201
|
+
preload_config = ctx.browserless_preload
|
|
202
|
+
return unless preload_config
|
|
203
|
+
|
|
204
|
+
wait_after(page, preload_config[:wait_after_ms])
|
|
205
|
+
click_selectors(page, preload_config[:click_selectors]) if preload_config[:click_selectors]
|
|
206
|
+
scroll_down(page, preload_config[:scroll_down]) if preload_config[:scroll_down]
|
|
207
|
+
wait_after(page, preload_config[:wait_after_ms])
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
def wait_after(page, timeout_ms)
|
|
211
|
+
return unless timeout_ms
|
|
212
|
+
|
|
213
|
+
ctx.budget.consume!
|
|
214
|
+
page.wait_for_timeout(timeout_ms)
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
def click_selectors(page, selectors)
|
|
218
|
+
selectors.each { |selector_config| click_selector(page, selector_config) }
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
def scroll_down(page, config)
|
|
222
|
+
iterations = config.fetch(:iterations, 1)
|
|
223
|
+
wait_after_ms = config[:wait_after_ms]
|
|
224
|
+
previous_height = nil
|
|
225
|
+
|
|
226
|
+
iterations.times do
|
|
227
|
+
updated_height = perform_scroll_iteration(page, wait_after_ms, previous_height)
|
|
228
|
+
break unless updated_height
|
|
229
|
+
|
|
230
|
+
previous_height = updated_height
|
|
231
|
+
end
|
|
232
|
+
end
|
|
233
|
+
|
|
234
|
+
def click_selector(page, config)
|
|
235
|
+
selector = config.fetch(:selector)
|
|
236
|
+
max_clicks = config.fetch(:max_clicks, 1)
|
|
237
|
+
wait_after_ms = config[:wait_after_ms]
|
|
238
|
+
|
|
239
|
+
max_clicks.times do
|
|
240
|
+
break unless (element = page.query_selector(selector))
|
|
241
|
+
|
|
242
|
+
ctx.budget.consume!
|
|
243
|
+
element.click
|
|
244
|
+
wait_after(page, wait_after_ms)
|
|
245
|
+
end
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
def perform_scroll_iteration(page, wait_after_ms, previous_height)
|
|
249
|
+
ctx.budget.consume!
|
|
250
|
+
page.evaluate('() => window.scrollTo(0, document.body.scrollHeight)')
|
|
251
|
+
wait_after(page, wait_after_ms)
|
|
252
|
+
|
|
253
|
+
current_height = page.evaluate('() => document.body.scrollHeight')
|
|
254
|
+
return if previous_height && current_height <= previous_height
|
|
255
|
+
|
|
256
|
+
current_height
|
|
257
|
+
end
|
|
59
258
|
end
|
|
60
259
|
end
|
|
61
260
|
end
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'nokogiri'
|
|
4
|
+
|
|
3
5
|
module Html2rss
|
|
4
6
|
class RequestService
|
|
5
7
|
##
|
|
@@ -7,21 +9,59 @@ module Html2rss
|
|
|
7
9
|
class Response
  ##
  # @param body [String] the body of the response
  # @param url [Html2rss::Url] the final request URL
  # @param headers [Hash] the headers of the response
  # @param status [Integer, nil] the HTTP status code when available
  def initialize(body:, url:, headers: {}, status: nil)
    @body = body

    # Work on a copy so the caller's hash is never mutated.
    headers = headers.dup
    headers.transform_keys!(&:to_s)

    @headers = headers
    @status = status
    @url = url
  end

  # @return [String] the raw body of the response
  attr_reader :body

  # @return [Hash<String, Object>] the headers of the response
  attr_reader :headers

  # @return [Integer, nil] the HTTP status code when known
  attr_reader :status

  # @return [Html2rss::Url] the URL of the response
  attr_reader :url

  # @return [String] the raw Content-Type header value, or an empty string when absent
  def content_type = header('content-type').to_s

  # Media types are case-insensitive, so both predicates match on a
  # downcased copy of the header value.

  # @return [Boolean] whether the Content-Type announces JSON
  def json_response? = normalized_content_type.include?('application/json')

  # @return [Boolean] whether the Content-Type announces HTML
  def html_response? = normalized_content_type.include?('text/html')

  ##
  # @return [Nokogiri::HTML::Document, Hash] the parsed body of the response, frozen object
  # @raise [UnsupportedResponseContentType] if the content type is not supported
  def parsed_body
    @parsed_body ||= if html_response?
                       Nokogiri::HTML(body).tap do |doc|
                         # Remove comments from the document to avoid processing irrelevant content
                         doc.xpath('//comment()').each(&:remove)
                       end.freeze
                     elsif json_response?
                       JSON.parse(body, symbolize_names: true).freeze
                     else
                       raise UnsupportedResponseContentType, "Unsupported content type: #{content_type}"
                     end
  end

  private

  # @return [String] the Content-Type value in lower case, for matching only
  def normalized_content_type = content_type.downcase

  # Looks the header up by exact name first, then case-insensitively.
  #
  # @param name [String] header name
  # @return [Object, nil] the header value, or nil when no key matches
  def header(name)
    headers.fetch(name) do
      headers.find { |key, _value| key.casecmp?(name) }&.last
    end
  end
end
|
|
26
66
|
end
|
|
27
67
|
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class RequestService
    ##
    # Enforces response-size limits before parsing.
    class ResponseGuard
      ##
      # @param policy [Policy] request policy that defines byte ceilings
      def initialize(policy:)
        @policy = policy
        @streamed_bytes = 0
      end

      ##
      # Validates response headers and streamed byte count.
      #
      # A declared Content-Length (header name matched case-insensitively) is
      # checked first, then the cumulative streamed byte count.
      #
      # @param total_bytes [Integer] cumulative byte count received so far
      # @param headers [Hash, nil] response headers if known
      # @return [void]
      # @raise [ResponseTooLarge] if the response exceeds configured limits
      def inspect_chunk!(total_bytes:, headers: nil)
        header_length = declared_content_length(headers)
        raise_if_too_large!(header_length.to_i, policy.max_response_bytes) if header_length

        @streamed_bytes = total_bytes
        raise_if_too_large!(@streamed_bytes, policy.max_response_bytes)
      end

      ##
      # Validates the final response body after middleware processing.
      #
      # @param body [String, nil] final response body
      # @return [void]
      # @raise [ResponseTooLarge] if the final body exceeds configured limits
      # @raise [BlockedSurfaceDetected] if the body matches known anti-bot interstitial signatures
      def inspect_body!(body)
        normalized_body = body.to_s
        size = normalized_body.bytesize
        raise_if_too_large!(size, policy.max_decompressed_bytes)
        raise_if_blocked_surface!(normalized_body)
      end

      private

      attr_reader :policy

      # Header names are case-insensitive, so any spelling of
      # "content-length" (including symbol keys) is accepted.
      #
      # @param headers [Hash, nil] response headers if known
      # @return [Object, nil] the declared Content-Length value, or nil when absent
      def declared_content_length(headers)
        return unless headers

        headers.find { |key, _value| key.to_s.casecmp?('content-length') }&.last
      end

      def raise_if_blocked_surface!(body)
        signature = Html2rss::BlockedSurface.interstitial_signature_for(body)
        return unless signature

        raise BlockedSurfaceDetected, signature.fetch(:message)
      end

      def raise_if_too_large!(bytes, limit)
        return unless bytes > limit

        raise ResponseTooLarge, "Response exceeded #{limit} bytes"
      end
    end
  end
end
|