html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'faraday'
|
|
4
4
|
require 'faraday/follow_redirects'
|
|
5
|
+
require 'faraday/gzip'
|
|
5
6
|
|
|
6
7
|
module Html2rss
|
|
7
8
|
class RequestService
|
|
@@ -9,15 +10,146 @@ module Html2rss
|
|
|
9
10
|
# Strategy to use Faraday for the request.
|
|
10
11
|
# @see https://rubygems.org/gems/faraday
|
|
11
12
|
class FaradayStrategy < Strategy
|
|
12
|
-
|
|
13
|
+
##
|
|
14
|
+
# Restores buffered streamed bytes so response middleware can process them.
|
|
15
|
+
class StreamingBodyMiddleware < Faraday::Middleware
|
|
16
|
+
# Request-context key used to store streamed chunks before middleware completion.
|
|
17
|
+
STREAM_BUFFER_KEY = :html2rss_stream_buffer
|
|
18
|
+
|
|
19
|
+
# @param env [Faraday::Env] completed response environment
|
|
20
|
+
# @return [void]
|
|
21
|
+
def on_complete(env)
|
|
22
|
+
buffer = env.request.context&.delete(STREAM_BUFFER_KEY)
|
|
23
|
+
return if buffer.nil? || buffer.empty?
|
|
24
|
+
|
|
25
|
+
env.body = buffer
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
##
|
|
30
|
+
# Executes a request with runtime policy enforcement.
|
|
31
|
+
#
|
|
32
|
+
# @return [Response] normalized request response
|
|
33
|
+
# @note Unlike BrowserlessStrategy, Faraday does not expose the remote IP after connect.
|
|
34
|
+
# SSRF protection here is pre-connection only (DNS resolution via Policy).
|
|
35
|
+
# A DNS rebinding attack between resolution and connect cannot be caught at this layer.
|
|
13
36
|
def execute
|
|
14
|
-
|
|
15
|
-
|
|
37
|
+
deadline = request_deadline
|
|
38
|
+
response_guard, response = perform_request(deadline:)
|
|
39
|
+
response_guard.inspect_body!(response.body)
|
|
40
|
+
build_response(response)
|
|
41
|
+
rescue Faraday::TimeoutError, Timeout::Error => error
|
|
42
|
+
raise RequestTimedOut, error.message
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
private
|
|
46
|
+
|
|
47
|
+
def request_deadline
|
|
48
|
+
monotonic_now + ctx.policy.total_timeout_seconds
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def perform_request(deadline:)
|
|
52
|
+
response_guard = ResponseGuard.new(policy: ctx.policy)
|
|
53
|
+
response = faraday_request(response_guard, deadline:, streaming_buffer: true)
|
|
54
|
+
response = retry_without_streaming(response_guard, deadline:) if retry_without_streaming?(response)
|
|
55
|
+
[response_guard, response]
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
def build_response(response)
|
|
59
|
+
Response.new(body: response.body, headers: response.headers, url: response_url(response),
|
|
60
|
+
status: response.status)
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def validate_request!(consume_budget: true)
|
|
64
|
+
ctx.budget.consume! if consume_budget
|
|
65
|
+
ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
def faraday_request(response_guard, deadline:, streaming_buffer:, consume_budget: true)
|
|
69
|
+
validate_request!(consume_budget:)
|
|
70
|
+
|
|
71
|
+
client.get do |req|
|
|
72
|
+
apply_timeouts(req, deadline:)
|
|
73
|
+
buffer = prepare_stream_buffer(req) if streaming_buffer
|
|
74
|
+
req.options.on_data = on_data_callback(response_guard, buffer)
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def retry_without_streaming(response_guard, deadline:)
|
|
79
|
+
faraday_request(response_guard, deadline:, streaming_buffer: false, consume_budget: false)
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def client
|
|
83
|
+
@client ||= Faraday.new(url: ctx.url.to_s, headers: ctx.headers) do |faraday|
|
|
84
|
+
faraday.use Faraday::FollowRedirects::Middleware, limit: ctx.policy.max_redirects, callback: redirect_callback
|
|
85
|
+
faraday.request :gzip
|
|
86
|
+
faraday.use StreamingBodyMiddleware
|
|
16
87
|
faraday.adapter Faraday.default_adapter
|
|
17
88
|
end
|
|
18
|
-
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
def apply_timeouts(request, deadline:)
|
|
92
|
+
remaining_timeout = remaining_timeout_seconds(deadline)
|
|
93
|
+
request.options.timeout = remaining_timeout
|
|
94
|
+
request.options.open_timeout = [ctx.policy.connect_timeout_seconds, remaining_timeout].min
|
|
95
|
+
request.options.read_timeout = [ctx.policy.read_timeout_seconds, remaining_timeout].min
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
def prepare_stream_buffer(request)
|
|
99
|
+
request.options.context ||= {}
|
|
100
|
+
request.options.context[StreamingBodyMiddleware::STREAM_BUFFER_KEY] = +''
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
def on_data_callback(response_guard, buffer)
|
|
104
|
+
proc do |chunk, total_bytes, env|
|
|
105
|
+
response_guard.inspect_chunk!(total_bytes:, headers: env&.response_headers)
|
|
106
|
+
buffer&.<< chunk
|
|
107
|
+
end
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
def remaining_timeout_seconds(deadline)
|
|
111
|
+
remaining = deadline - monotonic_now
|
|
112
|
+
raise RequestTimedOut, 'Request timed out' if remaining <= 0
|
|
113
|
+
|
|
114
|
+
remaining
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
def retry_without_streaming?(response)
|
|
118
|
+
return false if response.body.to_s.empty? == false
|
|
119
|
+
return false unless response_success?(response)
|
|
120
|
+
|
|
121
|
+
final_url = response.env&.url
|
|
122
|
+
return false unless final_url
|
|
123
|
+
|
|
124
|
+
final_url.to_s != ctx.url.to_s
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
def response_success?(response)
|
|
128
|
+
return true if response.status.nil?
|
|
129
|
+
|
|
130
|
+
response.status >= 200 && response.status < 300
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
def response_url(response)
|
|
134
|
+
return ctx.url unless (url = response.env&.url)
|
|
135
|
+
|
|
136
|
+
Html2rss::Url.from_absolute(url.to_s)
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
def redirect_callback
|
|
140
|
+
lambda do |old_env, new_env|
|
|
141
|
+
from_url = normalize_url(old_env[:url])
|
|
142
|
+
to_url = normalize_url(new_env[:url])
|
|
143
|
+
ctx.policy.validate_redirect!(from_url:, to_url:, origin_url: ctx.origin_url, relation: ctx.relation)
|
|
144
|
+
end
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
def normalize_url(url)
|
|
148
|
+
Html2rss::Url.from_absolute(url.to_s)
|
|
149
|
+
end
|
|
19
150
|
|
|
20
|
-
|
|
151
|
+
def monotonic_now
|
|
152
|
+
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
21
153
|
end
|
|
22
154
|
end
|
|
23
155
|
end
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'ipaddr'
|
|
4
|
+
require 'resolv'
|
|
5
|
+
require 'socket'
|
|
6
|
+
|
|
7
|
+
module Html2rss
|
|
8
|
+
class RequestService
|
|
9
|
+
##
|
|
10
|
+
# Describes the runtime request envelope for a single feed build.
|
|
11
|
+
class Policy # rubocop:disable Metrics/ClassLength
|
|
12
|
+
MAX_REQUESTS_CEILING = 10
|
|
13
|
+
# Hostnames treated as local/private surfaces.
|
|
14
|
+
LOCAL_HOSTS = %w[localhost localhost.localdomain metadata.google.internal].to_set.freeze
|
|
15
|
+
# IP ranges blocked when private networks are disabled.
|
|
16
|
+
BLOCKED_IP_RANGES = [
|
|
17
|
+
IPAddr.new('0.0.0.0/8'),
|
|
18
|
+
IPAddr.new('10.0.0.0/8'),
|
|
19
|
+
IPAddr.new('127.0.0.0/8'),
|
|
20
|
+
IPAddr.new('169.254.0.0/16'),
|
|
21
|
+
IPAddr.new('172.16.0.0/12'),
|
|
22
|
+
IPAddr.new('192.168.0.0/16'),
|
|
23
|
+
IPAddr.new('224.0.0.0/4'),
|
|
24
|
+
IPAddr.new('::/128'),
|
|
25
|
+
IPAddr.new('::1/128'),
|
|
26
|
+
IPAddr.new('fe80::/10'),
|
|
27
|
+
IPAddr.new('fc00::/7'),
|
|
28
|
+
IPAddr.new('ff00::/8')
|
|
29
|
+
].freeze
|
|
30
|
+
|
|
31
|
+
# Default policy values used when request controls are not explicitly set.
|
|
32
|
+
DEFAULTS = {
|
|
33
|
+
connect_timeout_seconds: 5,
|
|
34
|
+
read_timeout_seconds: 10,
|
|
35
|
+
total_timeout_seconds: 30,
|
|
36
|
+
max_redirects: 3,
|
|
37
|
+
max_response_bytes: 5_242_880,
|
|
38
|
+
max_decompressed_bytes: 10_485_760,
|
|
39
|
+
max_requests: 1,
|
|
40
|
+
allow_private_networks: false,
|
|
41
|
+
allow_cross_origin_followups: false
|
|
42
|
+
}.freeze
|
|
43
|
+
|
|
44
|
+
##
|
|
45
|
+
# @param connect_timeout_seconds [Integer] maximum connection setup time
|
|
46
|
+
# @param read_timeout_seconds [Integer] maximum read stall time
|
|
47
|
+
# @param total_timeout_seconds [Integer] maximum total request time
|
|
48
|
+
# @param max_redirects [Integer] maximum redirect count
|
|
49
|
+
# @param max_response_bytes [Integer] maximum streamed response bytes
|
|
50
|
+
# @param max_decompressed_bytes [Integer] maximum final body size
|
|
51
|
+
# @param max_requests [Integer] maximum requests per feed build
|
|
52
|
+
# @param allow_private_networks [Boolean] whether private network targets are allowed
|
|
53
|
+
# @param allow_cross_origin_followups [Boolean] whether follow-up requests may leave the origin host
|
|
54
|
+
# @param resolver [#each_address] DNS resolver used for hostname classification
|
|
55
|
+
def initialize(connect_timeout_seconds: DEFAULTS[:connect_timeout_seconds], # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
|
|
56
|
+
read_timeout_seconds: DEFAULTS[:read_timeout_seconds],
|
|
57
|
+
total_timeout_seconds: DEFAULTS[:total_timeout_seconds],
|
|
58
|
+
max_redirects: DEFAULTS[:max_redirects],
|
|
59
|
+
max_response_bytes: DEFAULTS[:max_response_bytes],
|
|
60
|
+
max_decompressed_bytes: DEFAULTS[:max_decompressed_bytes],
|
|
61
|
+
max_requests: DEFAULTS[:max_requests],
|
|
62
|
+
allow_private_networks: DEFAULTS[:allow_private_networks],
|
|
63
|
+
allow_cross_origin_followups: DEFAULTS[:allow_cross_origin_followups],
|
|
64
|
+
resolver: Socket)
|
|
65
|
+
@connect_timeout_seconds = validate_positive_integer!(:connect_timeout_seconds, connect_timeout_seconds)
|
|
66
|
+
@read_timeout_seconds = validate_positive_integer!(:read_timeout_seconds, read_timeout_seconds)
|
|
67
|
+
@total_timeout_seconds = validate_positive_integer!(:total_timeout_seconds, total_timeout_seconds)
|
|
68
|
+
@max_redirects = validate_non_negative_integer!(:max_redirects, max_redirects)
|
|
69
|
+
@max_response_bytes = validate_positive_integer!(:max_response_bytes, max_response_bytes)
|
|
70
|
+
@max_decompressed_bytes = validate_positive_integer!(:max_decompressed_bytes, max_decompressed_bytes)
|
|
71
|
+
@max_requests = [validate_positive_integer!(:max_requests, max_requests), MAX_REQUESTS_CEILING].min
|
|
72
|
+
@allow_private_networks = allow_private_networks ? true : false
|
|
73
|
+
@allow_cross_origin_followups = allow_cross_origin_followups ? true : false
|
|
74
|
+
@resolver = resolver
|
|
75
|
+
freeze
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
attr_reader :connect_timeout_seconds,
|
|
79
|
+
:read_timeout_seconds,
|
|
80
|
+
:total_timeout_seconds,
|
|
81
|
+
:max_redirects,
|
|
82
|
+
:max_response_bytes,
|
|
83
|
+
:max_decompressed_bytes,
|
|
84
|
+
:max_requests
|
|
85
|
+
|
|
86
|
+
##
|
|
87
|
+
# @return [Boolean] whether private network targets may be requested
|
|
88
|
+
def allow_private_networks?
|
|
89
|
+
@allow_private_networks
|
|
90
|
+
end
|
|
91
|
+
|
|
92
|
+
##
|
|
93
|
+
# @return [Boolean] whether follow-up requests may leave the initial origin
|
|
94
|
+
def allow_cross_origin_followups?
|
|
95
|
+
@allow_cross_origin_followups
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
##
|
|
99
|
+
# Returns the default request policy.
|
|
100
|
+
#
|
|
101
|
+
# @return [Policy] a default, frozen policy instance
|
|
102
|
+
# rubocop:disable Layout/ClassStructure
|
|
103
|
+
def self.default
|
|
104
|
+
new
|
|
105
|
+
end
|
|
106
|
+
# rubocop:enable Layout/ClassStructure
|
|
107
|
+
|
|
108
|
+
##
|
|
109
|
+
# Validates whether a request target is permitted for the given context.
|
|
110
|
+
#
|
|
111
|
+
# @param url [Html2rss::Url] destination URL
|
|
112
|
+
# @param origin_url [Html2rss::Url] initial URL of the feed build
|
|
113
|
+
# @param relation [Symbol] logical reason for the request
|
|
114
|
+
# @return [void]
|
|
115
|
+
# @raise [CrossOriginFollowUpDenied] if a follow-up leaves the origin host
|
|
116
|
+
# @raise [PrivateNetworkDenied] if the target resolves to a private address
|
|
117
|
+
def validate_request!(url:, origin_url:, relation:)
|
|
118
|
+
enforce_same_origin!(url, origin_url, relation)
|
|
119
|
+
enforce_public_network!(url)
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
##
|
|
123
|
+
# Validates a redirect hop before it is followed.
|
|
124
|
+
#
|
|
125
|
+
# @param from_url [Html2rss::Url] URL that produced the redirect
|
|
126
|
+
# @param to_url [Html2rss::Url] redirect destination
|
|
127
|
+
# @param origin_url [Html2rss::Url] initial URL of the feed build
|
|
128
|
+
# @param relation [Symbol] logical reason for the request
|
|
129
|
+
# @return [void]
|
|
130
|
+
# @raise [UnsupportedUrlScheme] if the redirect downgrades from HTTPS to HTTP
|
|
131
|
+
def validate_redirect!(from_url:, to_url:, origin_url:, relation:)
|
|
132
|
+
if from_url.scheme == 'https' && to_url.scheme == 'http'
|
|
133
|
+
raise UnsupportedUrlScheme, 'Redirect downgraded from https to http'
|
|
134
|
+
end
|
|
135
|
+
|
|
136
|
+
validate_request!(url: to_url, origin_url:, relation:)
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
##
|
|
140
|
+
# Validates the resolved remote IP for a completed request.
|
|
141
|
+
#
|
|
142
|
+
# @param ip [String, nil] remote IP address reported by the client
|
|
143
|
+
# @param url [Html2rss::Url] URL associated with the response
|
|
144
|
+
# @return [void]
|
|
145
|
+
# @raise [PrivateNetworkDenied] if the response came from a blocked address
|
|
146
|
+
def validate_remote_ip!(ip:, url:)
|
|
147
|
+
return if allow_private_networks?
|
|
148
|
+
return if ip.nil? || ip.empty?
|
|
149
|
+
|
|
150
|
+
parsed_ip = parse_ip(ip)
|
|
151
|
+
raise PrivateNetworkDenied, "Remote IP could not be validated for #{url}" unless parsed_ip
|
|
152
|
+
return unless blocked_ip?(parsed_ip)
|
|
153
|
+
|
|
154
|
+
raise PrivateNetworkDenied, "Private network target denied for #{url}"
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
private
|
|
158
|
+
|
|
159
|
+
attr_reader :resolver
|
|
160
|
+
|
|
161
|
+
def validate_positive_integer!(name, value)
|
|
162
|
+
raise ArgumentError, "#{name} must be positive" unless value.is_a?(Integer) && value.positive?
|
|
163
|
+
|
|
164
|
+
value
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
def validate_non_negative_integer!(name, value)
|
|
168
|
+
raise ArgumentError, "#{name} must be non-negative" unless value.is_a?(Integer) && !value.negative?
|
|
169
|
+
|
|
170
|
+
value
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
def enforce_same_origin!(url, origin_url, relation)
|
|
174
|
+
return if relation == :initial || allow_cross_origin_followups?
|
|
175
|
+
|
|
176
|
+
enforce_follow_up_scheme!(url, origin_url)
|
|
177
|
+
return if comparable_origin(url) == comparable_origin(origin_url)
|
|
178
|
+
|
|
179
|
+
raise CrossOriginFollowUpDenied, "Cross-origin follow-up denied for #{url}"
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
def enforce_follow_up_scheme!(url, origin_url)
|
|
183
|
+
return unless origin_url.scheme == 'https' && url.scheme == 'http'
|
|
184
|
+
|
|
185
|
+
raise UnsupportedUrlScheme, "Follow-up downgraded from https to http for #{url}"
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
def comparable_origin(url)
|
|
189
|
+
[url.host, normalized_port(url)]
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
def normalized_port(url)
|
|
193
|
+
return url.port if url.port
|
|
194
|
+
|
|
195
|
+
url.scheme == 'https' ? 443 : 80
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
def enforce_public_network!(url)
|
|
199
|
+
host = url.host
|
|
200
|
+
return if allow_private_networks?
|
|
201
|
+
return unless blocked_host?(host) || resolved_ip_addresses(host).any? { |address| blocked_ip?(address) }
|
|
202
|
+
|
|
203
|
+
raise PrivateNetworkDenied, "Private network target denied for #{url}"
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
def blocked_host?(host)
|
|
207
|
+
LOCAL_HOSTS.include?(host.to_s.downcase)
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
def resolved_ip_addresses(host)
|
|
211
|
+
literal = parse_ip(host)
|
|
212
|
+
return [literal] if literal
|
|
213
|
+
|
|
214
|
+
if resolver.respond_to?(:each_address)
|
|
215
|
+
addresses_from_each_address(host)
|
|
216
|
+
else
|
|
217
|
+
addresses_from_getaddrinfo(host)
|
|
218
|
+
end
|
|
219
|
+
rescue Resolv::ResolvError, SocketError, SystemCallError
|
|
220
|
+
[]
|
|
221
|
+
end
|
|
222
|
+
|
|
223
|
+
def addresses_from_each_address(host)
|
|
224
|
+
[].tap do |addresses|
|
|
225
|
+
resolver.each_address(host) do |address|
|
|
226
|
+
parsed = parse_ip(address)
|
|
227
|
+
addresses << parsed if parsed
|
|
228
|
+
end
|
|
229
|
+
end
|
|
230
|
+
end
|
|
231
|
+
|
|
232
|
+
def addresses_from_getaddrinfo(host)
|
|
233
|
+
resolver.getaddrinfo(host, nil).filter_map do |entry|
|
|
234
|
+
parse_ip(entry[3])
|
|
235
|
+
end
|
|
236
|
+
end
|
|
237
|
+
|
|
238
|
+
def parse_ip(value)
|
|
239
|
+
IPAddr.new(value)
|
|
240
|
+
rescue IPAddr::AddressFamilyError, IPAddr::InvalidAddressError
|
|
241
|
+
nil
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
def blocked_ip?(address)
|
|
245
|
+
BLOCKED_IP_RANGES.any? { |range| range.include?(address) }
|
|
246
|
+
end
|
|
247
|
+
end
|
|
248
|
+
|
|
249
|
+
# Shared immutable policy instance used for default request execution.
|
|
250
|
+
Policy::DEFAULT_POLICY = Policy.new
|
|
251
|
+
end
|
|
252
|
+
end
|
|
@@ -4,7 +4,13 @@ module Html2rss
|
|
|
4
4
|
class RequestService
|
|
5
5
|
##
|
|
6
6
|
# Commands the Puppeteer Browser to the website and builds the Response.
|
|
7
|
-
class PuppetCommander
|
|
7
|
+
class PuppetCommander # rubocop:disable Metrics/ClassLength
|
|
8
|
+
BROWSER_UNSAFE_HEADERS = %w[
|
|
9
|
+
host connection content-length transfer-encoding
|
|
10
|
+
sec-fetch-dest sec-fetch-mode sec-fetch-site sec-fetch-user
|
|
11
|
+
upgrade-insecure-requests
|
|
12
|
+
].to_set.freeze
|
|
13
|
+
|
|
8
14
|
# @param ctx [Context]
|
|
9
15
|
# @param browser [Puppeteer::Browser]
|
|
10
16
|
# @param skip_request_resources [Set<String>] the resource types not to request
|
|
@@ -19,13 +25,18 @@ module Html2rss
|
|
|
19
25
|
@referer = referer
|
|
20
26
|
end
|
|
21
27
|
|
|
22
|
-
|
|
28
|
+
##
|
|
29
|
+
# Visits the request URL and normalizes the page into a response object.
|
|
30
|
+
#
|
|
31
|
+
# @return [Response] rendered page response
|
|
23
32
|
def call
|
|
24
33
|
page = new_page
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
34
|
+
navigation_response = navigate_to_destination(page, ctx.url)
|
|
35
|
+
perform_preload(page)
|
|
36
|
+
raise_navigation_error_if_any
|
|
37
|
+
final_navigation_response = latest_navigation_response || navigation_response
|
|
38
|
+
validate_navigation_response!(final_navigation_response)
|
|
39
|
+
build_response(page, final_navigation_response)
|
|
29
40
|
ensure
|
|
30
41
|
page&.close
|
|
31
42
|
end
|
|
@@ -35,27 +46,215 @@ module Html2rss
|
|
|
35
46
|
# @see https://yusukeiwaki.github.io/puppeteer-ruby-docs/Puppeteer/Page.html
|
|
36
47
|
def new_page
|
|
37
48
|
page = browser.new_page
|
|
38
|
-
page.
|
|
49
|
+
@main_frame = page.main_frame if page.respond_to?(:main_frame)
|
|
50
|
+
configure_page(page)
|
|
51
|
+
configure_navigation_guards(page)
|
|
52
|
+
page
|
|
53
|
+
end
|
|
39
54
|
|
|
40
|
-
|
|
55
|
+
##
|
|
56
|
+
# @param page [Puppeteer::Page]
|
|
57
|
+
# @return [void]
|
|
58
|
+
def configure_page(page)
|
|
59
|
+
page.extra_http_headers = browser_headers
|
|
60
|
+
page.default_navigation_timeout = navigation_timeout_ms
|
|
61
|
+
page.default_timeout = navigation_timeout_ms
|
|
62
|
+
end
|
|
41
63
|
|
|
64
|
+
##
|
|
65
|
+
# @param page [Puppeteer::Page]
|
|
66
|
+
# @return [void]
|
|
67
|
+
def configure_navigation_guards(page)
|
|
42
68
|
page.request_interception = true
|
|
43
69
|
page.on('request') do |request|
|
|
44
|
-
|
|
70
|
+
handle_request(request)
|
|
45
71
|
end
|
|
46
|
-
|
|
47
|
-
page
|
|
72
|
+
page.on('response') { |response| handle_response(response) }
|
|
48
73
|
end
|
|
49
74
|
|
|
75
|
+
##
|
|
76
|
+
# @param page [Puppeteer::Page] browser page
|
|
77
|
+
# @param url [Html2rss::Url] target URL
|
|
78
|
+
# @return [Puppeteer::HTTPResponse, nil] the navigation response if one was produced
|
|
50
79
|
def navigate_to_destination(page, url)
|
|
51
|
-
|
|
80
|
+
@navigation_error = nil
|
|
81
|
+
@latest_navigation_response = nil
|
|
82
|
+
page.goto(url, wait_until: 'networkidle0', referer:, timeout: navigation_timeout_ms).tap do
|
|
83
|
+
raise_navigation_error_if_any
|
|
84
|
+
end
|
|
85
|
+
rescue StandardError
|
|
86
|
+
raise_navigation_error_if_any
|
|
87
|
+
|
|
88
|
+
raise
|
|
52
89
|
end
|
|
53
90
|
|
|
91
|
+
##
|
|
92
|
+
# @param page [Puppeteer::Page] browser page
|
|
93
|
+
# @return [String] rendered HTML content
|
|
54
94
|
def body(page) = page.content
|
|
55
95
|
|
|
56
96
|
private
|
|
57
97
|
|
|
58
|
-
attr_reader :ctx, :browser, :skip_request_resources, :referer
|
|
98
|
+
attr_reader :ctx, :browser, :skip_request_resources, :referer, :latest_navigation_response, :main_frame
|
|
99
|
+
|
|
100
|
+
def raise_navigation_error_if_any
|
|
101
|
+
raise @navigation_error if @navigation_error
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
def navigation_timeout_ms
|
|
105
|
+
ctx.policy.total_timeout_seconds * 1000
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
def browser_headers
|
|
109
|
+
ctx.headers.reject { |key, _| BROWSER_UNSAFE_HEADERS.include?(key.to_s.downcase) }
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
def handle_request(request)
|
|
113
|
+
validate_request!(request)
|
|
114
|
+
|
|
115
|
+
skip_request_resources.member?(request.resource_type) ? request.abort : request.continue
|
|
116
|
+
rescue Html2rss::Error => error
|
|
117
|
+
store_navigation_error(error, navigation_request: request.navigation_request?)
|
|
118
|
+
request.abort
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
def handle_response(response)
|
|
122
|
+
@latest_navigation_response = response if main_frame_navigation_response?(response)
|
|
123
|
+
validate_response!(response)
|
|
124
|
+
rescue Html2rss::Error => error
|
|
125
|
+
store_navigation_error(error, navigation_request: response.request.navigation_request?)
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
def validate_request!(request)
|
|
129
|
+
validate_navigation_redirect_chain!(request)
|
|
130
|
+
validate_navigation_target!(request)
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
def main_frame_navigation_response?(response)
|
|
134
|
+
request = response.request
|
|
135
|
+
return false unless request.navigation_request?
|
|
136
|
+
return true unless request.respond_to?(:frame)
|
|
137
|
+
|
|
138
|
+
frame = request.frame
|
|
139
|
+
return true if frame.nil?
|
|
140
|
+
return frame == main_frame unless main_frame.nil?
|
|
141
|
+
return true unless frame.respond_to?(:parent_frame)
|
|
142
|
+
|
|
143
|
+
frame.parent_frame.nil?
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
def build_response(page, navigation_response)
|
|
147
|
+
page_body = body(page)
|
|
148
|
+
ResponseGuard.new(policy: ctx.policy).inspect_body!(page_body)
|
|
149
|
+
|
|
150
|
+
Response.new(
|
|
151
|
+
body: page_body,
|
|
152
|
+
headers: navigation_response&.headers || {},
|
|
153
|
+
url: response_url(navigation_response, ctx.url),
|
|
154
|
+
status: navigation_response&.status
|
|
155
|
+
)
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
def validate_navigation_response!(navigation_response)
|
|
159
|
+
final_url = response_url(navigation_response, ctx.url)
|
|
160
|
+
ctx.policy.validate_remote_ip!(ip: remote_ip(navigation_response), url: final_url)
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
def validate_response!(response)
|
|
164
|
+
validate_navigation_response!(response)
|
|
165
|
+
end
|
|
166
|
+
|
|
167
|
+
def response_url(navigation_response, fallback_url)
|
|
168
|
+
raw_url = navigation_response&.url || fallback_url.to_s
|
|
169
|
+
Html2rss::Url.from_absolute(raw_url)
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
def remote_ip(navigation_response)
|
|
173
|
+
navigation_response.remote_address&.ip
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
def request_chain(request)
|
|
177
|
+
(request.redirect_chain + [request]).map { |entry| request_url(entry) }
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
def request_url(request)
|
|
181
|
+
Html2rss::Url.from_absolute(request.url)
|
|
182
|
+
end
|
|
183
|
+
|
|
184
|
+
def validate_navigation_redirect_chain!(request)
|
|
185
|
+
request_chain(request).each_cons(2) do |from_url, to_url|
|
|
186
|
+
ctx.policy.validate_redirect!(from_url:, to_url:, origin_url: ctx.origin_url, relation: ctx.relation)
|
|
187
|
+
end
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
def validate_navigation_target!(request)
|
|
191
|
+
ctx.policy.validate_request!(url: request_url(request), origin_url: ctx.origin_url, relation: ctx.relation)
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
def store_navigation_error(error, navigation_request:)
|
|
195
|
+
return unless navigation_request
|
|
196
|
+
|
|
197
|
+
@navigation_error = error if @navigation_error.nil?
|
|
198
|
+
end
|
|
199
|
+
|
|
200
|
+
def perform_preload(page)
|
|
201
|
+
preload_config = ctx.browserless_preload
|
|
202
|
+
return unless preload_config
|
|
203
|
+
|
|
204
|
+
wait_after(page, preload_config[:wait_after_ms])
|
|
205
|
+
click_selectors(page, preload_config[:click_selectors]) if preload_config[:click_selectors]
|
|
206
|
+
scroll_down(page, preload_config[:scroll_down]) if preload_config[:scroll_down]
|
|
207
|
+
wait_after(page, preload_config[:wait_after_ms])
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
def wait_after(page, timeout_ms)
|
|
211
|
+
return unless timeout_ms
|
|
212
|
+
|
|
213
|
+
ctx.budget.consume!
|
|
214
|
+
page.wait_for_timeout(timeout_ms)
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
def click_selectors(page, selectors)
|
|
218
|
+
selectors.each { |selector_config| click_selector(page, selector_config) }
|
|
219
|
+
end
|
|
220
|
+
|
|
221
|
+
def scroll_down(page, config)
|
|
222
|
+
iterations = config.fetch(:iterations, 1)
|
|
223
|
+
wait_after_ms = config[:wait_after_ms]
|
|
224
|
+
previous_height = nil
|
|
225
|
+
|
|
226
|
+
iterations.times do
|
|
227
|
+
updated_height = perform_scroll_iteration(page, wait_after_ms, previous_height)
|
|
228
|
+
break unless updated_height
|
|
229
|
+
|
|
230
|
+
previous_height = updated_height
|
|
231
|
+
end
|
|
232
|
+
end
|
|
233
|
+
|
|
234
|
+
def click_selector(page, config)
|
|
235
|
+
selector = config.fetch(:selector)
|
|
236
|
+
max_clicks = config.fetch(:max_clicks, 1)
|
|
237
|
+
wait_after_ms = config[:wait_after_ms]
|
|
238
|
+
|
|
239
|
+
max_clicks.times do
|
|
240
|
+
break unless (element = page.query_selector(selector))
|
|
241
|
+
|
|
242
|
+
ctx.budget.consume!
|
|
243
|
+
element.click
|
|
244
|
+
wait_after(page, wait_after_ms)
|
|
245
|
+
end
|
|
246
|
+
end
|
|
247
|
+
|
|
248
|
+
def perform_scroll_iteration(page, wait_after_ms, previous_height)
|
|
249
|
+
ctx.budget.consume!
|
|
250
|
+
page.evaluate('() => window.scrollTo(0, document.body.scrollHeight)')
|
|
251
|
+
wait_after(page, wait_after_ms)
|
|
252
|
+
|
|
253
|
+
current_height = page.evaluate('() => document.body.scrollHeight')
|
|
254
|
+
return if previous_height && current_height <= previous_height
|
|
255
|
+
|
|
256
|
+
current_height
|
|
257
|
+
end
|
|
59
258
|
end
|
|
60
259
|
end
|
|
61
260
|
end
|