html2rss 0.22.1 → 0.22.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/html2rss/feed_pipeline.rb +12 -3
- data/lib/html2rss/request_service/botasaurus_strategy.rb +11 -3
- data/lib/html2rss/request_service/browserless_strategy.rb +5 -1
- data/lib/html2rss/request_service/budget.rb +15 -1
- data/lib/html2rss/request_service/context.rb +3 -1
- data/lib/html2rss/request_service/faraday_strategy.rb +2 -1
- data/lib/html2rss/request_service/strategy.rb +5 -0
- data/lib/html2rss/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: eb40d85981c1bc11576b0c9d7a3eae3948bd49239ad8b9a65bb6a4ea103b54f9
|
|
4
|
+
data.tar.gz: 6c4636413e1e995e98adec8d8cc742a5833208c2197a024abe90f4b81fce1774
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 9134076ad05417b8b7176da26033e4d66137f862781378b44080d4ea4fe67e8932156c8eb7d22b58ad97f0d7f5e32102c96ef84741d681c2c7571b3add09afb0
|
|
7
|
+
data.tar.gz: 61958c85cb791adc78d6d18debcdbfb7a90948332696dac0a4ab4aee39f6b098c3366f948d8905d3d36e2052e03e75037006bd2e6cfa3420b9a11e46e3ebce18
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
module Html2rss
|
|
4
4
|
##
|
|
5
5
|
# Builds feeds from validated config through request, extraction, and rendering stages.
|
|
6
|
-
class FeedPipeline
|
|
6
|
+
class FeedPipeline # rubocop:disable Metrics/ClassLength
|
|
7
7
|
##
|
|
8
8
|
# @param raw_config [Hash{Symbol => Object}] user-provided feed config
|
|
9
9
|
def initialize(raw_config)
|
|
@@ -77,11 +77,16 @@ module Html2rss
|
|
|
77
77
|
auto_fallback_for(config).call
|
|
78
78
|
end
|
|
79
79
|
|
|
80
|
+
# rubocop:disable Metrics/MethodLength
|
|
80
81
|
def auto_fallback_for(config)
|
|
81
82
|
AutoFallback.new(
|
|
82
83
|
strategies: AutoFallback::CHAIN,
|
|
83
84
|
budget: auto_pipeline_budget(config),
|
|
84
85
|
session_for: lambda do |strategy:, budget:|
|
|
86
|
+
if budget.remaining_timeout_seconds && budget.remaining_timeout_seconds <= 0
|
|
87
|
+
raise RequestService::RequestTimedOut, 'Request timed out'
|
|
88
|
+
end
|
|
89
|
+
|
|
85
90
|
request_session_for(config, strategy:, budget:)
|
|
86
91
|
end,
|
|
87
92
|
articles_for: lambda do |response:, request_session:|
|
|
@@ -89,10 +94,14 @@ module Html2rss
|
|
|
89
94
|
end
|
|
90
95
|
)
|
|
91
96
|
end
|
|
97
|
+
# rubocop:enable Metrics/MethodLength
|
|
92
98
|
|
|
93
99
|
def auto_pipeline_budget(config)
|
|
94
|
-
|
|
95
|
-
RequestService::Budget.new(
|
|
100
|
+
policy = RequestSession::RuntimePolicy.from_config(config)
|
|
101
|
+
RequestService::Budget.new(
|
|
102
|
+
max_requests: policy.max_requests,
|
|
103
|
+
total_timeout_seconds: policy.total_timeout_seconds
|
|
104
|
+
)
|
|
96
105
|
end
|
|
97
106
|
|
|
98
107
|
def collect_articles(response:, config:, request_session:)
|
|
@@ -16,9 +16,9 @@ module Html2rss
|
|
|
16
16
|
# @raise [BotasaurusConnectionFailed] when Botasaurus cannot be reached or returns an invalid payload
|
|
17
17
|
# @raise [RequestTimedOut] when the Botasaurus request exceeds configured timeout
|
|
18
18
|
def execute
|
|
19
|
+
check_timeout!
|
|
19
20
|
validate_request!
|
|
20
|
-
|
|
21
|
-
parsed_response = contract.parse_response(transport_response)
|
|
21
|
+
parsed_response = post_scrape_request
|
|
22
22
|
raise_if_challenge_blocked!(parsed_response)
|
|
23
23
|
raise_if_upstream_failed!(parsed_response)
|
|
24
24
|
build_response(parsed_response)
|
|
@@ -30,6 +30,11 @@ module Html2rss
|
|
|
30
30
|
|
|
31
31
|
private
|
|
32
32
|
|
|
33
|
+
def post_scrape_request
|
|
34
|
+
transport_response = client.post('/scrape', JSON.generate(contract.request_payload), content_type_header)
|
|
35
|
+
contract.parse_response(transport_response)
|
|
36
|
+
end
|
|
37
|
+
|
|
33
38
|
def validate_request!
|
|
34
39
|
ctx.budget.consume!
|
|
35
40
|
ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
|
|
@@ -76,7 +81,10 @@ module Html2rss
|
|
|
76
81
|
end
|
|
77
82
|
|
|
78
83
|
def request_options
|
|
79
|
-
|
|
84
|
+
timeout = ctx.budget.remaining_timeout_seconds || ctx.policy.total_timeout_seconds
|
|
85
|
+
raise RequestTimedOut, 'Request timed out' if timeout <= 0
|
|
86
|
+
|
|
87
|
+
{ timeout: timeout.to_i }
|
|
80
88
|
end
|
|
81
89
|
|
|
82
90
|
def content_type_header
|
|
@@ -37,6 +37,7 @@ module Html2rss
|
|
|
37
37
|
# @return [Response] normalized request response
|
|
38
38
|
# @raise [RequestTimedOut] if the browser session exceeds the configured timeout
|
|
39
39
|
def execute
|
|
40
|
+
check_timeout!
|
|
40
41
|
validate_request!
|
|
41
42
|
execute_browserless_request
|
|
42
43
|
rescue Puppeteer::TimeoutError => error
|
|
@@ -71,7 +72,10 @@ module Html2rss
|
|
|
71
72
|
end
|
|
72
73
|
|
|
73
74
|
def protocol_timeout_ms
|
|
74
|
-
ctx.policy.total_timeout_seconds
|
|
75
|
+
timeout = ctx.budget.remaining_timeout_seconds || ctx.policy.total_timeout_seconds
|
|
76
|
+
raise RequestTimedOut, 'Request timed out' if timeout <= 0
|
|
77
|
+
|
|
78
|
+
(timeout * 1000).to_i
|
|
75
79
|
end
|
|
76
80
|
|
|
77
81
|
def connect_with_timeout_support(&)
|
|
@@ -5,14 +5,18 @@ module Html2rss
|
|
|
5
5
|
##
|
|
6
6
|
# Tracks how many outbound requests a single feed build may still perform.
|
|
7
7
|
class Budget
|
|
8
|
+
##
|
|
8
9
|
##
|
|
9
10
|
# @param max_requests [Integer] the maximum number of requests allowed
|
|
10
|
-
|
|
11
|
+
# @param total_timeout_seconds [Integer, nil] the total timeout for the feed build
|
|
12
|
+
def initialize(max_requests:, total_timeout_seconds: nil)
|
|
11
13
|
unless max_requests.is_a?(Integer) && max_requests.positive?
|
|
12
14
|
raise ArgumentError, 'max_requests must be positive'
|
|
13
15
|
end
|
|
14
16
|
|
|
15
17
|
@remaining = max_requests
|
|
18
|
+
@start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
19
|
+
@total_timeout_seconds = total_timeout_seconds
|
|
16
20
|
@mutex = Mutex.new
|
|
17
21
|
end
|
|
18
22
|
|
|
@@ -34,6 +38,16 @@ module Html2rss
|
|
|
34
38
|
def remaining
|
|
35
39
|
@mutex.synchronize { @remaining }
|
|
36
40
|
end
|
|
41
|
+
|
|
42
|
+
##
|
|
43
|
+
# @return [Float, nil] the remaining timeout in seconds, or nil if not tracked
|
|
44
|
+
def remaining_timeout_seconds
|
|
45
|
+
return unless @total_timeout_seconds
|
|
46
|
+
|
|
47
|
+
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - @start_time
|
|
48
|
+
remaining = @total_timeout_seconds - elapsed
|
|
49
|
+
[remaining, 0.0].max
|
|
50
|
+
end
|
|
37
51
|
end
|
|
38
52
|
end
|
|
39
53
|
end
|
|
@@ -77,7 +77,9 @@ module Html2rss
|
|
|
77
77
|
raise ArgumentError, 'policy must not be nil' if @policy.nil?
|
|
78
78
|
|
|
79
79
|
@origin_url = normalized_origin_url(request_options[:origin_url])
|
|
80
|
-
@budget = request_options.fetch(:budget)
|
|
80
|
+
@budget = request_options.fetch(:budget) do
|
|
81
|
+
Budget.new(max_requests: policy.max_requests, total_timeout_seconds: policy.total_timeout_seconds)
|
|
82
|
+
end
|
|
81
83
|
raise ArgumentError, 'budget must not be nil' if @budget.nil?
|
|
82
84
|
end
|
|
83
85
|
|
|
@@ -34,6 +34,7 @@ module Html2rss
|
|
|
34
34
|
# SSRF protection here is pre-connection only (DNS resolution via Policy).
|
|
35
35
|
# A DNS rebinding attack between resolution and connect cannot be caught at this layer.
|
|
36
36
|
def execute
|
|
37
|
+
check_timeout!
|
|
37
38
|
deadline = request_deadline
|
|
38
39
|
response_guard, response = perform_request(deadline:)
|
|
39
40
|
response_guard.inspect_body!(response.body)
|
|
@@ -45,7 +46,7 @@ module Html2rss
|
|
|
45
46
|
private
|
|
46
47
|
|
|
47
48
|
def request_deadline
|
|
48
|
-
monotonic_now + ctx.policy.total_timeout_seconds
|
|
49
|
+
monotonic_now + (ctx.budget.remaining_timeout_seconds || ctx.policy.total_timeout_seconds)
|
|
49
50
|
end
|
|
50
51
|
|
|
51
52
|
def perform_request(deadline:)
|
|
@@ -23,6 +23,11 @@ module Html2rss
|
|
|
23
23
|
|
|
24
24
|
# @return [Context] the context for the request
|
|
25
25
|
attr_reader :ctx
|
|
26
|
+
|
|
27
|
+
def check_timeout!
|
|
28
|
+
remaining = ctx.budget.remaining_timeout_seconds
|
|
29
|
+
raise RequestTimedOut, 'Request timed out' if remaining && remaining <= 0
|
|
30
|
+
end
|
|
26
31
|
end
|
|
27
32
|
end
|
|
28
33
|
end
|
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html2rss
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.22.1
|
|
4
|
+
version: 0.22.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gil Desmarais
|
|
@@ -381,7 +381,7 @@ licenses:
|
|
|
381
381
|
- MIT
|
|
382
382
|
metadata:
|
|
383
383
|
allowed_push_host: https://rubygems.org
|
|
384
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.22.1
|
|
384
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.22.2
|
|
385
385
|
rubygems_mfa_required: 'true'
|
|
386
386
|
rdoc_options: []
|
|
387
387
|
require_paths:
|