html2rss 0.22.1 → 0.22.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 750b7fb967b328cef2238b66729cafe122d1bae23bee05fd8504bb31e760b8a7
4
- data.tar.gz: 327406de9c7c97ea13e90c89bec1c2653c962bbcaccfd29ddb78b282477f7578
3
+ metadata.gz: eb40d85981c1bc11576b0c9d7a3eae3948bd49239ad8b9a65bb6a4ea103b54f9
4
+ data.tar.gz: 6c4636413e1e995e98adec8d8cc742a5833208c2197a024abe90f4b81fce1774
5
5
  SHA512:
6
- metadata.gz: 5f41e00edfdd19ceb012900db7518f28236b417662dc4f11f45d9c498ede0720bdbbc0fb31443d80495eaf6076df646062fdf7f972b27bd71c50cc4e198b4540
7
- data.tar.gz: 7a7eff85bd7f98cd872131041aa58faf9d3fba1aff47893a6d286378b6f52904ee11120f053e3ca8beca3060d86d1438cf2037e3e4e7e1586d3d3c4679b026f0
6
+ metadata.gz: 9134076ad05417b8b7176da26033e4d66137f862781378b44080d4ea4fe67e8932156c8eb7d22b58ad97f0d7f5e32102c96ef84741d681c2c7571b3add09afb0
7
+ data.tar.gz: 61958c85cb791adc78d6d18debcdbfb7a90948332696dac0a4ab4aee39f6b098c3366f948d8905d3d36e2052e03e75037006bd2e6cfa3420b9a11e46e3ebce18
@@ -3,7 +3,7 @@
3
3
  module Html2rss
4
4
  ##
5
5
  # Builds feeds from validated config through request, extraction, and rendering stages.
6
- class FeedPipeline
6
+ class FeedPipeline # rubocop:disable Metrics/ClassLength
7
7
  ##
8
8
  # @param raw_config [Hash{Symbol => Object}] user-provided feed config
9
9
  def initialize(raw_config)
@@ -77,11 +77,16 @@ module Html2rss
77
77
  auto_fallback_for(config).call
78
78
  end
79
79
 
80
+ # rubocop:disable Metrics/MethodLength
80
81
  def auto_fallback_for(config)
81
82
  AutoFallback.new(
82
83
  strategies: AutoFallback::CHAIN,
83
84
  budget: auto_pipeline_budget(config),
84
85
  session_for: lambda do |strategy:, budget:|
86
+ if budget.remaining_timeout_seconds && budget.remaining_timeout_seconds <= 0
87
+ raise RequestService::RequestTimedOut, 'Request timed out'
88
+ end
89
+
85
90
  request_session_for(config, strategy:, budget:)
86
91
  end,
87
92
  articles_for: lambda do |response:, request_session:|
@@ -89,10 +94,14 @@ module Html2rss
89
94
  end
90
95
  )
91
96
  end
97
+ # rubocop:enable Metrics/MethodLength
92
98
 
93
99
  def auto_pipeline_budget(config)
94
- max_requests = RequestSession::RuntimePolicy.from_config(config).max_requests
95
- RequestService::Budget.new(max_requests:)
100
+ policy = RequestSession::RuntimePolicy.from_config(config)
101
+ RequestService::Budget.new(
102
+ max_requests: policy.max_requests,
103
+ total_timeout_seconds: policy.total_timeout_seconds
104
+ )
96
105
  end
97
106
 
98
107
  def collect_articles(response:, config:, request_session:)
@@ -16,9 +16,9 @@ module Html2rss
16
16
  # @raise [BotasaurusConnectionFailed] when Botasaurus cannot be reached or returns an invalid payload
17
17
  # @raise [RequestTimedOut] when the Botasaurus request exceeds configured timeout
18
18
  def execute
19
+ check_timeout!
19
20
  validate_request!
20
- transport_response = client.post('/scrape', JSON.generate(contract.request_payload), content_type_header)
21
- parsed_response = contract.parse_response(transport_response)
21
+ parsed_response = post_scrape_request
22
22
  raise_if_challenge_blocked!(parsed_response)
23
23
  raise_if_upstream_failed!(parsed_response)
24
24
  build_response(parsed_response)
@@ -30,6 +30,11 @@ module Html2rss
30
30
 
31
31
  private
32
32
 
33
+ def post_scrape_request
34
+ transport_response = client.post('/scrape', JSON.generate(contract.request_payload), content_type_header)
35
+ contract.parse_response(transport_response)
36
+ end
37
+
33
38
  def validate_request!
34
39
  ctx.budget.consume!
35
40
  ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
@@ -76,7 +81,10 @@ module Html2rss
76
81
  end
77
82
 
78
83
  def request_options
79
- { timeout: ctx.policy.total_timeout_seconds }
84
+ timeout = ctx.budget.remaining_timeout_seconds || ctx.policy.total_timeout_seconds
85
+ raise RequestTimedOut, 'Request timed out' if timeout <= 0
86
+
87
+ { timeout: timeout.to_i }
80
88
  end
81
89
 
82
90
  def content_type_header
@@ -37,6 +37,7 @@ module Html2rss
37
37
  # @return [Response] normalized request response
38
38
  # @raise [RequestTimedOut] if the browser session exceeds the configured timeout
39
39
  def execute
40
+ check_timeout!
40
41
  validate_request!
41
42
  execute_browserless_request
42
43
  rescue Puppeteer::TimeoutError => error
@@ -71,7 +72,10 @@ module Html2rss
71
72
  end
72
73
 
73
74
  def protocol_timeout_ms
74
- ctx.policy.total_timeout_seconds * 1000
75
+ timeout = ctx.budget.remaining_timeout_seconds || ctx.policy.total_timeout_seconds
76
+ raise RequestTimedOut, 'Request timed out' if timeout <= 0
77
+
78
+ (timeout * 1000).to_i
75
79
  end
76
80
 
77
81
  def connect_with_timeout_support(&)
@@ -5,14 +5,18 @@ module Html2rss
5
5
  ##
6
6
  # Tracks how many outbound requests a single feed build may still perform.
7
7
  class Budget
8
+ ##
8
9
  ##
9
10
  # @param max_requests [Integer] the maximum number of requests allowed
10
- def initialize(max_requests:)
11
+ # @param total_timeout_seconds [Integer, nil] the total timeout for the feed build
12
+ def initialize(max_requests:, total_timeout_seconds: nil)
11
13
  unless max_requests.is_a?(Integer) && max_requests.positive?
12
14
  raise ArgumentError, 'max_requests must be positive'
13
15
  end
14
16
 
15
17
  @remaining = max_requests
18
+ @start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
19
+ @total_timeout_seconds = total_timeout_seconds
16
20
  @mutex = Mutex.new
17
21
  end
18
22
 
@@ -34,6 +38,16 @@ module Html2rss
34
38
  def remaining
35
39
  @mutex.synchronize { @remaining }
36
40
  end
41
+
42
+ ##
43
+ # @return [Float, nil] the remaining timeout in seconds, or nil if not tracked
44
+ def remaining_timeout_seconds
45
+ return unless @total_timeout_seconds
46
+
47
+ elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - @start_time
48
+ remaining = @total_timeout_seconds - elapsed
49
+ [remaining, 0.0].max
50
+ end
37
51
  end
38
52
  end
39
53
  end
@@ -77,7 +77,9 @@ module Html2rss
77
77
  raise ArgumentError, 'policy must not be nil' if @policy.nil?
78
78
 
79
79
  @origin_url = normalized_origin_url(request_options[:origin_url])
80
- @budget = request_options.fetch(:budget) { Budget.new(max_requests: policy.max_requests) }
80
+ @budget = request_options.fetch(:budget) do
81
+ Budget.new(max_requests: policy.max_requests, total_timeout_seconds: policy.total_timeout_seconds)
82
+ end
81
83
  raise ArgumentError, 'budget must not be nil' if @budget.nil?
82
84
  end
83
85
 
@@ -34,6 +34,7 @@ module Html2rss
34
34
  # SSRF protection here is pre-connection only (DNS resolution via Policy).
35
35
  # A DNS rebinding attack between resolution and connect cannot be caught at this layer.
36
36
  def execute
37
+ check_timeout!
37
38
  deadline = request_deadline
38
39
  response_guard, response = perform_request(deadline:)
39
40
  response_guard.inspect_body!(response.body)
@@ -45,7 +46,7 @@ module Html2rss
45
46
  private
46
47
 
47
48
  def request_deadline
48
- monotonic_now + ctx.policy.total_timeout_seconds
49
+ monotonic_now + (ctx.budget.remaining_timeout_seconds || ctx.policy.total_timeout_seconds)
49
50
  end
50
51
 
51
52
  def perform_request(deadline:)
@@ -23,6 +23,11 @@ module Html2rss
23
23
 
24
24
  # @return [Context] the context for the request
25
25
  attr_reader :ctx
26
+
27
+ def check_timeout!
28
+ remaining = ctx.budget.remaining_timeout_seconds
29
+ raise RequestTimedOut, 'Request timed out' if remaining && remaining <= 0
30
+ end
26
31
  end
27
32
  end
28
33
  end
@@ -4,6 +4,6 @@
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
6
  # Current application version.
7
- VERSION = '0.22.1'
7
+ VERSION = '0.22.2'
8
8
  public_constant :VERSION
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.22.1
4
+ version: 0.22.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
@@ -381,7 +381,7 @@ licenses:
381
381
  - MIT
382
382
  metadata:
383
383
  allowed_push_host: https://rubygems.org
384
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.22.1
384
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.22.2
385
385
  rubygems_mfa_required: 'true'
386
386
  rdoc_options: []
387
387
  require_paths: