html2rss 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -657
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +7 -4
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +120 -46
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'cgi'
|
|
4
|
+
|
|
5
|
+
module Html2rss
  module Rendering
    ##
    # Produces the markup for an HTML `<video>` element that wraps a single
    # `<source>` child built from a URL and a media type.
    class VideoRenderer
      # Opening tag emitted verbatim for every rendered video.
      OPENING_TAG = '<video controls preload="none" referrerpolicy="no-referrer" ' \
                    'crossorigin="anonymous" playsinline>'
      private_constant :OPENING_TAG

      ##
      # @param url [#to_s] value written to the `src` attribute
      # @param type [#to_s] value written to the `type` attribute
      def initialize(url:, type:)
        @url, @type = url, type
      end

      ##
      # @return [String] the `<video>` element; both attribute values are HTML-escaped
      def to_html
        [
          OPENING_TAG,
          %(<source src="#{html_escape(@url)}" type="#{html_escape(@type)}">),
          '</video>'
        ].join("\n")
      end

      private

      # Stringifies the value and escapes it for use inside an HTML attribute.
      def html_escape(value)
        CGI.escapeHTML(value.to_s)
      end
    end
  end
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  ##
  # Home of the renderers that produce rich HTML content (images, audio,
  # video, or embedded documents) for feed descriptions.
  #
  # @example
  #   Html2rss::Rendering::ImageRenderer.new(...).to_html
  #   Html2rss::Rendering::MediaRenderer.for(...)
  #
  # @see Html2rss::Rendering::DescriptionBuilder
  module Rendering; end
end
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  ##
  # Frozen value object pairing the runtime request controls (strategy,
  # redirect limit, request budget) with the list of controls the caller
  # actually supplied.
  class RequestControls
    TOP_LEVEL_KEYS = %i[strategy].freeze
    REQUEST_KEYS = %i[max_redirects max_requests].freeze

    class << self
      ##
      # Reads the controls out of a raw config hash. Symbol keys are tried
      # first, then their String form; `max_redirects` and `max_requests`
      # are looked up under the nested `request` hash.
      #
      # @param config [Hash<Symbol, Object>, Hash<String, Object>] raw config input
      # @return [RequestControls] request controls extracted from the config hash
      def from_config(config)
        new(
          strategy: value_for(config, :strategy),
          max_redirects: request_value_for(config, :max_redirects),
          max_requests: request_value_for(config, :max_requests),
          explicit_keys: explicit_keys_for(config)
        )
      end

      private

      # Lists every known control whose key is present in the config,
      # regardless of the value stored under it.
      def explicit_keys_for(config)
        top_level = TOP_LEVEL_KEYS.select { |key| top_level_key?(config, key) }
        nested = REQUEST_KEYS.select { |key| request_key?(config, key) }
        top_level + nested
      end

      # Returns the value stored under the Symbol key, else under the String
      # key, else nil.
      def value_for(config, key)
        matching_key = [key, key.to_s].find { |candidate| config.key?(candidate) }
        config[matching_key] if matching_key
      end

      # Looks the key up inside the nested `request` hash; nil when that
      # entry is missing or not a Hash.
      def request_value_for(config, key)
        nested = value_for(config, :request)
        nested.is_a?(Hash) ? value_for(nested, key) : nil
      end

      # True when the hash has the key in Symbol or String form.
      def top_level_key?(config, key)
        [key, key.to_s].any? { |candidate| config.key?(candidate) }
      end

      # True when the nested `request` hash exists and has the key.
      def request_key?(config, key)
        nested = value_for(config, :request)
        return false unless nested.is_a?(Hash)

        top_level_key?(nested, key)
      end
    end

    ##
    # @param strategy [Symbol, nil] effective request strategy
    # @param max_redirects [Integer, nil] effective redirect limit
    # @param max_requests [Integer, nil] effective request budget
    # @param explicit_keys [Array<Symbol>] controls explicitly supplied by the caller
    def initialize(strategy: nil, max_redirects: nil, max_requests: nil, explicit_keys: [])
      @strategy = strategy
      @max_redirects = max_redirects
      @max_requests = max_requests
      # Normalized to unique symbols so explicit? can accept Strings too.
      @explicit_keys = explicit_keys.map(&:to_sym).uniq.freeze
      freeze
    end

    ##
    # @return [Symbol, nil] effective request strategy
    attr_reader :strategy

    ##
    # @return [Integer, nil] effective redirect limit
    attr_reader :max_redirects

    ##
    # @return [Integer, nil] effective request budget
    attr_reader :max_requests

    ##
    # @param name [Symbol, String] request control name
    # @return [Boolean] whether the control was explicitly supplied
    def explicit?(name)
      explicit_keys.include?(name.to_sym)
    end

    ##
    # Builds a copy carrying new values while keeping the same explicit keys.
    #
    # @param strategy [Symbol, nil] validated request strategy
    # @param max_redirects [Integer, nil] validated redirect limit
    # @param max_requests [Integer, nil] validated request budget
    # @return [RequestControls] controls updated with validated effective values
    def with_effective_values(strategy:, max_redirects:, max_requests:)
      self.class.new(
        strategy: strategy,
        max_redirects: max_redirects,
        max_requests: max_requests,
        explicit_keys: explicit_keys
      )
    end

    ##
    # Writes only the explicitly supplied controls into the given hash,
    # using Symbol keys (`:strategy`, and `:request` for the nested values).
    #
    # @param config [Hash<Symbol, Object>] mutable config hash
    # @return [Hash<Symbol, Object>] the same hash with explicit controls written
    # @raise [ArgumentError] if `config[:request]` is present but not a Hash
    #   and a nested control has to be written
    def apply_to(config)
      config[:strategy] = strategy if explicit?(:strategy)
      { max_redirects: max_redirects, max_requests: max_requests }.each do |key, value|
        apply_request_value(config, key, value)
      end
      config
    end

    private

    attr_reader :explicit_keys

    # Stores one nested control under config[:request], but only if explicit.
    def apply_request_value(config, key, value)
      if explicit?(key)
        ensure_request_config!(config)
        config[:request][key] = value
      end
    end

    # Creates config[:request] when absent; rejects non-Hash values.
    def ensure_request_config!(config)
      case config[:request]
      when nil then config[:request] = {}
      when Hash then nil
      else raise ArgumentError, 'request config must be a hash'
      end
    end
  end
end
|
|
@@ -31,23 +31,119 @@ module Html2rss
|
|
|
31
31
|
# are aligned with the default values.
|
|
32
32
|
# @see https://github.com/browserless/browserless/pkgs/container/chromium
|
|
33
33
|
class BrowserlessStrategy < Strategy
  ##
  # Consumes one request from the budget, validates the request against the
  # policy, then performs it through a Browserless connection.
  #
  # @return [Response] normalized request response
  # @raise [RequestTimedOut] when Puppeteer raises a timeout error
  def execute
    validate_request!
    execute_browserless_request
  rescue Puppeteer::TimeoutError => error
    raise RequestTimedOut, error.message
  end

  ##
  # Memoized websocket endpoint built from BROWSERLESS_IO_WEBSOCKET_URL
  # (default ws://127.0.0.1:3000) plus the API token as `token` query param.
  #
  # @return [String] the Browserless websocket endpoint with token query param
  # @raise [BrowserlessConfigurationError] if a non-default endpoint is
  #   configured without BROWSERLESS_IO_API_TOKEN
  def browser_ws_endpoint
    return @browser_ws_endpoint if @browser_ws_endpoint

    ws_url = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', 'ws://127.0.0.1:3000')
    api_token = browserless_api_token(ws_url)
    @browser_ws_endpoint = "#{ws_url}?token=#{api_token}"
  end

  private

  # Spends budget first, then lets the policy accept or reject the URL.
  def validate_request!
    ctx.budget.consume!
    ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
  end

  # Runs the PuppetCommander against a connected browser and always
  # disconnects afterwards, even when the command raises.
  def execute_browserless_request
    connect_with_timeout_support do |browser|
      begin
        PuppetCommander.new(ctx, browser).call
      ensure
        browser.disconnect
      end
    end
  end

  # Policy total timeout converted from seconds to milliseconds.
  def protocol_timeout_ms
    ctx.policy.total_timeout_seconds * 1000
  end

  # Connects with a protocol timeout; when the connect call rejects the
  # `protocol_timeout` keyword, connects again without it.
  def connect_with_timeout_support(&block)
    connect_browserless(protocol_timeout: protocol_timeout_ms, &block)
  rescue ArgumentError => error
    if unsupported_protocol_timeout?(error)
      connect_browserless(&block)
    else
      raise
    end
  end

  # Detects the ArgumentError Ruby raises for an unknown :protocol_timeout keyword.
  def unsupported_protocol_timeout?(error)
    error.message.include?('unknown keyword: :protocol_timeout')
  end

  # Opens the Puppeteer connection and yields the browser. Errors raised
  # before the block started running are treated as connection failures.
  def connect_browserless(protocol_timeout: nil, &)
    session_established = false

    Puppeteer.connect(**browserless_connect_options(protocol_timeout)) do |browser|
      session_established = true
      yield browser
    end
  rescue ArgumentError => error
    handle_connection_error(error, connected: session_established, protocol_timeout: protocol_timeout)
  rescue StandardError => error
    handle_connection_error(error, connected: session_established)
  end

  # Keyword arguments for Puppeteer.connect; the timeout is omitted when nil.
  def browserless_connect_options(protocol_timeout)
    options = { browser_ws_endpoint: browser_ws_endpoint }
    options[:protocol_timeout] = protocol_timeout unless protocol_timeout.nil?
    options
  end

  # Called from a rescue clause: re-raises the in-flight error untouched when
  # the session was already connected or when it is the keyword-compatibility
  # error; otherwise wraps it in BrowserlessConnectionFailed.
  def handle_connection_error(error, connected:, protocol_timeout: nil)
    raise if connected
    raise if compatibility_timeout_error?(error, protocol_timeout: protocol_timeout)

    raise BrowserlessConnectionFailed, browserless_connection_message(error), cause: error
  end

  # Truthy only when a protocol timeout was passed and the error is the
  # unknown-keyword error for it.
  def compatibility_timeout_error?(error, protocol_timeout:)
    protocol_timeout && unsupported_protocol_timeout?(error)
  end

  # Builds the failure message; authentication-looking errors lead with the
  # token hint and leave out the local-service hint.
  def browserless_connection_message(error)
    hints = {
      endpoint: "Check BROWSERLESS_IO_WEBSOCKET_URL (currently #{browserless_websocket_url}).",
      token: 'Check BROWSERLESS_IO_API_TOKEN and ensure it matches your Browserless TOKEN.',
      local: 'For local Browserless, confirm the service is running and reachable.'
    }
    order = likely_authentication_error?(error) ? %i[token endpoint] : %i[endpoint token local]

    ["Browserless connection failed (#{error.class}: #{error.message}).", *hints.values_at(*order)].join(' ')
  end

  # Case-insensitive check of the error message for auth-related markers.
  def likely_authentication_error?(error)
    lowered = error.message.downcase
    %w[unauthorized forbidden 401].any? { |marker| lowered.include?(marker) }
  end

  # Websocket URL without the token, used in error hints.
  def browserless_websocket_url
    ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', 'ws://127.0.0.1:3000')
  end

  # Returns BROWSERLESS_IO_API_TOKEN when set. Without it, the built-in
  # token is used only for the default local endpoint.
  def browserless_api_token(ws_url)
    configured = ENV.fetch('BROWSERLESS_IO_API_TOKEN', nil)
    return configured unless configured.nil?
    return '6R0W53R135510' if ws_url == 'ws://127.0.0.1:3000'

    raise BrowserlessConfigurationError,
          'BROWSERLESS_IO_API_TOKEN is required for custom Browserless endpoints. ' \
          'Set BROWSERLESS_IO_API_TOKEN or use ws://127.0.0.1:3000 for local defaults.'
  end
end
|
|
52
148
|
end
|
|
53
149
|
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class RequestService
    ##
    # Countdown of the outbound requests a single feed build may still
    # perform. Reads and writes of the counter go through a mutex.
    class Budget
      ##
      # @param max_requests [Integer] the maximum number of requests allowed
      # @raise [ArgumentError] unless max_requests is a positive Integer
      def initialize(max_requests:)
        acceptable = max_requests.is_a?(Integer) && max_requests.positive?
        raise ArgumentError, 'max_requests must be positive' unless acceptable

        @remaining = max_requests
        @mutex = Mutex.new
      end

      ##
      # Takes one request out of the budget.
      #
      # @return [Integer] remaining request count after consumption
      # @raise [RequestBudgetExceeded] if no requests remain
      def consume!
        @mutex.synchronize do
          if @remaining.zero?
            raise RequestBudgetExceeded, 'Request budget exhausted'
          end

          @remaining -= 1
        end
      end

      ##
      # @return [Integer] requests still available
      def remaining
        @mutex.synchronize do
          @remaining
        end
      end
    end
  end
end
|
|
@@ -1,45 +1,89 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require 'addressable/uri'
|
|
4
|
-
|
|
5
3
|
module Html2rss
|
|
6
4
|
class RequestService
|
|
7
5
|
##
|
|
8
6
|
# Holds information needed to send requests to websites.
|
|
9
7
|
# To be passed down to the RequestService's strategies.
|
|
10
8
|
class Context
  ##
  # @param url [String, Html2rss::Url] the URL to request
  # @param headers [Hash] HTTP request headers
  # @param request [Hash] request specific options passed to strategies;
  #   the given hash is frozen
  # @param request_options [Hash] runtime request options
  # @option request_options [Symbol] :relation why this request is being made
  #   (defaults to :initial)
  # @option request_options [String, Html2rss::Url, nil] :origin_url originating
  #   URL for same-origin checks (defaults to the request URL)
  # @option request_options [Policy] :policy runtime request policy
  #   (defaults to Policy.default)
  # @option request_options [Budget] :budget shared request budget for the feed
  #   build (defaults to a new Budget sized by the policy's max_requests)
  # @raise [ArgumentError] if policy or budget is explicitly nil
  def initialize(url:, headers: {}, request: {}, **request_options)
    @url = Html2rss::Url.from_absolute(url)
    @headers = headers
    @request = request.freeze
    assign_request_options(request_options)
  end

  # @return [Html2rss::Url] the URL as returned by Html2rss::Url.from_absolute
  attr_reader :url

  # @return [Hash] the HTTP request headers
  attr_reader :headers

  # @return [Hash] the frozen request specific options
  attr_reader :request

  # @return [Hash] the :browserless entry of the request options, or an empty hash
  def browserless
    request.fetch(:browserless, {})
  end

  # @return [Hash, nil] the :preload entry of the browserless options
  def browserless_preload
    browserless[:preload]
  end

  # @return [Symbol] the request relation
  attr_reader :relation

  # @return [Html2rss::Url] origin used for same-origin checks
  attr_reader :origin_url

  # @return [Policy] the runtime request policy
  attr_reader :policy

  # @return [Budget] the shared request budget
  attr_reader :budget

  ##
  # Derives a context for a follow-up request that reuses this context's
  # headers, request options, policy, and budget.
  #
  # @param url [String, Html2rss::Url] the follow-up URL
  # @param relation [Symbol] why the follow-up is being made
  # @param origin_url [String, Html2rss::Url] effective origin for same-origin checks
  # @return [Context] derived request context
  def follow_up(url:, relation:, origin_url: self.origin_url)
    self.class.new(
      url: url,
      headers: headers,
      request: request,
      relation: relation,
      origin_url: origin_url,
      policy: policy,
      budget: budget
    )
  end

  private

  # Resolves relation, policy, origin URL, and budget from the keyword
  # options. The policy is assigned before the budget because the default
  # budget is sized from policy.max_requests.
  def assign_request_options(request_options)
    @relation = request_options.fetch(:relation, :initial)

    fallback_policy = Policy.default
    @policy = request_options.fetch(:policy, fallback_policy)
    raise ArgumentError, 'policy must not be nil' if @policy.nil?

    @origin_url = normalized_origin_url(request_options[:origin_url])

    @budget = request_options.fetch(:budget) do
      Budget.new(max_requests: policy.max_requests)
    end
    raise ArgumentError, 'budget must not be nil' if @budget.nil?
  end

  # Falls back to the request URL when no origin was supplied.
  def normalized_origin_url(origin_url)
    Html2rss::Url.from_absolute(origin_url || @url)
  end
end
|
|
45
89
|
end
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
require 'faraday'
|
|
4
4
|
require 'faraday/follow_redirects'
|
|
5
|
+
require 'faraday/gzip'
|
|
5
6
|
|
|
6
7
|
module Html2rss
|
|
7
8
|
class RequestService
|
|
@@ -9,15 +10,144 @@ module Html2rss
|
|
|
9
10
|
# Strategy to use Faraday for the request.
|
|
10
11
|
# @see https://rubygems.org/gems/faraday
|
|
11
12
|
class FaradayStrategy < Strategy
  ##
  # Restores buffered streamed bytes so response middleware can process them.
  #
  # The request's on_data callback appends chunks to a String stored in the
  # request context under STREAM_BUFFER_KEY; on completion this middleware
  # takes that String out of the context and assigns it as the response body.
  class StreamingBodyMiddleware < Faraday::Middleware
    STREAM_BUFFER_KEY = :html2rss_stream_buffer

    ##
    # @param env [Faraday::Env] environment of the completed request
    # @return [void]
    def on_complete(env)
      # delete (rather than read) so the buffer is handed over only once
      buffer = env.request.context&.delete(STREAM_BUFFER_KEY)
      return if buffer.nil? || buffer.empty?

      env.body = buffer
    end
  end

  ##
  # NOTE: Unlike BrowserlessStrategy, Faraday does not expose the remote IP after connect.
  # SSRF protection here is pre-connection only (DNS resolution via Policy).
  # A DNS rebinding attack between resolution and connect cannot be caught at this layer.
  #
  # Executes a request with runtime policy enforcement.
  #
  # @return [Response] normalized request response
  # @raise [RequestTimedOut] when Faraday or Timeout reports a timeout, or
  #   the overall deadline has already passed
  def execute
    deadline = request_deadline
    response_guard, response = perform_request(deadline:)
    response_guard.inspect_body!(response.body)
    build_response(response)
  rescue Faraday::TimeoutError, Timeout::Error => error
    raise RequestTimedOut, error.message
  end

  private

  # Monotonic timestamp after which the whole request counts as timed out.
  def request_deadline
    monotonic_now + ctx.policy.total_timeout_seconds
  end

  # Performs the buffered streaming request and, when that yields an empty
  # successful response at a different final URL, one retry without buffer.
  #
  # @return [Array(ResponseGuard, Faraday::Response)]
  def perform_request(deadline:)
    response_guard = ResponseGuard.new(policy: ctx.policy)
    response = faraday_request(response_guard, deadline:, streaming_buffer: true)
    response = retry_without_streaming(response_guard, deadline:) if retry_without_streaming?(response)
    [response_guard, response]
  end

  # Converts the Faraday response into the service's Response object.
  def build_response(response)
    Response.new(body: response.body, headers: response.headers, url: response_url(response),
                 status: response.status)
  end

  # Optionally spends budget, then lets the policy accept or reject the URL.
  def validate_request!(consume_budget: true)
    ctx.budget.consume! if consume_budget
    ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
  end

  # Validates and issues one GET with timeouts and an on_data callback.
  #
  # NOTE(review): on_data is installed even when streaming_buffer is false;
  # in that case the callback has no buffer and the chunks are discarded.
  # Confirm the retry path still ends up with a populated response body.
  def faraday_request(response_guard, deadline:, streaming_buffer:, consume_budget: true)
    validate_request!(consume_budget:)

    client.get do |req|
      apply_timeouts(req, deadline:)
      buffer = prepare_stream_buffer(req) if streaming_buffer
      req.options.on_data = on_data_callback(response_guard, buffer)
    end
  end

  # Repeats the request without a stream buffer and without spending budget.
  def retry_without_streaming(response_guard, deadline:)
    faraday_request(response_guard, deadline:, streaming_buffer: false, consume_budget: false)
  end

  # Memoized Faraday connection: redirect following (limited and validated
  # through the policy), gzip request middleware, stream-buffer restoration,
  # then the default adapter.
  def client
    @client ||= Faraday.new(url: ctx.url.to_s, headers: ctx.headers) do |faraday|
      faraday.use Faraday::FollowRedirects::Middleware, limit: ctx.policy.max_redirects, callback: redirect_callback
      faraday.request :gzip
      faraday.use StreamingBodyMiddleware
      faraday.adapter Faraday.default_adapter
    end
  end

  # Sets the overall timeout to the time left until the deadline and caps
  # the connect/read timeouts at that same value.
  def apply_timeouts(request, deadline:)
    remaining_timeout = remaining_timeout_seconds(deadline)
    request.options.timeout = remaining_timeout
    request.options.open_timeout = [ctx.policy.connect_timeout_seconds, remaining_timeout].min
    request.options.read_timeout = [ctx.policy.read_timeout_seconds, remaining_timeout].min
  end

  # Stores a fresh mutable String in the request context and returns it.
  def prepare_stream_buffer(request)
    request.options.context ||= {}
    request.options.context[StreamingBodyMiddleware::STREAM_BUFFER_KEY] = +''
  end

  # Callback run per received chunk: the guard inspects the running byte
  # total and headers, then the chunk is appended when a buffer exists.
  def on_data_callback(response_guard, buffer)
    proc do |chunk, total_bytes, env|
      response_guard.inspect_chunk!(total_bytes:, headers: env&.response_headers)
      buffer&.<< chunk
    end
  end

  # Seconds left until the deadline.
  #
  # @raise [RequestTimedOut] if the deadline has been reached
  def remaining_timeout_seconds(deadline)
    remaining = deadline - monotonic_now
    raise RequestTimedOut, 'Request timed out' if remaining <= 0

    remaining
  end

  # True only for an empty-bodied, successful response whose final URL
  # differs from the requested one.
  def retry_without_streaming?(response)
    return false unless response.body.to_s.empty?
    return false unless response_success?(response)

    final_url = response.env&.url
    return false unless final_url

    final_url.to_s != ctx.url.to_s
  end

  # A missing status is treated as success; otherwise requires 2xx.
  def response_success?(response)
    status = response.status
    status.nil? || (200...300).cover?(status)
  end

  # Final URL reported by Faraday, or the requested URL when unavailable.
  def response_url(response)
    final_url = response.env&.url
    final_url ? normalize_url(final_url) : ctx.url
  end

  # Lambda handed to the redirect middleware; validates every hop
  # through the policy.
  def redirect_callback
    lambda do |old_env, new_env|
      from_url = normalize_url(old_env[:url])
      to_url = normalize_url(new_env[:url])
      ctx.policy.validate_redirect!(from_url:, to_url:, origin_url: ctx.origin_url, relation: ctx.relation)
    end
  end

  # Wraps any URL-like value in an Html2rss::Url via its string form.
  def normalize_url(url)
    Html2rss::Url.from_absolute(url.to_s)
  end

  # Monotonic clock reading, immune to wall-clock adjustments.
  def monotonic_now
    Process.clock_gettime(Process::CLOCK_MONOTONIC)
  end
end
|
|
23
153
|
end
|