html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113

```diff
--- /dev/null
+++ b/data/lib/html2rss/request_service/botasaurus_contract.rb
@@ -0,0 +1,161 @@
+# frozen_string_literal: true
+
+require 'json'
+
+##
+# Main html2rss namespace.
+module Html2rss
+  ##
+  # Request transport orchestration and strategies.
+  class RequestService
+    ##
+    # Maps html2rss request/response handling to the botasaurus-scrape-api contract.
+    class BotasaurusContract
+      # Default Botasaurus scrape options when no explicit config is provided.
+      DEFAULT_OPTIONS = {
+        navigation_mode: 'auto',
+        max_retries: 2,
+        headless: false
+      }.freeze
+
+      # Allowlisted request.botasaurus keys forwarded to upstream.
+      OPTION_KEYS = %i[
+        navigation_mode
+        max_retries
+        wait_for_selector
+        wait_timeout_seconds
+        block_images
+        block_images_and_css
+        wait_for_complete_page_load
+        headless
+        proxy
+        user_agent
+        window_size
+        lang
+      ].freeze
+
+      # Parsed Botasaurus response wrapper.
+      class ParsedResponse
+        # Fallback headers when upstream omits response headers.
+        DEFAULT_HEADERS = { 'content-type' => 'text/html' }.freeze
+
+        # @param payload [Hash{String => Object}] parsed Botasaurus response payload
+        # @param transport_status [Integer] HTTP status returned by Botasaurus
+        def initialize(payload:, transport_status:)
+          @payload = payload
+          @transport_status = transport_status
+        end
+
+        # @return [Boolean] true when upstream classified request as challenge blocked
+        def challenge_block? = error_category == 'challenge_block'
+
+        # @return [Boolean] true when upstream returned non-200 or an error payload
+        def upstream_failure?
+          status != 200 || error_message?
+        end
+
+        # @return [String] normalized challenge error message
+        def challenge_message
+          error || 'Botasaurus challenge block detected.'
+        end
+
+        # @return [String] actionable upstream failure summary
+        def upstream_failure_message
+          details = ["status=#{status}"]
+          details << "error_category=#{error_category}" if error_category
+          details << "error=#{error}" if error
+          details << "request_id=#{request_id}" if request_id
+          "Botasaurus scrape failed (#{details.join(', ')})."
+        end
+
+        # @return [String] rendered HTML body from Botasaurus
+        # @raise [BotasaurusConnectionFailed] when html is missing
+        def html
+          value = payload['html']
+          raise BotasaurusConnectionFailed, "Botasaurus response missing required 'html' field" if value.nil?
+
+          value.to_s
+        end
+
+        # @return [Hash{String => String}] normalized response headers
+        def headers
+          raw_headers = payload['headers']
+          return DEFAULT_HEADERS.dup unless raw_headers.is_a?(Hash) && raw_headers.any?
+
+          raw_headers.to_h { |key, value| [key.to_s, value.to_s] }
+        end
+
+        # @return [Integer] resolved status code (payload status_code or transport status)
+        def status
+          status_code = payload['status_code']
+          status_code.is_a?(Integer) ? status_code : transport_status
+        end
+
+        # @return [String, nil] final URL reported by upstream
+        def final_url = payload['final_url']
+
+        private
+
+        attr_reader :payload, :transport_status
+
+        def error = payload['error']
+
+        def request_id = payload['request_id']
+
+        def error_category = payload['error_category']
+
+        def error_message?
+          value = error
+          value.is_a?(String) ? !value.empty? : !value.nil?
+        end
+      end
+
+      ##
+      # @param url [Html2rss::Url] canonical URL to scrape
+      # @param options [Hash] validated request.botasaurus options
+      # @option options [String] :navigation_mode
+      # @option options [Integer] :max_retries
+      # @option options [String] :wait_for_selector
+      # @option options [Integer] :wait_timeout_seconds
+      # @option options [Boolean] :block_images
+      # @option options [Boolean] :block_images_and_css
+      # @option options [Boolean] :wait_for_complete_page_load
+      # @option options [Boolean] :headless
+      # @option options [String] :proxy
+      # @option options [String] :user_agent
+      # @option options [Array<Integer>] :window_size
+      # @option options [String] :lang
+      def initialize(url:, options: {})
+        @url = url
+        @options = options
+      end
+
+      # @return [Hash] payload for POST /scrape
+      def request_payload
+        DEFAULT_OPTIONS.merge(filtered_options).merge(url: url.to_s)
+      end
+
+      # @param transport_response [Faraday::Response] upstream HTTP response
+      # @return [ParsedResponse]
+      # @raise [BotasaurusConnectionFailed] when payload is not valid JSON object
+      def parse_response(transport_response)
+        payload = JSON.parse(transport_response.body.to_s)
+        raise BotasaurusConnectionFailed, 'Botasaurus response must be a JSON object' unless payload.is_a?(Hash)
+
+        ParsedResponse.new(payload:, transport_status: transport_response.status)
+      rescue JSON::ParserError => error
+        raise BotasaurusConnectionFailed, "Botasaurus response JSON parse failed: #{error.message}"
+      end
+
+      private
+
+      attr_reader :url, :options
+
+      def filtered_options
+        OPTION_KEYS.each_with_object({}) do |key, normalized|
+          normalized[key] = options[key] if options.key?(key)
+        end
+      end
+    end
+  end
+end
```
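
For orientation, a minimal round-trip through the contract above — a sketch, not part of the diff. The `Struct` stands in for a real `Faraday::Response`, and `Html2rss::Url.from_absolute` is assumed from the new `url.rb` listed in the file summary:

```ruby
require 'json'

# Hypothetical round-trip; the Struct mimics the (body, status) shape of a Faraday::Response.
contract = Html2rss::RequestService::BotasaurusContract.new(
  url: Html2rss::Url.from_absolute('https://example.com/news'),
  options: { wait_for_selector: 'article', block_images: true, ignored: 'dropped' }
)

contract.request_payload
# => { navigation_mode: 'auto', max_retries: 2, headless: false,
#      wait_for_selector: 'article', block_images: true,
#      url: 'https://example.com/news' }
# :ignored is absent — only OPTION_KEYS survive filtered_options.

fake_response = Struct.new(:body, :status).new('{"html":"<p>hi</p>"}', 200)
parsed = contract.parse_response(fake_response)
parsed.html    # => "<p>hi</p>"
parsed.status  # => 200 (no "status_code" in the payload, so the transport status wins)
parsed.headers # => { "content-type" => "text/html" } (DEFAULT_HEADERS fallback)
```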

```diff
--- /dev/null
+++ b/data/lib/html2rss/request_service/botasaurus_strategy.rb
@@ -0,0 +1,98 @@
+# frozen_string_literal: true
+
+require 'faraday'
+require 'json'
+
+module Html2rss
+  class RequestService
+    ##
+    # Strategy to delegate fetching to a Botasaurus scrape API.
+    class BotasaurusStrategy < Strategy
+      ##
+      # Executes a Botasaurus-backed request with shared request policy guards.
+      #
+      # @return [Response] normalized request response
+      # @raise [BotasaurusConfigurationError] when BOTASAURUS_SCRAPER_URL is missing or invalid
+      # @raise [BotasaurusConnectionFailed] when Botasaurus cannot be reached or returns an invalid payload
+      # @raise [RequestTimedOut] when the Botasaurus request exceeds configured timeout
+      def execute
+        validate_request!
+        transport_response = client.post('/scrape', JSON.generate(contract.request_payload), content_type_header)
+        parsed_response = contract.parse_response(transport_response)
+        raise_if_challenge_blocked!(parsed_response)
+        raise_if_upstream_failed!(parsed_response)
+        build_response(parsed_response)
+      rescue Faraday::TimeoutError, Timeout::Error => error
+        raise RequestTimedOut, error.message
+      rescue Faraday::ConnectionFailed, Faraday::SSLError => error
+        raise BotasaurusConnectionFailed, "Botasaurus connection failed: #{error.message}"
+      end
+
+      private
+
+      def validate_request!
+        ctx.budget.consume!
+        ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
+      end
+
+      def build_response(parsed_response)
+        body = parsed_response.html
+        ResponseGuard.new(policy: ctx.policy).inspect_body!(body)
+
+        Response.new(
+          body:,
+          headers: parsed_response.headers,
+          url: response_url(parsed_response.final_url),
+          status: parsed_response.status
+        )
+      end
+
+      def raise_if_challenge_blocked!(parsed_response)
+        return unless parsed_response.challenge_block?
+
+        raise BlockedSurfaceDetected, "Blocked surface detected: #{parsed_response.challenge_message}"
+      end
+
+      def raise_if_upstream_failed!(parsed_response)
+        return unless parsed_response.upstream_failure?
+
+        raise BotasaurusConnectionFailed, parsed_response.upstream_failure_message
+      end
+
+      def response_url(final_url)
+        return ctx.url if final_url.nil?
+
+        Html2rss::Url.from_absolute(final_url)
+      rescue ArgumentError
+        ctx.url
+      end
+
+      def contract
+        @contract ||= BotasaurusContract.new(url: ctx.url, options: ctx.request.fetch(:botasaurus, {}))
+      end
+
+      def client
+        @client ||= Faraday.new(url: scraper_base_url.to_s, request: request_options)
+      end
+
+      def request_options
+        { timeout: ctx.policy.total_timeout_seconds }
+      end
+
+      def content_type_header
+        { 'Content-Type' => 'application/json' }
+      end
+
+      def scraper_base_url
+        @scraper_base_url ||= begin
+          configured = ENV.fetch('BOTASAURUS_SCRAPER_URL') do
+            raise BotasaurusConfigurationError, 'BOTASAURUS_SCRAPER_URL is required for strategy=botasaurus.'
+          end
+          Html2rss::Url.for_channel(configured)
+        rescue ArgumentError => error
+          raise BotasaurusConfigurationError, "BOTASAURUS_SCRAPER_URL is invalid: #{error.message}"
+        end
+      end
+    end
+  end
+end
```
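
How the strategy's error taxonomy plays out, sketched under the assumption that the error constants from the new `error.rb` resolve as written in the hunk: a payload categorized `challenge_block` surfaces as `BlockedSurfaceDetected`, while any other non-200 or error payload surfaces as `BotasaurusConnectionFailed`.

```ruby
# Sketch; the Struct again stands in for a Faraday::Response.
contract = Html2rss::RequestService::BotasaurusContract.new(
  url: Html2rss::Url.from_absolute('https://example.com/')
)
blocked = Struct.new(:body, :status).new(
  '{"html":"","status_code":403,"error":"challenge page served","error_category":"challenge_block"}',
  200
)
parsed = contract.parse_response(blocked)
parsed.challenge_block?  # => true
parsed.challenge_message # => "challenge page served"
# execute therefore raises:
#   BlockedSurfaceDetected: Blocked surface detected: challenge page served
# whereas a plain failure (say status_code 500 with no error_category) raises:
#   BotasaurusConnectionFailed: Botasaurus scrape failed (status=500).
```

Independently of the payload, `execute` refuses to run without `BOTASAURUS_SCRAPER_URL`, raising `BotasaurusConfigurationError` before any request leaves the process.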

```diff
--- a/data/lib/html2rss/request_service/browserless_strategy.rb
+++ b/data/lib/html2rss/request_service/browserless_strategy.rb
@@ -31,23 +31,119 @@ module Html2rss
     # are aligned with the default values.
     # @see https://github.com/browserless/browserless/pkgs/container/chromium
     class BrowserlessStrategy < Strategy
-
+      ##
+      # Executes a Browserless-backed request with the shared request policy.
+      #
+      # @return [Response] normalized request response
+      # @raise [RequestTimedOut] if the browser session exceeds the configured timeout
       def execute
-
-
-
-
-        end
+        validate_request!
+        execute_browserless_request
+      rescue Puppeteer::TimeoutError => error
+        raise RequestTimedOut, error.message
       end
 
+      ##
+      # @return [String] the Browserless websocket endpoint with token query param
+      # @raise [ArgumentError] if a custom endpoint is configured without an API token
       def browser_ws_endpoint
         @browser_ws_endpoint ||= begin
-          api_token = ENV.fetch('BROWSERLESS_IO_API_TOKEN', '6R0W53R135510')
           ws_url = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', 'ws://127.0.0.1:3000')
+          api_token = browserless_api_token(ws_url)
 
           "#{ws_url}?token=#{api_token}"
         end
       end
+
+      private
+
+      def validate_request!
+        ctx.budget.consume!
+        ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
+      end
+
+      def execute_browserless_request
+        connect_with_timeout_support do |browser|
+          PuppetCommander.new(ctx, browser).call
+        ensure
+          browser.disconnect
+        end
+      end
+
+      def protocol_timeout_ms
+        ctx.policy.total_timeout_seconds * 1000
+      end
+
+      def connect_with_timeout_support(&)
+        connect_browserless(protocol_timeout: protocol_timeout_ms, &)
+      rescue ArgumentError => error
+        raise unless unsupported_protocol_timeout?(error)
+
+        connect_browserless(&)
+      end
+
+      def unsupported_protocol_timeout?(error)
+        error.message.include?('unknown keyword: :protocol_timeout')
+      end
+
+      def connect_browserless(protocol_timeout: nil, &)
+        connected = false
+
+        Puppeteer.connect(**browserless_connect_options(protocol_timeout)) do |browser|
+          connected = true
+          yield browser
+        end
+      rescue ArgumentError => error
+        handle_connection_error(error, connected:, protocol_timeout:)
+      rescue StandardError => error
+        handle_connection_error(error, connected:)
+      end
+
+      def browserless_connect_options(protocol_timeout)
+        { browser_ws_endpoint:, protocol_timeout: }.compact
+      end
+
+      def handle_connection_error(error, connected:, protocol_timeout: nil)
+        raise if connected || compatibility_timeout_error?(error, protocol_timeout:)
+
+        raise BrowserlessConnectionFailed, browserless_connection_message(error), cause: error
+      end
+
+      def compatibility_timeout_error?(error, protocol_timeout:)
+        protocol_timeout && unsupported_protocol_timeout?(error)
+      end
+
+      def browserless_connection_message(error)
+        base = "Browserless connection failed (#{error.class}: #{error.message})."
+        endpoint_hint = "Check BROWSERLESS_IO_WEBSOCKET_URL (currently #{browserless_websocket_url})."
+        token_hint = 'Check BROWSERLESS_IO_API_TOKEN and ensure it matches your Browserless TOKEN.'
+        local_hint = 'For local Browserless, confirm the service is running and reachable.'
+
+        if likely_authentication_error?(error)
+          "#{base} #{token_hint} #{endpoint_hint}"
+        else
+          "#{base} #{endpoint_hint} #{token_hint} #{local_hint}"
+        end
+      end
+
+      def likely_authentication_error?(error)
+        message = error.message.downcase
+        message.include?('unauthorized') || message.include?('forbidden') || message.include?('401')
+      end
+
+      def browserless_websocket_url
+        ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', 'ws://127.0.0.1:3000')
+      end
+
+      def browserless_api_token(ws_url)
+        ENV.fetch('BROWSERLESS_IO_API_TOKEN') do
+          return '6R0W53R135510' if ws_url == 'ws://127.0.0.1:3000'
+
+          raise BrowserlessConfigurationError,
+                'BROWSERLESS_IO_API_TOKEN is required for custom Browserless endpoints. ' \
+                'Set BROWSERLESS_IO_API_TOKEN or use ws://127.0.0.1:3000 for local defaults.'
+        end
+      end
     end
   end
 end
```
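
The behavioral change hiding in this hunk: 0.17.0 unconditionally fell back to the Browserless demo token, while `browserless_api_token` now restricts that fallback to the local default endpoint. A sketch of the observable difference — instance construction is elided since the base `Strategy` API is not part of this diff, and note that `browser_ws_endpoint` is memoized per instance:

```ruby
# `strategy` and `fresh_strategy` are assumed BrowserlessStrategy instances (construction elided).

# Local default endpoint without a token: the demo token fallback still applies.
ENV.delete('BROWSERLESS_IO_API_TOKEN')
ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = 'ws://127.0.0.1:3000'
strategy.browser_ws_endpoint
# => "ws://127.0.0.1:3000?token=6R0W53R135510"

# Custom endpoint without a token: fail fast instead of leaking the demo token upstream.
ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = 'wss://browserless.example.com'
fresh_strategy.browser_ws_endpoint
# raises BrowserlessConfigurationError:
#   BROWSERLESS_IO_API_TOKEN is required for custom Browserless endpoints.
#   Set BROWSERLESS_IO_API_TOKEN or use ws://127.0.0.1:3000 for local defaults.
```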

```diff
--- /dev/null
+++ b/data/lib/html2rss/request_service/budget.rb
@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+
+module Html2rss
+  class RequestService
+    ##
+    # Tracks how many outbound requests a single feed build may still perform.
+    class Budget
+      ##
+      # @param max_requests [Integer] the maximum number of requests allowed
+      def initialize(max_requests:)
+        unless max_requests.is_a?(Integer) && max_requests.positive?
+          raise ArgumentError, 'max_requests must be positive'
+        end
+
+        @remaining = max_requests
+        @mutex = Mutex.new
+      end
+
+      ##
+      # Consumes one request from the budget.
+      #
+      # @return [Integer] remaining request count after consumption
+      # @raise [RequestBudgetExceeded] if no requests remain
+      def consume!
+        @mutex.synchronize do
+          raise RequestBudgetExceeded, 'Request budget exhausted' if @remaining.zero?
+
+          @remaining -= 1
+        end
+      end
+
+      ##
+      # @return [Integer] requests still available
+      def remaining
+        @mutex.synchronize { @remaining }
+      end
+    end
+  end
+end
```
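
`Budget` is a small mutex-guarded countdown shared across one feed build. A usage sketch; the exact namespace of `RequestBudgetExceeded` (defined in the new `error.rb`) is assumed here:

```ruby
budget = Html2rss::RequestService::Budget.new(max_requests: 2)

budget.consume!  # => 1 (remaining count after consumption)
budget.consume!  # => 0
budget.remaining # => 0
budget.consume!  # raises RequestBudgetExceeded: Request budget exhausted

Html2rss::RequestService::Budget.new(max_requests: 0)
# raises ArgumentError: max_requests must be positive
```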

```diff
--- a/data/lib/html2rss/request_service/context.rb
+++ b/data/lib/html2rss/request_service/context.rb
@@ -1,45 +1,101 @@
 # frozen_string_literal: true
 
-require 'addressable/uri'
-
 module Html2rss
   class RequestService
     ##
     # Holds information needed to send requests to websites.
     # To be passed down to the RequestService's strategies.
     class Context
-      SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
-
       ##
-      # @param url [String,
+      # @param url [String, Html2rss::Url] the URL to request
       # @param headers [Hash] HTTP request headers
-
-
-
-
-
+      # @param request [Hash] request specific options passed to strategies
+      # @param request_options [Hash] runtime request options
+      # @option request_options [Symbol] :relation why this request is being made
+      # @option request_options [String, Html2rss::Url, nil] :origin_url originating URL for same-origin checks
+      # @option request_options [Policy] :policy runtime request policy
+      # @option request_options [Budget] :budget shared request budget for the feed build
+      # @raise [ArgumentError] if policy or budget is explicitly nil
+      def initialize(url:, headers: {}, request: {}, **request_options)
+        @url = Html2rss::Url.from_absolute(url)
+        @headers = normalize_headers(headers).freeze
+        @request = normalize_request(request).freeze
+        assign_request_options(request_options)
       end
 
-      # @return [
+      # @return [Html2rss::Url] the parsed and normalized URL
       attr_reader :url
 
       # @return [Hash] the HTTP request headers
       attr_reader :headers
 
-
+      # @return [Hash] the request specific options
+      attr_reader :request
+
+      # @return [Hash] browserless specific options
+      def browserless = request.fetch(:browserless, {})
+
+      # @return [Hash, nil] preload options for browserless requests
+      def browserless_preload = browserless[:preload]
+
+      # @return [Symbol] the request relation
+      attr_reader :relation
+
+      # @return [Html2rss::Url] the initial URL for the feed build
+      attr_reader :origin_url
+
+      # @return [Policy] the runtime request policy
+      attr_reader :policy
+
+      # @return [Budget] the shared request budget
+      attr_reader :budget
 
       ##
-      #
-      #
-      # @
-
-
-
+      # Builds a follow-up request context sharing headers, budget, and policy.
+      #
+      # @param url [String, Html2rss::Url] the follow-up URL
+      # @param relation [Symbol] why the follow-up is being made
+      # @param origin_url [String, Html2rss::Url] effective origin for same-origin checks
+      # @return [Context] derived request context
+      def follow_up(url:, relation:, origin_url: self.origin_url)
+        self.class.new(
+          url:,
+          headers:,
+          request:,
+          relation:,
+          origin_url:,
+          policy:,
+          budget:
+        )
+      end
+
+      private
 
-
+      def assign_request_options(request_options)
+        @relation = request_options.fetch(:relation, :initial)
+        @policy = request_options.fetch(:policy, Policy.default)
+        raise ArgumentError, 'policy must not be nil' if @policy.nil?
+
+        @origin_url = normalized_origin_url(request_options[:origin_url])
+        @budget = request_options.fetch(:budget) { Budget.new(max_requests: policy.max_requests) }
+        raise ArgumentError, 'budget must not be nil' if @budget.nil?
+      end
+
+      def normalized_origin_url(origin_url)
+        source = origin_url || @url
+        Html2rss::Url.from_absolute(source)
+      end
+
+      def normalize_headers(headers)
+        headers.to_h do |key, value|
+          [key.to_s, value]
+        end
+      end
 
-
-
+      def normalize_request(request)
+        normalized = HashUtil.deep_symbolize_keys(request, context: 'request')
+        HashUtil.assert_symbol_keys!(normalized, context: 'request')
+        normalized
       end
     end
   end
```