html2rss 0.18.0 → 0.19.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -1
- data/lib/html2rss/articles/deduplicator.rb +1 -0
- data/lib/html2rss/auto_source/cleanup.rb +11 -0
- data/lib/html2rss/auto_source/scraper/html.rb +5 -0
- data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
- data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
- data/lib/html2rss/auto_source/scraper.rb +19 -1
- data/lib/html2rss/auto_source.rb +4 -0
- data/lib/html2rss/blocked_surface.rb +1 -0
- data/lib/html2rss/category_extractor.rb +2 -2
- data/lib/html2rss/cli.rb +30 -6
- data/lib/html2rss/config/class_methods.rb +24 -35
- data/lib/html2rss/config/dynamic_params.rb +6 -4
- data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
- data/lib/html2rss/config/request_headers.rb +9 -3
- data/lib/html2rss/config/schema.rb +33 -1
- data/lib/html2rss/config/validator.rb +40 -2
- data/lib/html2rss/config.rb +19 -13
- data/lib/html2rss/error.rb +25 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
- data/lib/html2rss/html_extractor.rb +5 -0
- data/lib/html2rss/html_navigator.rb +8 -0
- data/lib/html2rss/json_feed_builder.rb +1 -0
- data/lib/html2rss/rendering/audio_renderer.rb +8 -3
- data/lib/html2rss/rendering/description_builder.rb +0 -1
- data/lib/html2rss/rendering/image_renderer.rb +17 -7
- data/lib/html2rss/rendering/media_renderer.rb +4 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
- data/lib/html2rss/rendering/video_renderer.rb +8 -3
- data/lib/html2rss/rendering.rb +11 -2
- data/lib/html2rss/request_controls.rb +16 -21
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/context.rb +14 -2
- data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
- data/lib/html2rss/request_service/policy.rb +4 -0
- data/lib/html2rss/request_service/response.rb +9 -1
- data/lib/html2rss/request_service.rb +19 -0
- data/lib/html2rss/request_session/runtime_input.rb +16 -2
- data/lib/html2rss/request_session/runtime_policy.rb +7 -0
- data/lib/html2rss/request_session.rb +13 -9
- data/lib/html2rss/rss_builder/article.rb +22 -1
- data/lib/html2rss/rss_builder/channel.rb +11 -2
- data/lib/html2rss/rss_builder/enclosure.rb +15 -1
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
- data/lib/html2rss/rss_builder.rb +4 -0
- data/lib/html2rss/selectors/config.rb +1 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
- data/lib/html2rss/selectors/extractors/href.rb +2 -0
- data/lib/html2rss/selectors/extractors/html.rb +1 -0
- data/lib/html2rss/selectors/extractors/static.rb +2 -1
- data/lib/html2rss/selectors/extractors/text.rb +1 -0
- data/lib/html2rss/selectors/extractors.rb +2 -1
- data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
- data/lib/html2rss/selectors/post_processors/base.rb +13 -7
- data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
- data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
- data/lib/html2rss/selectors/post_processors/template.rb +3 -0
- data/lib/html2rss/selectors/post_processors.rb +5 -0
- data/lib/html2rss/selectors.rb +7 -0
- data/lib/html2rss/url.rb +27 -23
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +15 -78
- data/schema/html2rss-config.schema.json +83 -1
- metadata +7 -2
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
##
|
|
6
|
+
# Main html2rss namespace.
|
|
7
|
+
module Html2rss
|
|
8
|
+
##
|
|
9
|
+
# Request transport orchestration and strategies.
|
|
10
|
+
class RequestService
|
|
11
|
+
##
|
|
12
|
+
# Maps html2rss request/response handling to the botasaurus-scrape-api contract.
|
|
13
|
+
class BotasaurusContract
|
|
14
|
+
# Default Botasaurus scrape options when no explicit config is provided.
|
|
15
|
+
DEFAULT_OPTIONS = {
|
|
16
|
+
navigation_mode: 'auto',
|
|
17
|
+
max_retries: 2,
|
|
18
|
+
headless: false
|
|
19
|
+
}.freeze
|
|
20
|
+
|
|
21
|
+
# Allowlisted request.botasaurus keys forwarded to upstream.
|
|
22
|
+
OPTION_KEYS = %i[
|
|
23
|
+
navigation_mode
|
|
24
|
+
max_retries
|
|
25
|
+
wait_for_selector
|
|
26
|
+
wait_timeout_seconds
|
|
27
|
+
block_images
|
|
28
|
+
block_images_and_css
|
|
29
|
+
wait_for_complete_page_load
|
|
30
|
+
headless
|
|
31
|
+
proxy
|
|
32
|
+
user_agent
|
|
33
|
+
window_size
|
|
34
|
+
lang
|
|
35
|
+
].freeze
|
|
36
|
+
|
|
37
|
+
# Parsed Botasaurus response wrapper.
|
|
38
|
+
class ParsedResponse
|
|
39
|
+
# Fallback headers when upstream omits response headers.
|
|
40
|
+
DEFAULT_HEADERS = { 'content-type' => 'text/html' }.freeze
|
|
41
|
+
|
|
42
|
+
# @param payload [Hash{String => Object}] parsed Botasaurus response payload
|
|
43
|
+
# @param transport_status [Integer] HTTP status returned by Botasaurus
|
|
44
|
+
def initialize(payload:, transport_status:)
|
|
45
|
+
@payload = payload
|
|
46
|
+
@transport_status = transport_status
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
# @return [Boolean] true when upstream classified request as challenge blocked
|
|
50
|
+
def challenge_block? = error_category == 'challenge_block'
|
|
51
|
+
|
|
52
|
+
# @return [Boolean] true when upstream returned non-200 or an error payload
|
|
53
|
+
def upstream_failure?
|
|
54
|
+
status != 200 || error_message?
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# @return [String] normalized challenge error message
|
|
58
|
+
def challenge_message
|
|
59
|
+
error || 'Botasaurus challenge block detected.'
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
# @return [String] actionable upstream failure summary
|
|
63
|
+
def upstream_failure_message
|
|
64
|
+
details = ["status=#{status}"]
|
|
65
|
+
details << "error_category=#{error_category}" if error_category
|
|
66
|
+
details << "error=#{error}" if error
|
|
67
|
+
details << "request_id=#{request_id}" if request_id
|
|
68
|
+
"Botasaurus scrape failed (#{details.join(', ')})."
|
|
69
|
+
end
|
|
70
|
+
|
|
71
|
+
# @return [String] rendered HTML body from Botasaurus
|
|
72
|
+
# @raise [BotasaurusConnectionFailed] when html is missing
|
|
73
|
+
def html
|
|
74
|
+
value = payload['html']
|
|
75
|
+
raise BotasaurusConnectionFailed, "Botasaurus response missing required 'html' field" if value.nil?
|
|
76
|
+
|
|
77
|
+
value.to_s
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# @return [Hash{String => String}] normalized response headers
|
|
81
|
+
def headers
|
|
82
|
+
raw_headers = payload['headers']
|
|
83
|
+
return DEFAULT_HEADERS.dup unless raw_headers.is_a?(Hash) && raw_headers.any?
|
|
84
|
+
|
|
85
|
+
raw_headers.to_h { |key, value| [key.to_s, value.to_s] }
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
# @return [Integer] resolved status code (payload status_code or transport status)
|
|
89
|
+
def status
|
|
90
|
+
status_code = payload['status_code']
|
|
91
|
+
status_code.is_a?(Integer) ? status_code : transport_status
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
# @return [String, nil] final URL reported by upstream
|
|
95
|
+
def final_url = payload['final_url']
|
|
96
|
+
|
|
97
|
+
private
|
|
98
|
+
|
|
99
|
+
attr_reader :payload, :transport_status
|
|
100
|
+
|
|
101
|
+
def error = payload['error']
|
|
102
|
+
|
|
103
|
+
def request_id = payload['request_id']
|
|
104
|
+
|
|
105
|
+
def error_category = payload['error_category']
|
|
106
|
+
|
|
107
|
+
def error_message?
|
|
108
|
+
value = error
|
|
109
|
+
value.is_a?(String) ? !value.empty? : !value.nil?
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
##
|
|
114
|
+
# @param url [Html2rss::Url] canonical URL to scrape
|
|
115
|
+
# @param options [Hash] validated request.botasaurus options
|
|
116
|
+
# @option options [String] :navigation_mode
|
|
117
|
+
# @option options [Integer] :max_retries
|
|
118
|
+
# @option options [String] :wait_for_selector
|
|
119
|
+
# @option options [Integer] :wait_timeout_seconds
|
|
120
|
+
# @option options [Boolean] :block_images
|
|
121
|
+
# @option options [Boolean] :block_images_and_css
|
|
122
|
+
# @option options [Boolean] :wait_for_complete_page_load
|
|
123
|
+
# @option options [Boolean] :headless
|
|
124
|
+
# @option options [String] :proxy
|
|
125
|
+
# @option options [String] :user_agent
|
|
126
|
+
# @option options [Array<Integer>] :window_size
|
|
127
|
+
# @option options [String] :lang
|
|
128
|
+
def initialize(url:, options: {})
|
|
129
|
+
@url = url
|
|
130
|
+
@options = options
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
# @return [Hash] payload for POST /scrape
|
|
134
|
+
def request_payload
|
|
135
|
+
DEFAULT_OPTIONS.merge(filtered_options).merge(url: url.to_s)
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
# @param transport_response [Faraday::Response] upstream HTTP response
|
|
139
|
+
# @return [ParsedResponse]
|
|
140
|
+
# @raise [BotasaurusConnectionFailed] when payload is not valid JSON object
|
|
141
|
+
def parse_response(transport_response)
|
|
142
|
+
payload = JSON.parse(transport_response.body.to_s)
|
|
143
|
+
raise BotasaurusConnectionFailed, 'Botasaurus response must be a JSON object' unless payload.is_a?(Hash)
|
|
144
|
+
|
|
145
|
+
ParsedResponse.new(payload:, transport_status: transport_response.status)
|
|
146
|
+
rescue JSON::ParserError => error
|
|
147
|
+
raise BotasaurusConnectionFailed, "Botasaurus response JSON parse failed: #{error.message}"
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
private
|
|
151
|
+
|
|
152
|
+
attr_reader :url, :options
|
|
153
|
+
|
|
154
|
+
def filtered_options
|
|
155
|
+
OPTION_KEYS.each_with_object({}) do |key, normalized|
|
|
156
|
+
normalized[key] = options[key] if options.key?(key)
|
|
157
|
+
end
|
|
158
|
+
end
|
|
159
|
+
end
|
|
160
|
+
end
|
|
161
|
+
end
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'faraday'
|
|
4
|
+
require 'json'
|
|
5
|
+
|
|
6
|
+
module Html2rss
|
|
7
|
+
class RequestService
|
|
8
|
+
##
|
|
9
|
+
# Strategy to delegate fetching to a Botasaurus scrape API.
|
|
10
|
+
class BotasaurusStrategy < Strategy
|
|
11
|
+
##
|
|
12
|
+
# Executes a Botasaurus-backed request with shared request policy guards.
|
|
13
|
+
#
|
|
14
|
+
# @return [Response] normalized request response
|
|
15
|
+
# @raise [BotasaurusConfigurationError] when BOTASAURUS_SCRAPER_URL is missing or invalid
|
|
16
|
+
# @raise [BotasaurusConnectionFailed] when Botasaurus cannot be reached or returns an invalid payload
|
|
17
|
+
# @raise [RequestTimedOut] when the Botasaurus request exceeds configured timeout
|
|
18
|
+
def execute
|
|
19
|
+
validate_request!
|
|
20
|
+
transport_response = client.post('/scrape', JSON.generate(contract.request_payload), content_type_header)
|
|
21
|
+
parsed_response = contract.parse_response(transport_response)
|
|
22
|
+
raise_if_challenge_blocked!(parsed_response)
|
|
23
|
+
raise_if_upstream_failed!(parsed_response)
|
|
24
|
+
build_response(parsed_response)
|
|
25
|
+
rescue Faraday::TimeoutError, Timeout::Error => error
|
|
26
|
+
raise RequestTimedOut, error.message
|
|
27
|
+
rescue Faraday::ConnectionFailed, Faraday::SSLError => error
|
|
28
|
+
raise BotasaurusConnectionFailed, "Botasaurus connection failed: #{error.message}"
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
private
|
|
32
|
+
|
|
33
|
+
def validate_request!
|
|
34
|
+
ctx.budget.consume!
|
|
35
|
+
ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def build_response(parsed_response)
|
|
39
|
+
body = parsed_response.html
|
|
40
|
+
ResponseGuard.new(policy: ctx.policy).inspect_body!(body)
|
|
41
|
+
|
|
42
|
+
Response.new(
|
|
43
|
+
body:,
|
|
44
|
+
headers: parsed_response.headers,
|
|
45
|
+
url: response_url(parsed_response.final_url),
|
|
46
|
+
status: parsed_response.status
|
|
47
|
+
)
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def raise_if_challenge_blocked!(parsed_response)
|
|
51
|
+
return unless parsed_response.challenge_block?
|
|
52
|
+
|
|
53
|
+
raise BlockedSurfaceDetected, "Blocked surface detected: #{parsed_response.challenge_message}"
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
def raise_if_upstream_failed!(parsed_response)
|
|
57
|
+
return unless parsed_response.upstream_failure?
|
|
58
|
+
|
|
59
|
+
raise BotasaurusConnectionFailed, parsed_response.upstream_failure_message
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def response_url(final_url)
|
|
63
|
+
return ctx.url if final_url.nil?
|
|
64
|
+
|
|
65
|
+
Html2rss::Url.from_absolute(final_url)
|
|
66
|
+
rescue ArgumentError
|
|
67
|
+
ctx.url
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def contract
|
|
71
|
+
@contract ||= BotasaurusContract.new(url: ctx.url, options: ctx.request.fetch(:botasaurus, {}))
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def client
|
|
75
|
+
@client ||= Faraday.new(url: scraper_base_url.to_s, request: request_options)
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def request_options
|
|
79
|
+
{ timeout: ctx.policy.total_timeout_seconds }
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def content_type_header
|
|
83
|
+
{ 'Content-Type' => 'application/json' }
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def scraper_base_url
|
|
87
|
+
@scraper_base_url ||= begin
|
|
88
|
+
configured = ENV.fetch('BOTASAURUS_SCRAPER_URL') do
|
|
89
|
+
raise BotasaurusConfigurationError, 'BOTASAURUS_SCRAPER_URL is required for strategy=botasaurus.'
|
|
90
|
+
end
|
|
91
|
+
Html2rss::Url.for_channel(configured)
|
|
92
|
+
rescue ArgumentError => error
|
|
93
|
+
raise BotasaurusConfigurationError, "BOTASAURUS_SCRAPER_URL is invalid: #{error.message}"
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
end
|
|
98
|
+
end
|
|
@@ -18,8 +18,8 @@ module Html2rss
|
|
|
18
18
|
# @raise [ArgumentError] if policy or budget is explicitly nil
|
|
19
19
|
def initialize(url:, headers: {}, request: {}, **request_options)
|
|
20
20
|
@url = Html2rss::Url.from_absolute(url)
|
|
21
|
-
@headers = headers
|
|
22
|
-
@request = request.freeze
|
|
21
|
+
@headers = normalize_headers(headers).freeze
|
|
22
|
+
@request = normalize_request(request).freeze
|
|
23
23
|
assign_request_options(request_options)
|
|
24
24
|
end
|
|
25
25
|
|
|
@@ -85,6 +85,18 @@ module Html2rss
|
|
|
85
85
|
source = origin_url || @url
|
|
86
86
|
Html2rss::Url.from_absolute(source)
|
|
87
87
|
end
|
|
88
|
+
|
|
89
|
+
def normalize_headers(headers)
|
|
90
|
+
headers.to_h do |key, value|
|
|
91
|
+
[key.to_s, value]
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
def normalize_request(request)
|
|
96
|
+
normalized = HashUtil.deep_symbolize_keys(request, context: 'request')
|
|
97
|
+
HashUtil.assert_symbol_keys!(normalized, context: 'request')
|
|
98
|
+
normalized
|
|
99
|
+
end
|
|
88
100
|
end
|
|
89
101
|
end
|
|
90
102
|
end
|
|
@@ -13,8 +13,11 @@ module Html2rss
|
|
|
13
13
|
##
|
|
14
14
|
# Restores buffered streamed bytes so response middleware can process them.
|
|
15
15
|
class StreamingBodyMiddleware < Faraday::Middleware
|
|
16
|
+
# Request-context key used to store streamed chunks before middleware completion.
|
|
16
17
|
STREAM_BUFFER_KEY = :html2rss_stream_buffer
|
|
17
18
|
|
|
19
|
+
# @param env [Faraday::Env] completed response environment
|
|
20
|
+
# @return [void]
|
|
18
21
|
def on_complete(env)
|
|
19
22
|
buffer = env.request.context&.delete(STREAM_BUFFER_KEY)
|
|
20
23
|
return if buffer.nil? || buffer.empty?
|
|
@@ -24,13 +27,12 @@ module Html2rss
|
|
|
24
27
|
end
|
|
25
28
|
|
|
26
29
|
##
|
|
27
|
-
# NOTE: Unlike BrowserlessStrategy, Faraday does not expose the remote IP after connect.
|
|
28
|
-
# SSRF protection here is pre-connection only (DNS resolution via Policy).
|
|
29
|
-
# A DNS rebinding attack between resolution and connect cannot be caught at this layer.
|
|
30
|
-
#
|
|
31
30
|
# Executes a request with runtime policy enforcement.
|
|
32
31
|
#
|
|
33
32
|
# @return [Response] normalized request response
|
|
33
|
+
# @note Unlike BrowserlessStrategy, Faraday does not expose the remote IP after connect.
|
|
34
|
+
# SSRF protection here is pre-connection only (DNS resolution via Policy).
|
|
35
|
+
# A DNS rebinding attack between resolution and connect cannot be caught at this layer.
|
|
34
36
|
def execute
|
|
35
37
|
deadline = request_deadline
|
|
36
38
|
response_guard, response = perform_request(deadline:)
|
|
@@ -10,7 +10,9 @@ module Html2rss
|
|
|
10
10
|
# Describes the runtime request envelope for a single feed build.
|
|
11
11
|
class Policy # rubocop:disable Metrics/ClassLength
|
|
12
12
|
MAX_REQUESTS_CEILING = 10
|
|
13
|
+
# Hostnames treated as local/private surfaces.
|
|
13
14
|
LOCAL_HOSTS = %w[localhost localhost.localdomain metadata.google.internal].to_set.freeze
|
|
15
|
+
# IP ranges blocked when private networks are disabled.
|
|
14
16
|
BLOCKED_IP_RANGES = [
|
|
15
17
|
IPAddr.new('0.0.0.0/8'),
|
|
16
18
|
IPAddr.new('10.0.0.0/8'),
|
|
@@ -26,6 +28,7 @@ module Html2rss
|
|
|
26
28
|
IPAddr.new('ff00::/8')
|
|
27
29
|
].freeze
|
|
28
30
|
|
|
31
|
+
# Default policy values used when request controls are not explicitly set.
|
|
29
32
|
DEFAULTS = {
|
|
30
33
|
connect_timeout_seconds: 5,
|
|
31
34
|
read_timeout_seconds: 10,
|
|
@@ -243,6 +246,7 @@ module Html2rss
|
|
|
243
246
|
end
|
|
244
247
|
end
|
|
245
248
|
|
|
249
|
+
# Shared immutable policy instance used for default request execution.
|
|
246
250
|
Policy::DEFAULT_POLICY = Policy.new
|
|
247
251
|
end
|
|
248
252
|
end
|
|
@@ -17,6 +17,7 @@ module Html2rss
|
|
|
17
17
|
|
|
18
18
|
headers = headers.dup
|
|
19
19
|
headers.transform_keys!(&:to_s)
|
|
20
|
+
HashUtil.assert_string_keys!(headers, context: 'response headers', deep: false)
|
|
20
21
|
|
|
21
22
|
@headers = headers
|
|
22
23
|
@status = status
|
|
@@ -26,7 +27,7 @@ module Html2rss
|
|
|
26
27
|
# @return [String] the raw body of the response
|
|
27
28
|
attr_reader :body
|
|
28
29
|
|
|
29
|
-
# @return [Hash
|
|
30
|
+
# @return [Hash{String => Object}] the headers of the response
|
|
30
31
|
attr_reader :headers
|
|
31
32
|
|
|
32
33
|
# @return [Integer, nil] the HTTP status code when known
|
|
@@ -35,8 +36,13 @@ module Html2rss
|
|
|
35
36
|
# @return [Html2rss::Url] the URL of the response
|
|
36
37
|
attr_reader :url
|
|
37
38
|
|
|
39
|
+
# @return [String] normalized content type header value
|
|
38
40
|
def content_type = header('content-type').to_s
|
|
41
|
+
|
|
42
|
+
# @return [Boolean] whether response content is JSON
|
|
39
43
|
def json_response? = content_type.include?('application/json')
|
|
44
|
+
|
|
45
|
+
# @return [Boolean] whether response content is HTML
|
|
40
46
|
def html_response? = content_type.include?('text/html')
|
|
41
47
|
|
|
42
48
|
##
|
|
@@ -57,6 +63,8 @@ module Html2rss
|
|
|
57
63
|
|
|
58
64
|
private
|
|
59
65
|
|
|
66
|
+
# @param name [String] canonical header name
|
|
67
|
+
# @return [Object, nil] header value when present
|
|
60
68
|
def header(name)
|
|
61
69
|
headers.fetch(name) do
|
|
62
70
|
headers.find { |key, _value| key.casecmp?(name) }&.last
|
|
@@ -10,18 +10,34 @@ module Html2rss
|
|
|
10
10
|
class RequestService
|
|
11
11
|
include Singleton
|
|
12
12
|
|
|
13
|
+
# Raised when an unknown request strategy is requested.
|
|
13
14
|
class UnknownStrategy < Html2rss::Error; end
|
|
15
|
+
# Raised when a URL cannot be parsed or validated.
|
|
14
16
|
class InvalidUrl < Html2rss::Error; end
|
|
17
|
+
# Raised when a URL uses an unsupported scheme.
|
|
15
18
|
class UnsupportedUrlScheme < Html2rss::Error; end
|
|
19
|
+
# Raised when a response type cannot be parsed.
|
|
16
20
|
class UnsupportedResponseContentType < Html2rss::Error; end
|
|
21
|
+
# Raised when request limits are exceeded.
|
|
17
22
|
class RequestBudgetExceeded < Html2rss::Error; end
|
|
23
|
+
# Raised when policy denies private-network access.
|
|
18
24
|
class PrivateNetworkDenied < Html2rss::Error; end
|
|
25
|
+
# Raised when cross-origin follow-up requests are denied.
|
|
19
26
|
class CrossOriginFollowUpDenied < Html2rss::Error; end
|
|
27
|
+
# Raised when a response exceeds configured size limits.
|
|
20
28
|
class ResponseTooLarge < Html2rss::Error; end
|
|
29
|
+
# Raised when blocked content surfaces are detected.
|
|
21
30
|
class BlockedSurfaceDetected < Html2rss::Error; end
|
|
31
|
+
# Raised when a request times out.
|
|
22
32
|
class RequestTimedOut < Html2rss::Error; end
|
|
33
|
+
# Raised when Browserless configuration is missing or invalid.
|
|
23
34
|
class BrowserlessConfigurationError < Html2rss::Error; end
|
|
35
|
+
# Raised when Browserless cannot be reached.
|
|
24
36
|
class BrowserlessConnectionFailed < Html2rss::Error; end
|
|
37
|
+
# Raised when Botasaurus configuration is missing or invalid.
|
|
38
|
+
class BotasaurusConfigurationError < Html2rss::Error; end
|
|
39
|
+
# Raised when Botasaurus cannot be reached or returns invalid payloads.
|
|
40
|
+
class BotasaurusConnectionFailed < Html2rss::Error; end
|
|
25
41
|
|
|
26
42
|
class << self
|
|
27
43
|
extend Forwardable
|
|
@@ -40,6 +56,7 @@ module Html2rss
|
|
|
40
56
|
def initialize
|
|
41
57
|
@strategies = {
|
|
42
58
|
faraday: FaradayStrategy,
|
|
59
|
+
botasaurus: BotasaurusStrategy,
|
|
43
60
|
browserless: BrowserlessStrategy
|
|
44
61
|
}
|
|
45
62
|
@default_strategy_name = :faraday
|
|
@@ -51,6 +68,7 @@ module Html2rss
|
|
|
51
68
|
##
|
|
52
69
|
# Sets the default strategy.
|
|
53
70
|
# @param strategy [Symbol] the name of the strategy
|
|
71
|
+
# @return [Symbol] the selected default strategy name
|
|
54
72
|
# @raise [UnknownStrategy] if the strategy is not registered
|
|
55
73
|
def default_strategy_name=(strategy)
|
|
56
74
|
raise UnknownStrategy unless strategy_registered?(strategy)
|
|
@@ -65,6 +83,7 @@ module Html2rss
|
|
|
65
83
|
# Registers a new strategy.
|
|
66
84
|
# @param name [Symbol] the name of the strategy
|
|
67
85
|
# @param strategy_class [Class] the class implementing the strategy
|
|
86
|
+
# @return [Class] the registered strategy class
|
|
68
87
|
# @raise [ArgumentError] if strategy_class is not a Class
|
|
69
88
|
def register_strategy(name, strategy_class)
|
|
70
89
|
unless strategy_class.is_a?(Class)
|
|
@@ -26,8 +26,8 @@ module Html2rss
|
|
|
26
26
|
# @param request_policy [RequestService::Policy] request policy for the session
|
|
27
27
|
def initialize(url:, headers:, request:, strategy:, request_policy:)
|
|
28
28
|
@url = Html2rss::Url.from_absolute(url)
|
|
29
|
-
@headers = headers.freeze
|
|
30
|
-
@request = request.freeze
|
|
29
|
+
@headers = normalize_headers(headers).freeze
|
|
30
|
+
@request = normalize_request(request).freeze
|
|
31
31
|
@strategy = strategy
|
|
32
32
|
@request_policy = request_policy
|
|
33
33
|
freeze
|
|
@@ -52,6 +52,20 @@ module Html2rss
|
|
|
52
52
|
##
|
|
53
53
|
# @return [RequestService::Policy] policy derived from the runtime request inputs
|
|
54
54
|
attr_reader :request_policy
|
|
55
|
+
|
|
56
|
+
private
|
|
57
|
+
|
|
58
|
+
def normalize_headers(headers)
|
|
59
|
+
headers.to_h do |key, value|
|
|
60
|
+
[key.to_s, value]
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def normalize_request(request)
|
|
65
|
+
normalized = HashUtil.deep_symbolize_keys(request, context: 'request')
|
|
66
|
+
HashUtil.assert_symbol_keys!(normalized, context: 'request')
|
|
67
|
+
normalized
|
|
68
|
+
end
|
|
55
69
|
end
|
|
56
70
|
end
|
|
57
71
|
end
|
|
@@ -29,9 +29,16 @@ module Html2rss
|
|
|
29
29
|
def baseline_request_budget_for(config)
|
|
30
30
|
1 + pagination_follow_up_budget_for(config) +
|
|
31
31
|
known_auto_source_follow_up_budget_for(config) +
|
|
32
|
+
auto_strategy_fallback_budget_for(config) +
|
|
32
33
|
browserless_preload_budget_for(config)
|
|
33
34
|
end
|
|
34
35
|
|
|
36
|
+
def auto_strategy_fallback_budget_for(config)
|
|
37
|
+
return 0 unless config.strategy == :auto
|
|
38
|
+
|
|
39
|
+
[FeedPipeline::AutoFallback::CHAIN.size - 1, 0].max
|
|
40
|
+
end
|
|
41
|
+
|
|
35
42
|
def pagination_follow_up_budget_for(config)
|
|
36
43
|
[config.selectors&.dig(:items, :pagination, :max_pages).to_i - 1, 0].max
|
|
37
44
|
end
|
|
@@ -9,16 +9,20 @@ module Html2rss
|
|
|
9
9
|
# Builds a request session from translated runtime request inputs.
|
|
10
10
|
#
|
|
11
11
|
# @param runtime_input [RuntimeInput] translated runtime request inputs
|
|
12
|
+
# @param budget [RequestService::Budget, nil] optional shared budget for multi-attempt runs
|
|
12
13
|
# @param logger [Logger] logger used for operational warnings
|
|
13
14
|
# @return [RequestSession] configured request session
|
|
14
|
-
def from_runtime_input(runtime_input, logger: Html2rss::Log)
|
|
15
|
+
def from_runtime_input(runtime_input, budget: nil, logger: Html2rss::Log) # rubocop:disable Metrics/MethodLength
|
|
16
|
+
context_options = {
|
|
17
|
+
url: runtime_input.url,
|
|
18
|
+
headers: runtime_input.headers,
|
|
19
|
+
request: runtime_input.request,
|
|
20
|
+
policy: runtime_input.request_policy
|
|
21
|
+
}
|
|
22
|
+
context_options[:budget] = budget unless budget.nil?
|
|
23
|
+
|
|
15
24
|
new(
|
|
16
|
-
context: RequestService::Context.new(
|
|
17
|
-
url: runtime_input.url,
|
|
18
|
-
headers: runtime_input.headers,
|
|
19
|
-
request: runtime_input.request,
|
|
20
|
-
policy: runtime_input.request_policy
|
|
21
|
-
),
|
|
25
|
+
context: RequestService::Context.new(**context_options),
|
|
22
26
|
strategy: runtime_input.strategy,
|
|
23
27
|
logger:
|
|
24
28
|
)
|
|
@@ -81,7 +85,7 @@ module Html2rss
|
|
|
81
85
|
end
|
|
82
86
|
|
|
83
87
|
##
|
|
84
|
-
# @param url [String, Html2rss::Url]
|
|
88
|
+
# @param url [String, Html2rss::Url] follow-up target URL for the request
|
|
85
89
|
# @return [Boolean] whether the url was already visited in this session
|
|
86
90
|
def visited?(url)
|
|
87
91
|
visited_urls.include?(normalize_url(url))
|
|
@@ -90,7 +94,7 @@ module Html2rss
|
|
|
90
94
|
##
|
|
91
95
|
# Records a visited url in the session.
|
|
92
96
|
#
|
|
93
|
-
# @param url [String, Html2rss::Url]
|
|
97
|
+
# @param url [String, Html2rss::Url] URL used to update relation tracking state
|
|
94
98
|
# @return [Set<Html2rss::Url>] visited urls
|
|
95
99
|
def remember!(url)
|
|
96
100
|
visited_urls.add(normalize_url(url))
|
|
@@ -13,10 +13,23 @@ module Html2rss
|
|
|
13
13
|
include Enumerable
|
|
14
14
|
include Comparable
|
|
15
15
|
|
|
16
|
+
# Allowed article attributes accepted by the value object constructor.
|
|
16
17
|
PROVIDED_KEYS = %i[id title description url image author guid published_at enclosures categories scraper].freeze
|
|
18
|
+
# Separator used to build deterministic deduplication fingerprints.
|
|
17
19
|
DEDUP_FINGERPRINT_SEPARATOR = '#!/'
|
|
18
20
|
|
|
19
|
-
# @param options [Hash
|
|
21
|
+
# @param options [Hash{Symbol => String}]
|
|
22
|
+
# @option options [String] :id stable article identifier
|
|
23
|
+
# @option options [String] :title article title
|
|
24
|
+
# @option options [String] :description article description/content
|
|
25
|
+
# @option options [String, Html2rss::Url] :url canonical article URL
|
|
26
|
+
# @option options [String, Html2rss::Url] :image image URL for fallback enclosure rendering
|
|
27
|
+
# @option options [String] :author author name
|
|
28
|
+
# @option options [String] :guid explicit GUID override
|
|
29
|
+
# @option options [String, Time, DateTime] :published_at publication timestamp
|
|
30
|
+
# @option options [Array<Hash{Symbol => Object}>] :enclosures enclosure attribute hashes
|
|
31
|
+
# @option options [Array<String>] :categories category labels
|
|
32
|
+
# @option options [Class] :scraper scraper class that produced the article
|
|
20
33
|
def initialize(**options)
|
|
21
34
|
@to_h = {}
|
|
22
35
|
options.each_pair { |key, value| @to_h[key] = value.freeze if value }
|
|
@@ -41,10 +54,13 @@ module Html2rss
|
|
|
41
54
|
PROVIDED_KEYS.each { |key| yield(key, public_send(key)) }
|
|
42
55
|
end
|
|
43
56
|
|
|
57
|
+
# @return [String, nil] stable article identifier
|
|
44
58
|
def id = blank_string_to_nil(@to_h[:id])
|
|
45
59
|
|
|
60
|
+
# @return [String, nil] article title
|
|
46
61
|
def title = blank_string_to_nil(@to_h[:title])
|
|
47
62
|
|
|
63
|
+
# @return [String] rendered article description
|
|
48
64
|
def description
|
|
49
65
|
@description ||= Rendering::DescriptionBuilder.new(
|
|
50
66
|
base: @to_h[:description],
|
|
@@ -82,6 +98,7 @@ module Html2rss
|
|
|
82
98
|
dedup_from_url || dedup_from_id || dedup_from_guid || hash
|
|
83
99
|
end
|
|
84
100
|
|
|
101
|
+
# @return [Array<Html2rss::RssBuilder::Enclosure>] normalized enclosure objects
|
|
85
102
|
def enclosures
|
|
86
103
|
@enclosures ||= Array(@to_h[:enclosures])
|
|
87
104
|
.map { |enclosure| Html2rss::RssBuilder::Enclosure.new(**enclosure) }
|
|
@@ -101,6 +118,7 @@ module Html2rss
|
|
|
101
118
|
end
|
|
102
119
|
end
|
|
103
120
|
|
|
121
|
+
# @return [Array<String>] normalized, unique category names
|
|
104
122
|
def categories
|
|
105
123
|
@categories ||= @to_h[:categories].dup.to_a.tap do |categories|
|
|
106
124
|
categories.map! { |category| category.to_s.strip }
|
|
@@ -119,10 +137,13 @@ module Html2rss
|
|
|
119
137
|
nil
|
|
120
138
|
end
|
|
121
139
|
|
|
140
|
+
# @return [Class, nil] scraper class that produced this article
|
|
122
141
|
def scraper
|
|
123
142
|
@to_h[:scraper]
|
|
124
143
|
end
|
|
125
144
|
|
|
145
|
+
# @param other [Object] value compared against this article
|
|
146
|
+
# @return [Integer, nil] comparison result for compatible Article values
|
|
126
147
|
def <=>(other)
|
|
127
148
|
return nil unless other.is_a?(Article)
|
|
128
149
|
|
|
@@ -7,24 +7,28 @@ module Html2rss
|
|
|
7
7
|
# 1. the HTML document's <head>.
|
|
8
8
|
# 2. the HTTP response
|
|
9
9
|
class Channel
|
|
10
|
+
# Fallback RSS ttl (in minutes) when no cache directives are present.
|
|
10
11
|
DEFAULT_TTL_IN_MINUTES = 360
|
|
12
|
+
# Description template used when no explicit or discovered description exists.
|
|
11
13
|
DEFAULT_DESCRIPTION_TEMPLATE = 'Latest items from %<url>s'
|
|
12
14
|
|
|
13
15
|
##
|
|
14
|
-
#
|
|
15
16
|
# @param response [Html2rss::RequestService::Response]
|
|
16
|
-
# @param overrides [Hash
|
|
17
|
+
# @param overrides [Hash{Symbol => String}] optional overrides for channel attributes
|
|
17
18
|
def initialize(response, overrides: {})
|
|
18
19
|
@response = response
|
|
19
20
|
@overrides = overrides
|
|
20
21
|
end
|
|
21
22
|
|
|
23
|
+
# @return [String] channel title derived from overrides, document title, or URL
|
|
22
24
|
def title
|
|
23
25
|
@title ||= fetch_title
|
|
24
26
|
end
|
|
25
27
|
|
|
28
|
+
# @return [Html2rss::Url] canonical channel URL
|
|
26
29
|
def url = @url ||= Html2rss::Url.from_absolute(@response.url)
|
|
27
30
|
|
|
31
|
+
# @return [String] channel description text
|
|
28
32
|
def description
|
|
29
33
|
return overrides[:description] unless overrides[:description].to_s.empty?
|
|
30
34
|
|
|
@@ -35,6 +39,7 @@ module Html2rss
|
|
|
35
39
|
description
|
|
36
40
|
end
|
|
37
41
|
|
|
42
|
+
# @return [Integer] cache time-to-live in minutes
|
|
38
43
|
def ttl
|
|
39
44
|
return overrides[:ttl] if overrides[:ttl]
|
|
40
45
|
|
|
@@ -45,6 +50,7 @@ module Html2rss
|
|
|
45
50
|
DEFAULT_TTL_IN_MINUTES
|
|
46
51
|
end
|
|
47
52
|
|
|
53
|
+
# @return [String, nil] ISO-like language code when available
|
|
48
54
|
def language
|
|
49
55
|
return overrides[:language] if overrides[:language]
|
|
50
56
|
|
|
@@ -57,6 +63,7 @@ module Html2rss
|
|
|
57
63
|
parsed_body['lang'] || parsed_body.at_css('[lang]')&.[]('lang')
|
|
58
64
|
end
|
|
59
65
|
|
|
66
|
+
# @return [String, nil] channel author metadata
|
|
60
67
|
def author
|
|
61
68
|
return overrides[:author] if overrides[:author]
|
|
62
69
|
|
|
@@ -65,8 +72,10 @@ module Html2rss
|
|
|
65
72
|
parsed_body.at_css('meta[name="author"]')&.[]('content')
|
|
66
73
|
end
|
|
67
74
|
|
|
75
|
+
# @return [String, Time] source last-modified timestamp or current time fallback
|
|
68
76
|
def last_build_date = headers['last-modified'] || Time.now
|
|
69
77
|
|
|
78
|
+
# @return [Html2rss::Url, nil] channel image URL
|
|
70
79
|
def image
|
|
71
80
|
return overrides[:image] if overrides[:image]
|
|
72
81
|
|