html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'nokogiri'
|
|
4
|
+
|
|
3
5
|
module Html2rss
|
|
4
6
|
class RequestService
|
|
5
7
|
##
|
|
@@ -7,21 +9,67 @@ module Html2rss
|
|
|
7
9
|
class Response
|
|
8
10
|
##
|
|
9
11
|
# @param body [String] the body of the response
|
|
12
|
+
# @param url [Html2rss::Url] the final request URL
|
|
10
13
|
# @param headers [Hash] the headers of the response
|
|
11
|
-
|
|
14
|
+
# @param status [Integer, nil] the HTTP status code when available
|
|
15
|
+
def initialize(body:, url:, headers: {}, status: nil)
|
|
12
16
|
@body = body
|
|
13
17
|
|
|
14
18
|
headers = headers.dup
|
|
15
19
|
headers.transform_keys!(&:to_s)
|
|
20
|
+
HashUtil.assert_string_keys!(headers, context: 'response headers', deep: false)
|
|
16
21
|
|
|
17
22
|
@headers = headers
|
|
23
|
+
@status = status
|
|
24
|
+
@url = url
|
|
18
25
|
end
|
|
19
26
|
|
|
20
|
-
# @return [String] the body of the response
|
|
27
|
+
# @return [String] the raw body of the response
|
|
21
28
|
attr_reader :body
|
|
22
29
|
|
|
23
|
-
# @return [Hash
|
|
30
|
+
# @return [Hash{String => Object}] the headers of the response
|
|
24
31
|
attr_reader :headers
|
|
32
|
+
|
|
33
|
+
# @return [Integer, nil] the HTTP status code when known
|
|
34
|
+
attr_reader :status
|
|
35
|
+
|
|
36
|
+
# @return [Html2rss::Url] the URL of the response
|
|
37
|
+
attr_reader :url
|
|
38
|
+
|
|
39
|
+
# @return [String] normalized content type header value
|
|
40
|
+
def content_type = header('content-type').to_s
|
|
41
|
+
|
|
42
|
+
# @return [Boolean] whether response content is JSON
|
|
43
|
+
def json_response? = content_type.include?('application/json')
|
|
44
|
+
|
|
45
|
+
# @return [Boolean] whether response content is HTML
|
|
46
|
+
def html_response? = content_type.include?('text/html')
|
|
47
|
+
|
|
48
|
+
##
|
|
49
|
+
# @return [Nokogiri::HTML::Document, Hash] the parsed body of the response, frozen object
|
|
50
|
+
# @raise [UnsupportedResponseContentType] if the content type is not supported
|
|
51
|
+
def parsed_body
|
|
52
|
+
@parsed_body ||= if html_response?
|
|
53
|
+
Nokogiri::HTML(body).tap do |doc|
|
|
54
|
+
# Remove comments from the document to avoid processing irrelevant content
|
|
55
|
+
doc.xpath('//comment()').each(&:remove)
|
|
56
|
+
end.freeze
|
|
57
|
+
elsif json_response?
|
|
58
|
+
JSON.parse(body, symbolize_names: true).freeze
|
|
59
|
+
else
|
|
60
|
+
raise UnsupportedResponseContentType, "Unsupported content type: #{content_type}"
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
private
|
|
65
|
+
|
|
66
|
+
# @param name [String] canonical header name
|
|
67
|
+
# @return [Object, nil] header value when present
|
|
68
|
+
def header(name)
|
|
69
|
+
headers.fetch(name) do
|
|
70
|
+
headers.find { |key, _value| key.casecmp?(name) }&.last
|
|
71
|
+
end
|
|
72
|
+
end
|
|
25
73
|
end
|
|
26
74
|
end
|
|
27
75
|
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class RequestService
|
|
5
|
+
##
|
|
6
|
+
# Enforces response-size limits before parsing.
|
|
7
|
+
class ResponseGuard
|
|
8
|
+
##
|
|
9
|
+
# @param policy [Policy] request policy that defines byte ceilings
|
|
10
|
+
def initialize(policy:)
|
|
11
|
+
@policy = policy
|
|
12
|
+
@streamed_bytes = 0
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
##
|
|
16
|
+
# Validates response headers and streamed byte count.
|
|
17
|
+
#
|
|
18
|
+
# @param total_bytes [Integer] cumulative byte count received so far
|
|
19
|
+
# @param headers [Hash, nil] response headers if known
|
|
20
|
+
# @return [void]
|
|
21
|
+
# @raise [ResponseTooLarge] if the response exceeds configured limits
|
|
22
|
+
def inspect_chunk!(total_bytes:, headers: nil)
|
|
23
|
+
header_length = headers&.fetch('content-length', headers&.fetch('Content-Length', nil))
|
|
24
|
+
raise_if_too_large!(header_length.to_i, policy.max_response_bytes) if header_length
|
|
25
|
+
|
|
26
|
+
@streamed_bytes = total_bytes
|
|
27
|
+
raise_if_too_large!(@streamed_bytes, policy.max_response_bytes)
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
##
|
|
31
|
+
# Validates the final response body after middleware processing.
|
|
32
|
+
#
|
|
33
|
+
# @param body [String, nil] final response body
|
|
34
|
+
# @return [void]
|
|
35
|
+
# @raise [ResponseTooLarge] if the final body exceeds configured limits
|
|
36
|
+
# @raise [BlockedSurfaceDetected] if the body matches known anti-bot interstitial signatures
|
|
37
|
+
def inspect_body!(body)
|
|
38
|
+
normalized_body = body.to_s
|
|
39
|
+
size = normalized_body.bytesize
|
|
40
|
+
raise_if_too_large!(size, policy.max_decompressed_bytes)
|
|
41
|
+
raise_if_blocked_surface!(normalized_body)
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
private
|
|
45
|
+
|
|
46
|
+
attr_reader :policy
|
|
47
|
+
|
|
48
|
+
def raise_if_blocked_surface!(body)
|
|
49
|
+
signature = Html2rss::BlockedSurface.interstitial_signature_for(body)
|
|
50
|
+
return unless signature
|
|
51
|
+
|
|
52
|
+
raise BlockedSurfaceDetected, signature.fetch(:message)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def raise_if_too_large!(bytes, limit)
|
|
56
|
+
return unless bytes > limit
|
|
57
|
+
|
|
58
|
+
raise ResponseTooLarge, "Response exceeded #{limit} bytes"
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
end
|
|
@@ -6,13 +6,38 @@ require 'forwardable'
|
|
|
6
6
|
module Html2rss
|
|
7
7
|
##
|
|
8
8
|
# Requests website URLs to retrieve their HTML for further processing.
|
|
9
|
-
# Provides strategies,
|
|
9
|
+
# Provides strategies, e.g. integrating Browserless.io.
|
|
10
10
|
class RequestService
|
|
11
11
|
include Singleton
|
|
12
12
|
|
|
13
|
+
# Raised when an unknown request strategy is requested.
|
|
13
14
|
class UnknownStrategy < Html2rss::Error; end
|
|
15
|
+
# Raised when a URL cannot be parsed or validated.
|
|
14
16
|
class InvalidUrl < Html2rss::Error; end
|
|
17
|
+
# Raised when a URL uses an unsupported scheme.
|
|
15
18
|
class UnsupportedUrlScheme < Html2rss::Error; end
|
|
19
|
+
# Raised when a response type cannot be parsed.
|
|
20
|
+
class UnsupportedResponseContentType < Html2rss::Error; end
|
|
21
|
+
# Raised when request limits are exceeded.
|
|
22
|
+
class RequestBudgetExceeded < Html2rss::Error; end
|
|
23
|
+
# Raised when policy denies private-network access.
|
|
24
|
+
class PrivateNetworkDenied < Html2rss::Error; end
|
|
25
|
+
# Raised when cross-origin follow-up requests are denied.
|
|
26
|
+
class CrossOriginFollowUpDenied < Html2rss::Error; end
|
|
27
|
+
# Raised when a response exceeds configured size limits.
|
|
28
|
+
class ResponseTooLarge < Html2rss::Error; end
|
|
29
|
+
# Raised when blocked content surfaces are detected.
|
|
30
|
+
class BlockedSurfaceDetected < Html2rss::Error; end
|
|
31
|
+
# Raised when a request times out.
|
|
32
|
+
class RequestTimedOut < Html2rss::Error; end
|
|
33
|
+
# Raised when Browserless configuration is missing or invalid.
|
|
34
|
+
class BrowserlessConfigurationError < Html2rss::Error; end
|
|
35
|
+
# Raised when Browserless cannot be reached.
|
|
36
|
+
class BrowserlessConnectionFailed < Html2rss::Error; end
|
|
37
|
+
# Raised when Botasaurus configuration is missing or invalid.
|
|
38
|
+
class BotasaurusConfigurationError < Html2rss::Error; end
|
|
39
|
+
# Raised when Botasaurus cannot be reached or returns invalid payloads.
|
|
40
|
+
class BotasaurusConnectionFailed < Html2rss::Error; end
|
|
16
41
|
|
|
17
42
|
class << self
|
|
18
43
|
extend Forwardable
|
|
@@ -31,6 +56,7 @@ module Html2rss
|
|
|
31
56
|
def initialize
|
|
32
57
|
@strategies = {
|
|
33
58
|
faraday: FaradayStrategy,
|
|
59
|
+
botasaurus: BotasaurusStrategy,
|
|
34
60
|
browserless: BrowserlessStrategy
|
|
35
61
|
}
|
|
36
62
|
@default_strategy_name = :faraday
|
|
@@ -42,6 +68,7 @@ module Html2rss
|
|
|
42
68
|
##
|
|
43
69
|
# Sets the default strategy.
|
|
44
70
|
# @param strategy [Symbol] the name of the strategy
|
|
71
|
+
# @return [Symbol] the selected default strategy name
|
|
45
72
|
# @raise [UnknownStrategy] if the strategy is not registered
|
|
46
73
|
def default_strategy_name=(strategy)
|
|
47
74
|
raise UnknownStrategy unless strategy_registered?(strategy)
|
|
@@ -55,9 +82,13 @@ module Html2rss
|
|
|
55
82
|
##
|
|
56
83
|
# Registers a new strategy.
|
|
57
84
|
# @param name [Symbol] the name of the strategy
|
|
58
|
-
# @param strategy_class [Class] the class
|
|
85
|
+
# @param strategy_class [Class] the class implementing the strategy
|
|
86
|
+
# @return [Class] the registered strategy class
|
|
87
|
+
# @raise [ArgumentError] if strategy_class is not a Class
|
|
59
88
|
def register_strategy(name, strategy_class)
|
|
60
|
-
|
|
89
|
+
unless strategy_class.is_a?(Class)
|
|
90
|
+
raise ArgumentError, "Expected a Class for strategy, got #{strategy_class.class}"
|
|
91
|
+
end
|
|
61
92
|
|
|
62
93
|
@strategies[name.to_sym] = strategy_class
|
|
63
94
|
end
|
|
@@ -65,7 +96,7 @@ module Html2rss
|
|
|
65
96
|
##
|
|
66
97
|
# Checks if a strategy is registered.
|
|
67
98
|
# @param name [Symbol] the name of the strategy
|
|
68
|
-
# @return [Boolean] true if the strategy is registered, false otherwise
|
|
99
|
+
# @return [Boolean] true if the strategy is registered, false otherwise.
|
|
69
100
|
def strategy_registered?(name)
|
|
70
101
|
@strategies.key?(name.to_sym)
|
|
71
102
|
end
|
|
@@ -73,24 +104,28 @@ module Html2rss
|
|
|
73
104
|
##
|
|
74
105
|
# Unregisters a strategy.
|
|
75
106
|
# @param name [Symbol] the name of the strategy
|
|
76
|
-
# @return [Boolean] true if the strategy was unregistered, false otherwise
|
|
77
|
-
|
|
78
|
-
|
|
107
|
+
# @return [Boolean] true if the strategy was unregistered, false otherwise.
|
|
108
|
+
# @raise [ArgumentError] if attempting to unregister the default strategy.
|
|
109
|
+
def unregister_strategy(name) # rubocop:disable Naming/PredicateMethod
|
|
110
|
+
name_sym = name.to_sym
|
|
111
|
+
raise ArgumentError, 'Cannot unregister the default strategy.' if name_sym == @default_strategy_name
|
|
79
112
|
|
|
80
|
-
!!@strategies.delete(
|
|
113
|
+
!!@strategies.delete(name_sym)
|
|
81
114
|
end
|
|
82
115
|
|
|
83
116
|
##
|
|
84
|
-
# Executes the request.
|
|
85
|
-
# @param ctx [Context] the context for the request
|
|
86
|
-
# @param strategy [Symbol] the strategy to use
|
|
87
|
-
# @return [Response] the response from the strategy
|
|
88
|
-
# @raise [
|
|
117
|
+
# Executes the request using the specified strategy.
|
|
118
|
+
# @param ctx [Context] the context for the request.
|
|
119
|
+
# @param strategy [Symbol] the strategy to use (defaults to the default strategy).
|
|
120
|
+
# @return [Response] the response from the executed strategy.
|
|
121
|
+
# @raise [ArgumentError] if the context is nil.
|
|
122
|
+
# @raise [UnknownStrategy] if the strategy is not registered.
|
|
89
123
|
def execute(ctx, strategy: default_strategy_name)
|
|
90
|
-
strategy_class = @strategies.fetch(strategy) do
|
|
124
|
+
strategy_class = @strategies.fetch(strategy.to_sym) do
|
|
91
125
|
raise UnknownStrategy,
|
|
92
|
-
"The strategy '#{strategy}' is not known. Available strategies
|
|
126
|
+
"The strategy '#{strategy}' is not known. Available strategies: #{strategy_names.join(', ')}"
|
|
93
127
|
end
|
|
128
|
+
|
|
94
129
|
strategy_class.new(ctx).execute
|
|
95
130
|
end
|
|
96
131
|
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class RequestSession
|
|
5
|
+
##
|
|
6
|
+
# Traverses a rel=next pagination chain for selector-driven extraction.
|
|
7
|
+
class RelNextPager
|
|
8
|
+
include Enumerable
|
|
9
|
+
|
|
10
|
+
##
|
|
11
|
+
# @param session [RequestSession] request session used to execute follow-ups
|
|
12
|
+
# @param initial_response [RequestService::Response] first page response
|
|
13
|
+
# @param max_pages [Integer] configured page budget, including the initial page
|
|
14
|
+
# @param logger [Logger] logger used for pagination stop reasons
|
|
15
|
+
def initialize(session:, initial_response:, max_pages:, logger: Html2rss::Log)
|
|
16
|
+
@session = session
|
|
17
|
+
@initial_response = initial_response
|
|
18
|
+
@max_pages = max_pages
|
|
19
|
+
@logger = logger
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
##
|
|
23
|
+
# Iterates over all paginated responses, beginning with the initial response.
|
|
24
|
+
#
|
|
25
|
+
# @yield [RequestService::Response] each page response
|
|
26
|
+
# @return [Enumerator] enumerator when no block is given
|
|
27
|
+
def each
|
|
28
|
+
return enum_for(:each) unless block_given?
|
|
29
|
+
|
|
30
|
+
yield initial_response
|
|
31
|
+
|
|
32
|
+
current_response = initial_response
|
|
33
|
+
session.effective_page_budget(max_pages).pred.times do
|
|
34
|
+
next_url = next_page_url(current_response)
|
|
35
|
+
break unless follow_up_allowed?(next_url)
|
|
36
|
+
|
|
37
|
+
current_response = fetch_follow_up_response_or_stop(next_url, current_response.url)
|
|
38
|
+
break unless current_response
|
|
39
|
+
|
|
40
|
+
yield current_response
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
private
|
|
45
|
+
|
|
46
|
+
attr_reader :session, :initial_response, :max_pages, :logger
|
|
47
|
+
|
|
48
|
+
def next_page_url(page_response)
|
|
49
|
+
href = page_response.parsed_body.at_css('link[rel~="next"][href], a[rel~="next"][href]')&.[]('href')
|
|
50
|
+
return nil if href.nil? || href.empty?
|
|
51
|
+
|
|
52
|
+
Html2rss::Url.from_relative(href, page_response.url)
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
def follow_up_allowed?(next_url)
|
|
56
|
+
next_url && !session.visited?(next_url)
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
def fetch_follow_up_response_or_stop(next_url, origin_url)
|
|
60
|
+
session.follow_up(url: next_url, relation: :pagination, origin_url:)
|
|
61
|
+
rescue RequestService::RequestBudgetExceeded => error
|
|
62
|
+
logger.warn(
|
|
63
|
+
"#{self.class}: pagination stopped at #{next_url} - #{error.message}. " \
|
|
64
|
+
"Retry with --max-requests #{session.max_requests + 1} or increase request.max_requests in the config."
|
|
65
|
+
)
|
|
66
|
+
nil
|
|
67
|
+
end
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class RequestSession
|
|
5
|
+
##
|
|
6
|
+
# Carries the runtime request inputs needed to build a RequestSession.
|
|
7
|
+
class RuntimeInput
|
|
8
|
+
##
|
|
9
|
+
# @param config [Html2rss::Config] validated feed config
|
|
10
|
+
# @return [RuntimeInput] runtime request inputs derived from the config
|
|
11
|
+
def self.from_config(config)
|
|
12
|
+
new(
|
|
13
|
+
url: config.url,
|
|
14
|
+
headers: config.headers,
|
|
15
|
+
request: config.request,
|
|
16
|
+
strategy: config.strategy,
|
|
17
|
+
request_policy: RuntimePolicy.from_config(config)
|
|
18
|
+
)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
##
|
|
22
|
+
# @param url [String, Html2rss::Url] initial request URL
|
|
23
|
+
# @param headers [Hash] normalized request headers
|
|
24
|
+
# @param request [Hash] validated request options for strategies
|
|
25
|
+
# @param strategy [Symbol] request strategy to use for the session
|
|
26
|
+
# @param request_policy [RequestService::Policy] request policy for the session
|
|
27
|
+
def initialize(url:, headers:, request:, strategy:, request_policy:)
|
|
28
|
+
@url = Html2rss::Url.from_absolute(url)
|
|
29
|
+
@headers = normalize_headers(headers).freeze
|
|
30
|
+
@request = normalize_request(request).freeze
|
|
31
|
+
@strategy = strategy
|
|
32
|
+
@request_policy = request_policy
|
|
33
|
+
freeze
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
##
|
|
37
|
+
# @return [Html2rss::Url] initial request URL
|
|
38
|
+
attr_reader :url
|
|
39
|
+
|
|
40
|
+
##
|
|
41
|
+
# @return [Hash] normalized request headers
|
|
42
|
+
attr_reader :headers
|
|
43
|
+
|
|
44
|
+
##
|
|
45
|
+
# @return [Hash] validated request options for strategies
|
|
46
|
+
attr_reader :request
|
|
47
|
+
|
|
48
|
+
##
|
|
49
|
+
# @return [Symbol] request strategy to use for the session
|
|
50
|
+
attr_reader :strategy
|
|
51
|
+
|
|
52
|
+
##
|
|
53
|
+
# @return [RequestService::Policy] policy derived from the runtime request inputs
|
|
54
|
+
attr_reader :request_policy
|
|
55
|
+
|
|
56
|
+
private
|
|
57
|
+
|
|
58
|
+
def normalize_headers(headers)
|
|
59
|
+
headers.to_h do |key, value|
|
|
60
|
+
[key.to_s, value]
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
def normalize_request(request)
|
|
65
|
+
normalized = HashUtil.deep_symbolize_keys(request, context: 'request')
|
|
66
|
+
HashUtil.assert_symbol_keys!(normalized, context: 'request')
|
|
67
|
+
normalized
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class RequestSession
|
|
5
|
+
##
|
|
6
|
+
# Builds the runtime request policy for a feed run.
|
|
7
|
+
class RuntimePolicy
|
|
8
|
+
##
|
|
9
|
+
# @param config [Html2rss::Config] validated feed config
|
|
10
|
+
# @return [Html2rss::RequestService::Policy] request policy derived from runtime config
|
|
11
|
+
def self.from_config(config)
|
|
12
|
+
RequestService::Policy.new(
|
|
13
|
+
max_requests: effective_max_requests_for(config),
|
|
14
|
+
max_redirects: config.max_redirects
|
|
15
|
+
)
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
class << self
|
|
19
|
+
private
|
|
20
|
+
|
|
21
|
+
def effective_max_requests_for(config)
|
|
22
|
+
return config.max_requests if config.explicit_max_requests?
|
|
23
|
+
|
|
24
|
+
[baseline_request_budget_for(config), config.max_requests].max
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
# Reserve enough budget for the initial request plus predictable follow-ups
|
|
28
|
+
# that the top-level pipeline may trigger during a normal feed build.
|
|
29
|
+
def baseline_request_budget_for(config)
|
|
30
|
+
1 + pagination_follow_up_budget_for(config) +
|
|
31
|
+
known_auto_source_follow_up_budget_for(config) +
|
|
32
|
+
auto_strategy_fallback_budget_for(config) +
|
|
33
|
+
browserless_preload_budget_for(config)
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def auto_strategy_fallback_budget_for(config)
|
|
37
|
+
return 0 unless config.strategy == :auto
|
|
38
|
+
|
|
39
|
+
[FeedPipeline::AutoFallback::CHAIN.size - 1, 0].max
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def pagination_follow_up_budget_for(config)
|
|
43
|
+
[config.selectors&.dig(:items, :pagination, :max_pages).to_i - 1, 0].max
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def known_auto_source_follow_up_budget_for(config)
|
|
47
|
+
config.auto_source&.dig(:scraper, :wordpress_api, :enabled) ? 1 : 0
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def browserless_preload_budget_for(config)
|
|
51
|
+
preload = config.request.dig(:browserless, :preload)
|
|
52
|
+
return 0 unless preload
|
|
53
|
+
|
|
54
|
+
top_level_preload_wait_budget(preload) +
|
|
55
|
+
click_selector_preload_budget(preload) +
|
|
56
|
+
scroll_preload_budget(preload)
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
def top_level_preload_wait_budget(preload)
|
|
60
|
+
preload[:wait_after_ms] ? 2 : 0
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
def click_selector_preload_budget(preload)
|
|
64
|
+
preload.fetch(:click_selectors, []).sum { preload_action_budget(_1, :max_clicks) }
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
def scroll_preload_budget(preload)
|
|
68
|
+
scroll = preload[:scroll_down]
|
|
69
|
+
return 0 unless scroll
|
|
70
|
+
|
|
71
|
+
preload_action_budget(scroll, :iterations)
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
def preload_action_budget(config, count_key)
|
|
75
|
+
action_count = config.fetch(count_key, 1)
|
|
76
|
+
wait_budget = config[:wait_after_ms] ? action_count : 0
|
|
77
|
+
|
|
78
|
+
action_count + wait_budget
|
|
79
|
+
end
|
|
80
|
+
end
|
|
81
|
+
end
|
|
82
|
+
end
|
|
83
|
+
end
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
##
|
|
5
|
+
# Coordinates multi-request feed builds on top of RequestService.
|
|
6
|
+
class RequestSession
|
|
7
|
+
class << self
|
|
8
|
+
##
|
|
9
|
+
# Builds a request session from translated runtime request inputs.
|
|
10
|
+
#
|
|
11
|
+
# @param runtime_input [RuntimeInput] translated runtime request inputs
|
|
12
|
+
# @param budget [RequestService::Budget, nil] optional shared budget for multi-attempt runs
|
|
13
|
+
# @param logger [Logger] logger used for operational warnings
|
|
14
|
+
# @return [RequestSession] configured request session
|
|
15
|
+
def from_runtime_input(runtime_input, budget: nil, logger: Html2rss::Log) # rubocop:disable Metrics/MethodLength
|
|
16
|
+
context_options = {
|
|
17
|
+
url: runtime_input.url,
|
|
18
|
+
headers: runtime_input.headers,
|
|
19
|
+
request: runtime_input.request,
|
|
20
|
+
policy: runtime_input.request_policy
|
|
21
|
+
}
|
|
22
|
+
context_options[:budget] = budget unless budget.nil?
|
|
23
|
+
|
|
24
|
+
new(
|
|
25
|
+
context: RequestService::Context.new(**context_options),
|
|
26
|
+
strategy: runtime_input.strategy,
|
|
27
|
+
logger:
|
|
28
|
+
)
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
##
|
|
33
|
+
# @param context [RequestService::Context] initial request context
|
|
34
|
+
# @param strategy [Symbol] request strategy to use for all requests in the session
|
|
35
|
+
# @param logger [Logger] logger used for operational warnings
|
|
36
|
+
def initialize(context:, strategy:, logger: Html2rss::Log)
|
|
37
|
+
@context = context
|
|
38
|
+
@strategy = strategy
|
|
39
|
+
@logger = logger
|
|
40
|
+
@visited_urls = Set.new
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
##
|
|
44
|
+
# Executes the initial request for the session.
|
|
45
|
+
#
|
|
46
|
+
# @return [RequestService::Response] initial response
|
|
47
|
+
def fetch_initial_response
|
|
48
|
+
execute(context).tap { |response| remember!(response.url) }
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
##
|
|
52
|
+
# Executes a follow-up request sharing policy, headers, and budget.
|
|
53
|
+
#
|
|
54
|
+
# @param url [String, Html2rss::Url] follow-up request url
|
|
55
|
+
# @param relation [Symbol] why the follow-up is being made
|
|
56
|
+
# @param origin_url [String, Html2rss::Url] effective origin for same-origin checks
|
|
57
|
+
# @return [RequestService::Response] follow-up response
|
|
58
|
+
def follow_up(url:, relation:, origin_url:)
|
|
59
|
+
execute(context.follow_up(url:, relation:, origin_url:)).tap { |response| remember!(response.url) }
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
##
|
|
63
|
+
# Returns the effective page budget after applying the policy ceiling.
|
|
64
|
+
#
|
|
65
|
+
# @param requested_pages [Integer] configured page budget
|
|
66
|
+
# @return [Integer] effective page budget for the session
|
|
67
|
+
def effective_page_budget(requested_pages)
|
|
68
|
+
effective_pages = [requested_pages, context.policy.max_requests].min
|
|
69
|
+
return effective_pages if effective_pages == requested_pages
|
|
70
|
+
|
|
71
|
+
logger.warn(
|
|
72
|
+
"#{self.class}: pagination max_pages=#{requested_pages} " \
|
|
73
|
+
"exceeds system ceiling=#{context.policy.max_requests}; " \
|
|
74
|
+
"clamping to #{effective_pages}"
|
|
75
|
+
)
|
|
76
|
+
effective_pages
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
##
|
|
80
|
+
# Returns the configured request budget for the session.
|
|
81
|
+
#
|
|
82
|
+
# @return [Integer] maximum requests allowed for the feed build
|
|
83
|
+
def max_requests
|
|
84
|
+
context.policy.max_requests
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
##
|
|
88
|
+
# @param url [String, Html2rss::Url] follow-up target URL for the request
|
|
89
|
+
# @return [Boolean] whether the url was already visited in this session
|
|
90
|
+
def visited?(url)
|
|
91
|
+
visited_urls.include?(normalize_url(url))
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
##
|
|
95
|
+
# Records a visited url in the session.
|
|
96
|
+
#
|
|
97
|
+
# @param url [String, Html2rss::Url] URL used to update relation tracking state
|
|
98
|
+
# @return [Set<Html2rss::Url>] visited urls
|
|
99
|
+
def remember!(url)
|
|
100
|
+
visited_urls.add(normalize_url(url))
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
private
|
|
104
|
+
|
|
105
|
+
attr_reader :context, :strategy, :logger, :visited_urls
|
|
106
|
+
|
|
107
|
+
def execute(request_context)
|
|
108
|
+
RequestService.execute(request_context, strategy:).tap do |response|
|
|
109
|
+
logger.debug(
|
|
110
|
+
"#{self.class}: relation=#{request_context.relation} " \
|
|
111
|
+
"request_url=#{request_context.url} final_url=#{response.url} " \
|
|
112
|
+
"status=#{response.status || 'unknown'} content_type=#{response.content_type.inspect} " \
|
|
113
|
+
"bytes=#{response.body.bytesize}"
|
|
114
|
+
)
|
|
115
|
+
end
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
def normalize_url(url)
|
|
119
|
+
Html2rss::Url.from_absolute(url)
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
end
|