html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'nokogiri'
4
+
3
5
  module Html2rss
4
6
  class RequestService
5
7
  ##
@@ -7,21 +9,67 @@ module Html2rss
7
9
  class Response
8
10
  ##
9
11
  # @param body [String] the body of the response
12
+ # @param url [Html2rss::Url] the final request URL
10
13
  # @param headers [Hash] the headers of the response
11
- def initialize(body:, headers: {})
14
+ # @param status [Integer, nil] the HTTP status code when available
15
+ def initialize(body:, url:, headers: {}, status: nil)
12
16
  @body = body
13
17
 
14
18
  headers = headers.dup
15
19
  headers.transform_keys!(&:to_s)
20
+ HashUtil.assert_string_keys!(headers, context: 'response headers', deep: false)
16
21
 
17
22
  @headers = headers
23
+ @status = status
24
+ @url = url
18
25
  end
19
26
 
20
- # @return [String] the body of the response
27
+ # @return [String] the raw body of the response
21
28
  attr_reader :body
22
29
 
23
- # @return [Hash<String, Object>] the headers of the response
30
+ # @return [Hash{String => Object}] the headers of the response
24
31
  attr_reader :headers
32
+
33
+ # @return [Integer, nil] the HTTP status code when known
34
+ attr_reader :status
35
+
36
+ # @return [Html2rss::Url] the URL of the response
37
+ attr_reader :url
38
+
39
+ # @return [String] normalized content type header value
40
+ def content_type = header('content-type').to_s
41
+
42
+ # @return [Boolean] whether response content is JSON
43
+ def json_response? = content_type.include?('application/json')
44
+
45
+ # @return [Boolean] whether response content is HTML
46
+ def html_response? = content_type.include?('text/html')
47
+
48
+ ##
49
+ # @return [Nokogiri::HTML::Document, Hash] the parsed body of the response, frozen object
50
+ # @raise [UnsupportedResponseContentType] if the content type is not supported
51
+ def parsed_body
52
+ @parsed_body ||= if html_response?
53
+ Nokogiri::HTML(body).tap do |doc|
54
+ # Remove comments from the document to avoid processing irrelevant content
55
+ doc.xpath('//comment()').each(&:remove)
56
+ end.freeze
57
+ elsif json_response?
58
+ JSON.parse(body, symbolize_names: true).freeze
59
+ else
60
+ raise UnsupportedResponseContentType, "Unsupported content type: #{content_type}"
61
+ end
62
+ end
63
+
64
+ private
65
+
66
+ # @param name [String] canonical header name
67
+ # @return [Object, nil] header value when present
68
+ def header(name)
69
+ headers.fetch(name) do
70
+ headers.find { |key, _value| key.casecmp?(name) }&.last
71
+ end
72
+ end
25
73
  end
26
74
  end
27
75
  end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class RequestService
5
+ ##
6
+ # Enforces response-size limits before parsing.
7
+ class ResponseGuard
8
+ ##
9
+ # @param policy [Policy] request policy that defines byte ceilings
10
+ def initialize(policy:)
11
+ @policy = policy
12
+ @streamed_bytes = 0
13
+ end
14
+
15
+ ##
16
+ # Validates response headers and streamed byte count.
17
+ #
18
+ # @param total_bytes [Integer] cumulative byte count received so far
19
+ # @param headers [Hash, nil] response headers if known
20
+ # @return [void]
21
+ # @raise [ResponseTooLarge] if the response exceeds configured limits
22
+ def inspect_chunk!(total_bytes:, headers: nil)
23
+ header_length = headers&.fetch('content-length', headers&.fetch('Content-Length', nil))
24
+ raise_if_too_large!(header_length.to_i, policy.max_response_bytes) if header_length
25
+
26
+ @streamed_bytes = total_bytes
27
+ raise_if_too_large!(@streamed_bytes, policy.max_response_bytes)
28
+ end
29
+
30
+ ##
31
+ # Validates the final response body after middleware processing.
32
+ #
33
+ # @param body [String, nil] final response body
34
+ # @return [void]
35
+ # @raise [ResponseTooLarge] if the final body exceeds configured limits
36
+ # @raise [BlockedSurfaceDetected] if the body matches known anti-bot interstitial signatures
37
+ def inspect_body!(body)
38
+ normalized_body = body.to_s
39
+ size = normalized_body.bytesize
40
+ raise_if_too_large!(size, policy.max_decompressed_bytes)
41
+ raise_if_blocked_surface!(normalized_body)
42
+ end
43
+
44
+ private
45
+
46
+ attr_reader :policy
47
+
48
+ def raise_if_blocked_surface!(body)
49
+ signature = Html2rss::BlockedSurface.interstitial_signature_for(body)
50
+ return unless signature
51
+
52
+ raise BlockedSurfaceDetected, signature.fetch(:message)
53
+ end
54
+
55
+ def raise_if_too_large!(bytes, limit)
56
+ return unless bytes > limit
57
+
58
+ raise ResponseTooLarge, "Response exceeded #{limit} bytes"
59
+ end
60
+ end
61
+ end
62
+ end
@@ -6,13 +6,38 @@ require 'forwardable'
6
6
  module Html2rss
7
7
  ##
8
8
  # Requests website URLs to retrieve their HTML for further processing.
9
- # Provides strategies, i.e. to integrate Browserless.io.
9
+ # Provides strategies, e.g. integrating Browserless.io.
10
10
  class RequestService
11
11
  include Singleton
12
12
 
13
+ # Raised when an unknown request strategy is requested.
13
14
  class UnknownStrategy < Html2rss::Error; end
15
+ # Raised when a URL cannot be parsed or validated.
14
16
  class InvalidUrl < Html2rss::Error; end
17
+ # Raised when a URL uses an unsupported scheme.
15
18
  class UnsupportedUrlScheme < Html2rss::Error; end
19
+ # Raised when a response type cannot be parsed.
20
+ class UnsupportedResponseContentType < Html2rss::Error; end
21
+ # Raised when request limits are exceeded.
22
+ class RequestBudgetExceeded < Html2rss::Error; end
23
+ # Raised when policy denies private-network access.
24
+ class PrivateNetworkDenied < Html2rss::Error; end
25
+ # Raised when cross-origin follow-up requests are denied.
26
+ class CrossOriginFollowUpDenied < Html2rss::Error; end
27
+ # Raised when a response exceeds configured size limits.
28
+ class ResponseTooLarge < Html2rss::Error; end
29
+ # Raised when blocked content surfaces are detected.
30
+ class BlockedSurfaceDetected < Html2rss::Error; end
31
+ # Raised when a request times out.
32
+ class RequestTimedOut < Html2rss::Error; end
33
+ # Raised when Browserless configuration is missing or invalid.
34
+ class BrowserlessConfigurationError < Html2rss::Error; end
35
+ # Raised when Browserless cannot be reached.
36
+ class BrowserlessConnectionFailed < Html2rss::Error; end
37
+ # Raised when Botasaurus configuration is missing or invalid.
38
+ class BotasaurusConfigurationError < Html2rss::Error; end
39
+ # Raised when Botasaurus cannot be reached or returns invalid payloads.
40
+ class BotasaurusConnectionFailed < Html2rss::Error; end
16
41
 
17
42
  class << self
18
43
  extend Forwardable
@@ -31,6 +56,7 @@ module Html2rss
31
56
  def initialize
32
57
  @strategies = {
33
58
  faraday: FaradayStrategy,
59
+ botasaurus: BotasaurusStrategy,
34
60
  browserless: BrowserlessStrategy
35
61
  }
36
62
  @default_strategy_name = :faraday
@@ -42,6 +68,7 @@ module Html2rss
42
68
  ##
43
69
  # Sets the default strategy.
44
70
  # @param strategy [Symbol] the name of the strategy
71
+ # @return [Symbol] the selected default strategy name
45
72
  # @raise [UnknownStrategy] if the strategy is not registered
46
73
  def default_strategy_name=(strategy)
47
74
  raise UnknownStrategy unless strategy_registered?(strategy)
@@ -55,9 +82,13 @@ module Html2rss
55
82
  ##
56
83
  # Registers a new strategy.
57
84
  # @param name [Symbol] the name of the strategy
58
- # @param strategy_class [Class] the class of the strategy
85
+ # @param strategy_class [Class] the class implementing the strategy
86
+ # @return [Class] the registered strategy class
87
+ # @raise [ArgumentError] if strategy_class is not a Class
59
88
  def register_strategy(name, strategy_class)
60
- raise ArgumentError, 'Strategy class must be a Class' unless strategy_class.is_a?(Class)
89
+ unless strategy_class.is_a?(Class)
90
+ raise ArgumentError, "Expected a Class for strategy, got #{strategy_class.class}"
91
+ end
61
92
 
62
93
  @strategies[name.to_sym] = strategy_class
63
94
  end
@@ -65,7 +96,7 @@ module Html2rss
65
96
  ##
66
97
  # Checks if a strategy is registered.
67
98
  # @param name [Symbol] the name of the strategy
68
- # @return [Boolean] true if the strategy is registered, false otherwise
99
+ # @return [Boolean] true if the strategy is registered, false otherwise.
69
100
  def strategy_registered?(name)
70
101
  @strategies.key?(name.to_sym)
71
102
  end
@@ -73,24 +104,28 @@ module Html2rss
73
104
  ##
74
105
  # Unregisters a strategy.
75
106
  # @param name [Symbol] the name of the strategy
76
- # @return [Boolean] true if the strategy was unregistered, false otherwise
77
- def unregister_strategy(name)
78
- raise ArgumentError, 'Cannot unregister the default strategy' if name.to_sym == @default_strategy_name
107
+ # @return [Boolean] true if the strategy was unregistered, false otherwise.
108
+ # @raise [ArgumentError] if attempting to unregister the default strategy.
109
+ def unregister_strategy(name) # rubocop:disable Naming/PredicateMethod
110
+ name_sym = name.to_sym
111
+ raise ArgumentError, 'Cannot unregister the default strategy.' if name_sym == @default_strategy_name
79
112
 
80
- !!@strategies.delete(name.to_sym)
113
+ !!@strategies.delete(name_sym)
81
114
  end
82
115
 
83
116
  ##
84
- # Executes the request.
85
- # @param ctx [Context] the context for the request
86
- # @param strategy [Symbol] the strategy to use
87
- # @return [Response] the response from the strategy
88
- # @raise [UnknownStrategy] if the strategy is not known
117
+ # Executes the request using the specified strategy.
118
+ # @param ctx [Context] the context for the request.
119
+ # @param strategy [Symbol] the strategy to use (defaults to the default strategy).
120
+ # @return [Response] the response from the executed strategy.
121
+ # @raise [ArgumentError] if the context is nil.
122
+ # @raise [UnknownStrategy] if the strategy is not registered.
89
123
  def execute(ctx, strategy: default_strategy_name)
90
- strategy_class = @strategies.fetch(strategy) do
124
+ strategy_class = @strategies.fetch(strategy.to_sym) do
91
125
  raise UnknownStrategy,
92
- "The strategy '#{strategy}' is not known. Available strategies are: #{strategy_names.join(', ')}"
126
+ "The strategy '#{strategy}' is not known. Available strategies: #{strategy_names.join(', ')}"
93
127
  end
128
+
94
129
  strategy_class.new(ctx).execute
95
130
  end
96
131
  end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class RequestSession
5
+ ##
6
+ # Traverses a rel=next pagination chain for selector-driven extraction.
7
+ class RelNextPager
8
+ include Enumerable
9
+
10
+ ##
11
+ # @param session [RequestSession] request session used to execute follow-ups
12
+ # @param initial_response [RequestService::Response] first page response
13
+ # @param max_pages [Integer] configured page budget, including the initial page
14
+ # @param logger [Logger] logger used for pagination stop reasons
15
+ def initialize(session:, initial_response:, max_pages:, logger: Html2rss::Log)
16
+ @session = session
17
+ @initial_response = initial_response
18
+ @max_pages = max_pages
19
+ @logger = logger
20
+ end
21
+
22
+ ##
23
+ # Iterates over all paginated responses, beginning with the initial response.
24
+ #
25
+ # @yield [RequestService::Response] each page response
26
+ # @return [Enumerator] enumerator when no block is given
27
+ def each
28
+ return enum_for(:each) unless block_given?
29
+
30
+ yield initial_response
31
+
32
+ current_response = initial_response
33
+ session.effective_page_budget(max_pages).pred.times do
34
+ next_url = next_page_url(current_response)
35
+ break unless follow_up_allowed?(next_url)
36
+
37
+ current_response = fetch_follow_up_response_or_stop(next_url, current_response.url)
38
+ break unless current_response
39
+
40
+ yield current_response
41
+ end
42
+ end
43
+
44
+ private
45
+
46
+ attr_reader :session, :initial_response, :max_pages, :logger
47
+
48
+ def next_page_url(page_response)
49
+ href = page_response.parsed_body.at_css('link[rel~="next"][href], a[rel~="next"][href]')&.[]('href')
50
+ return nil if href.nil? || href.empty?
51
+
52
+ Html2rss::Url.from_relative(href, page_response.url)
53
+ end
54
+
55
+ def follow_up_allowed?(next_url)
56
+ next_url && !session.visited?(next_url)
57
+ end
58
+
59
+ def fetch_follow_up_response_or_stop(next_url, origin_url)
60
+ session.follow_up(url: next_url, relation: :pagination, origin_url:)
61
+ rescue RequestService::RequestBudgetExceeded => error
62
+ logger.warn(
63
+ "#{self.class}: pagination stopped at #{next_url} - #{error.message}. " \
64
+ "Retry with --max-requests #{session.max_requests + 1} or increase request.max_requests in the config."
65
+ )
66
+ nil
67
+ end
68
+ end
69
+ end
70
+ end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class RequestSession
5
+ ##
6
+ # Carries the runtime request inputs needed to build a RequestSession.
7
+ class RuntimeInput
8
+ ##
9
+ # @param config [Html2rss::Config] validated feed config
10
+ # @return [RuntimeInput] runtime request inputs derived from the config
11
+ def self.from_config(config)
12
+ new(
13
+ url: config.url,
14
+ headers: config.headers,
15
+ request: config.request,
16
+ strategy: config.strategy,
17
+ request_policy: RuntimePolicy.from_config(config)
18
+ )
19
+ end
20
+
21
+ ##
22
+ # @param url [String, Html2rss::Url] initial request URL
23
+ # @param headers [Hash] normalized request headers
24
+ # @param request [Hash] validated request options for strategies
25
+ # @param strategy [Symbol] request strategy to use for the session
26
+ # @param request_policy [RequestService::Policy] request policy for the session
27
+ def initialize(url:, headers:, request:, strategy:, request_policy:)
28
+ @url = Html2rss::Url.from_absolute(url)
29
+ @headers = normalize_headers(headers).freeze
30
+ @request = normalize_request(request).freeze
31
+ @strategy = strategy
32
+ @request_policy = request_policy
33
+ freeze
34
+ end
35
+
36
+ ##
37
+ # @return [Html2rss::Url] initial request URL
38
+ attr_reader :url
39
+
40
+ ##
41
+ # @return [Hash] normalized request headers
42
+ attr_reader :headers
43
+
44
+ ##
45
+ # @return [Hash] validated request options for strategies
46
+ attr_reader :request
47
+
48
+ ##
49
+ # @return [Symbol] request strategy to use for the session
50
+ attr_reader :strategy
51
+
52
+ ##
53
+ # @return [RequestService::Policy] policy derived from the runtime request inputs
54
+ attr_reader :request_policy
55
+
56
+ private
57
+
58
+ def normalize_headers(headers)
59
+ headers.to_h do |key, value|
60
+ [key.to_s, value]
61
+ end
62
+ end
63
+
64
+ def normalize_request(request)
65
+ normalized = HashUtil.deep_symbolize_keys(request, context: 'request')
66
+ HashUtil.assert_symbol_keys!(normalized, context: 'request')
67
+ normalized
68
+ end
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class RequestSession
5
+ ##
6
+ # Builds the runtime request policy for a feed run.
7
+ class RuntimePolicy
8
+ ##
9
+ # @param config [Html2rss::Config] validated feed config
10
+ # @return [Html2rss::RequestService::Policy] request policy derived from runtime config
11
+ def self.from_config(config)
12
+ RequestService::Policy.new(
13
+ max_requests: effective_max_requests_for(config),
14
+ max_redirects: config.max_redirects
15
+ )
16
+ end
17
+
18
+ class << self
19
+ private
20
+
21
+ def effective_max_requests_for(config)
22
+ return config.max_requests if config.explicit_max_requests?
23
+
24
+ [baseline_request_budget_for(config), config.max_requests].max
25
+ end
26
+
27
+ # Reserve enough budget for the initial request plus predictable follow-ups
28
+ # that the top-level pipeline may trigger during a normal feed build.
29
+ def baseline_request_budget_for(config)
30
+ 1 + pagination_follow_up_budget_for(config) +
31
+ known_auto_source_follow_up_budget_for(config) +
32
+ auto_strategy_fallback_budget_for(config) +
33
+ browserless_preload_budget_for(config)
34
+ end
35
+
36
+ def auto_strategy_fallback_budget_for(config)
37
+ return 0 unless config.strategy == :auto
38
+
39
+ [FeedPipeline::AutoFallback::CHAIN.size - 1, 0].max
40
+ end
41
+
42
+ def pagination_follow_up_budget_for(config)
43
+ [config.selectors&.dig(:items, :pagination, :max_pages).to_i - 1, 0].max
44
+ end
45
+
46
+ def known_auto_source_follow_up_budget_for(config)
47
+ config.auto_source&.dig(:scraper, :wordpress_api, :enabled) ? 1 : 0
48
+ end
49
+
50
+ def browserless_preload_budget_for(config)
51
+ preload = config.request.dig(:browserless, :preload)
52
+ return 0 unless preload
53
+
54
+ top_level_preload_wait_budget(preload) +
55
+ click_selector_preload_budget(preload) +
56
+ scroll_preload_budget(preload)
57
+ end
58
+
59
+ def top_level_preload_wait_budget(preload)
60
+ preload[:wait_after_ms] ? 2 : 0
61
+ end
62
+
63
+ def click_selector_preload_budget(preload)
64
+ preload.fetch(:click_selectors, []).sum { preload_action_budget(_1, :max_clicks) }
65
+ end
66
+
67
+ def scroll_preload_budget(preload)
68
+ scroll = preload[:scroll_down]
69
+ return 0 unless scroll
70
+
71
+ preload_action_budget(scroll, :iterations)
72
+ end
73
+
74
+ def preload_action_budget(config, count_key)
75
+ action_count = config.fetch(count_key, 1)
76
+ wait_budget = config[:wait_after_ms] ? action_count : 0
77
+
78
+ action_count + wait_budget
79
+ end
80
+ end
81
+ end
82
+ end
83
+ end
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ ##
5
+ # Coordinates multi-request feed builds on top of RequestService.
6
+ class RequestSession
7
+ class << self
8
+ ##
9
+ # Builds a request session from translated runtime request inputs.
10
+ #
11
+ # @param runtime_input [RuntimeInput] translated runtime request inputs
12
+ # @param budget [RequestService::Budget, nil] optional shared budget for multi-attempt runs
13
+ # @param logger [Logger] logger used for operational warnings
14
+ # @return [RequestSession] configured request session
15
+ def from_runtime_input(runtime_input, budget: nil, logger: Html2rss::Log) # rubocop:disable Metrics/MethodLength
16
+ context_options = {
17
+ url: runtime_input.url,
18
+ headers: runtime_input.headers,
19
+ request: runtime_input.request,
20
+ policy: runtime_input.request_policy
21
+ }
22
+ context_options[:budget] = budget unless budget.nil?
23
+
24
+ new(
25
+ context: RequestService::Context.new(**context_options),
26
+ strategy: runtime_input.strategy,
27
+ logger:
28
+ )
29
+ end
30
+ end
31
+
32
+ ##
33
+ # @param context [RequestService::Context] initial request context
34
+ # @param strategy [Symbol] request strategy to use for all requests in the session
35
+ # @param logger [Logger] logger used for operational warnings
36
+ def initialize(context:, strategy:, logger: Html2rss::Log)
37
+ @context = context
38
+ @strategy = strategy
39
+ @logger = logger
40
+ @visited_urls = Set.new
41
+ end
42
+
43
+ ##
44
+ # Executes the initial request for the session.
45
+ #
46
+ # @return [RequestService::Response] initial response
47
+ def fetch_initial_response
48
+ execute(context).tap { |response| remember!(response.url) }
49
+ end
50
+
51
+ ##
52
+ # Executes a follow-up request sharing policy, headers, and budget.
53
+ #
54
+ # @param url [String, Html2rss::Url] follow-up request url
55
+ # @param relation [Symbol] why the follow-up is being made
56
+ # @param origin_url [String, Html2rss::Url] effective origin for same-origin checks
57
+ # @return [RequestService::Response] follow-up response
58
+ def follow_up(url:, relation:, origin_url:)
59
+ execute(context.follow_up(url:, relation:, origin_url:)).tap { |response| remember!(response.url) }
60
+ end
61
+
62
+ ##
63
+ # Returns the effective page budget after applying the policy ceiling.
64
+ #
65
+ # @param requested_pages [Integer] configured page budget
66
+ # @return [Integer] effective page budget for the session
67
+ def effective_page_budget(requested_pages)
68
+ effective_pages = [requested_pages, context.policy.max_requests].min
69
+ return effective_pages if effective_pages == requested_pages
70
+
71
+ logger.warn(
72
+ "#{self.class}: pagination max_pages=#{requested_pages} " \
73
+ "exceeds system ceiling=#{context.policy.max_requests}; " \
74
+ "clamping to #{effective_pages}"
75
+ )
76
+ effective_pages
77
+ end
78
+
79
+ ##
80
+ # Returns the configured request budget for the session.
81
+ #
82
+ # @return [Integer] maximum requests allowed for the feed build
83
+ def max_requests
84
+ context.policy.max_requests
85
+ end
86
+
87
+ ##
88
+ # @param url [String, Html2rss::Url] follow-up target URL for the request
89
+ # @return [Boolean] whether the url was already visited in this session
90
+ def visited?(url)
91
+ visited_urls.include?(normalize_url(url))
92
+ end
93
+
94
+ ##
95
+ # Records a visited url in the session.
96
+ #
97
+ # @param url [String, Html2rss::Url] URL used to update relation tracking state
98
+ # @return [Set<Html2rss::Url>] visited urls
99
+ def remember!(url)
100
+ visited_urls.add(normalize_url(url))
101
+ end
102
+
103
+ private
104
+
105
+ attr_reader :context, :strategy, :logger, :visited_urls
106
+
107
+ def execute(request_context)
108
+ RequestService.execute(request_context, strategy:).tap do |response|
109
+ logger.debug(
110
+ "#{self.class}: relation=#{request_context.relation} " \
111
+ "request_url=#{request_context.url} final_url=#{response.url} " \
112
+ "status=#{response.status || 'unknown'} content_type=#{response.content_type.inspect} " \
113
+ "bytes=#{response.body.bytesize}"
114
+ )
115
+ end
116
+ end
117
+
118
+ def normalize_url(url)
119
+ Html2rss::Url.from_absolute(url)
120
+ end
121
+ end
122
+ end