html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
data/lib/html2rss/request_service/botasaurus_contract.rb
@@ -0,0 +1,161 @@
+ # frozen_string_literal: true
+
+ require 'json'
+
+ ##
+ # Main html2rss namespace.
+ module Html2rss
+   ##
+   # Request transport orchestration and strategies.
+   class RequestService
+     ##
+     # Maps html2rss request/response handling to the botasaurus-scrape-api contract.
+     class BotasaurusContract
+       # Default Botasaurus scrape options when no explicit config is provided.
+       DEFAULT_OPTIONS = {
+         navigation_mode: 'auto',
+         max_retries: 2,
+         headless: false
+       }.freeze
+
+       # Allowlisted request.botasaurus keys forwarded to upstream.
+       OPTION_KEYS = %i[
+         navigation_mode
+         max_retries
+         wait_for_selector
+         wait_timeout_seconds
+         block_images
+         block_images_and_css
+         wait_for_complete_page_load
+         headless
+         proxy
+         user_agent
+         window_size
+         lang
+       ].freeze
+
+       # Parsed Botasaurus response wrapper.
+       class ParsedResponse
+         # Fallback headers when upstream omits response headers.
+         DEFAULT_HEADERS = { 'content-type' => 'text/html' }.freeze
+
+         # @param payload [Hash{String => Object}] parsed Botasaurus response payload
+         # @param transport_status [Integer] HTTP status returned by Botasaurus
+         def initialize(payload:, transport_status:)
+           @payload = payload
+           @transport_status = transport_status
+         end
+
+         # @return [Boolean] true when upstream classified request as challenge blocked
+         def challenge_block? = error_category == 'challenge_block'
+
+         # @return [Boolean] true when upstream returned non-200 or an error payload
+         def upstream_failure?
+           status != 200 || error_message?
+         end
+
+         # @return [String] normalized challenge error message
+         def challenge_message
+           error || 'Botasaurus challenge block detected.'
+         end
+
+         # @return [String] actionable upstream failure summary
+         def upstream_failure_message
+           details = ["status=#{status}"]
+           details << "error_category=#{error_category}" if error_category
+           details << "error=#{error}" if error
+           details << "request_id=#{request_id}" if request_id
+           "Botasaurus scrape failed (#{details.join(', ')})."
+         end
+
+         # @return [String] rendered HTML body from Botasaurus
+         # @raise [BotasaurusConnectionFailed] when html is missing
+         def html
+           value = payload['html']
+           raise BotasaurusConnectionFailed, "Botasaurus response missing required 'html' field" if value.nil?
+
+           value.to_s
+         end
+
+         # @return [Hash{String => String}] normalized response headers
+         def headers
+           raw_headers = payload['headers']
+           return DEFAULT_HEADERS.dup unless raw_headers.is_a?(Hash) && raw_headers.any?
+
+           raw_headers.to_h { |key, value| [key.to_s, value.to_s] }
+         end
+
+         # @return [Integer] resolved status code (payload status_code or transport status)
+         def status
+           status_code = payload['status_code']
+           status_code.is_a?(Integer) ? status_code : transport_status
+         end
+
+         # @return [String, nil] final URL reported by upstream
+         def final_url = payload['final_url']
+
+         private
+
+         attr_reader :payload, :transport_status
+
+         def error = payload['error']
+
+         def request_id = payload['request_id']
+
+         def error_category = payload['error_category']
+
+         def error_message?
+           value = error
+           value.is_a?(String) ? !value.empty? : !value.nil?
+         end
+       end
+
+       ##
+       # @param url [Html2rss::Url] canonical URL to scrape
+       # @param options [Hash] validated request.botasaurus options
+       # @option options [String] :navigation_mode
+       # @option options [Integer] :max_retries
+       # @option options [String] :wait_for_selector
+       # @option options [Integer] :wait_timeout_seconds
+       # @option options [Boolean] :block_images
+       # @option options [Boolean] :block_images_and_css
+       # @option options [Boolean] :wait_for_complete_page_load
+       # @option options [Boolean] :headless
+       # @option options [String] :proxy
+       # @option options [String] :user_agent
+       # @option options [Array<Integer>] :window_size
+       # @option options [String] :lang
+       def initialize(url:, options: {})
+         @url = url
+         @options = options
+       end
+
+       # @return [Hash] payload for POST /scrape
+       def request_payload
+         DEFAULT_OPTIONS.merge(filtered_options).merge(url: url.to_s)
+       end
+
+       # @param transport_response [Faraday::Response] upstream HTTP response
+       # @return [ParsedResponse]
+       # @raise [BotasaurusConnectionFailed] when payload is not a valid JSON object
+       def parse_response(transport_response)
+         payload = JSON.parse(transport_response.body.to_s)
+         raise BotasaurusConnectionFailed, 'Botasaurus response must be a JSON object' unless payload.is_a?(Hash)
+
+         ParsedResponse.new(payload:, transport_status: transport_response.status)
+       rescue JSON::ParserError => error
+         raise BotasaurusConnectionFailed, "Botasaurus response JSON parse failed: #{error.message}"
+       end
+
+       private
+
+       attr_reader :url, :options
+
+       def filtered_options
+         OPTION_KEYS.each_with_object({}) do |key, normalized|
+           normalized[key] = options[key] if options.key?(key)
+         end
+       end
+     end
+   end
+ end
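Note on the contract above: request_payload layers the caller's allowlisted options over DEFAULT_OPTIONS and appends the target URL, so keys outside OPTION_KEYS never reach the upstream API. A minimal sketch of that merge (the option values here are invented for illustration):

    contract = Html2rss::RequestService::BotasaurusContract.new(
      url: Html2rss::Url.from_absolute('https://example.com/news'),
      options: { max_retries: 5, not_an_option: true } # example values
    )
    contract.request_payload
    # => { navigation_mode: 'auto', max_retries: 5, headless: false,
    #      url: 'https://example.com/news' }
    # :not_an_option is dropped silently because it is not in OPTION_KEYS.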
data/lib/html2rss/request_service/botasaurus_strategy.rb
@@ -0,0 +1,98 @@
+ # frozen_string_literal: true
+
+ require 'faraday'
+ require 'json'
+
+ module Html2rss
+   class RequestService
+     ##
+     # Strategy to delegate fetching to a Botasaurus scrape API.
+     class BotasaurusStrategy < Strategy
+       ##
+       # Executes a Botasaurus-backed request with shared request policy guards.
+       #
+       # @return [Response] normalized request response
+       # @raise [BotasaurusConfigurationError] when BOTASAURUS_SCRAPER_URL is missing or invalid
+       # @raise [BotasaurusConnectionFailed] when Botasaurus cannot be reached or returns an invalid payload
+       # @raise [RequestTimedOut] when the Botasaurus request exceeds configured timeout
+       def execute
+         validate_request!
+         transport_response = client.post('/scrape', JSON.generate(contract.request_payload), content_type_header)
+         parsed_response = contract.parse_response(transport_response)
+         raise_if_challenge_blocked!(parsed_response)
+         raise_if_upstream_failed!(parsed_response)
+         build_response(parsed_response)
+       rescue Faraday::TimeoutError, Timeout::Error => error
+         raise RequestTimedOut, error.message
+       rescue Faraday::ConnectionFailed, Faraday::SSLError => error
+         raise BotasaurusConnectionFailed, "Botasaurus connection failed: #{error.message}"
+       end
+
+       private
+
+       def validate_request!
+         ctx.budget.consume!
+         ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
+       end
+
+       def build_response(parsed_response)
+         body = parsed_response.html
+         ResponseGuard.new(policy: ctx.policy).inspect_body!(body)
+
+         Response.new(
+           body:,
+           headers: parsed_response.headers,
+           url: response_url(parsed_response.final_url),
+           status: parsed_response.status
+         )
+       end
+
+       def raise_if_challenge_blocked!(parsed_response)
+         return unless parsed_response.challenge_block?
+
+         raise BlockedSurfaceDetected, "Blocked surface detected: #{parsed_response.challenge_message}"
+       end
+
+       def raise_if_upstream_failed!(parsed_response)
+         return unless parsed_response.upstream_failure?
+
+         raise BotasaurusConnectionFailed, parsed_response.upstream_failure_message
+       end
+
+       def response_url(final_url)
+         return ctx.url if final_url.nil?
+
+         Html2rss::Url.from_absolute(final_url)
+       rescue ArgumentError
+         ctx.url
+       end
+
+       def contract
+         @contract ||= BotasaurusContract.new(url: ctx.url, options: ctx.request.fetch(:botasaurus, {}))
+       end
+
+       def client
+         @client ||= Faraday.new(url: scraper_base_url.to_s, request: request_options)
+       end
+
+       def request_options
+         { timeout: ctx.policy.total_timeout_seconds }
+       end
+
+       def content_type_header
+         { 'Content-Type' => 'application/json' }
+       end
+
+       def scraper_base_url
+         @scraper_base_url ||= begin
+           configured = ENV.fetch('BOTASAURUS_SCRAPER_URL') do
+             raise BotasaurusConfigurationError, 'BOTASAURUS_SCRAPER_URL is required for strategy=botasaurus.'
+           end
+           Html2rss::Url.for_channel(configured)
+         rescue ArgumentError => error
+           raise BotasaurusConfigurationError, "BOTASAURUS_SCRAPER_URL is invalid: #{error.message}"
+         end
+       end
+     end
+   end
+ end
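The strategy consumes the shared budget, posts the contract payload to /scrape, and converts challenge blocks and upstream failures into typed errors before any body is returned. A wiring sketch under stated assumptions: the constructor taking a Context is assumed from the `ctx` reader (the base Strategy class is not part of this diff), and the endpoint value is an example:

    # Without BOTASAURUS_SCRAPER_URL, execute raises BotasaurusConfigurationError
    # before any request is made.
    ENV['BOTASAURUS_SCRAPER_URL'] = 'http://localhost:8000' # example value
    ctx = Html2rss::RequestService::Context.new(
      url: 'https://example.com/news',
      request: { botasaurus: { wait_for_selector: 'article' } }
    )
    response = Html2rss::RequestService::BotasaurusStrategy.new(ctx).execute
    response.status # => 200 on a successful scrape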
data/lib/html2rss/request_service/browserless_strategy.rb
@@ -31,23 +31,119 @@ module Html2rss
      # are aligned with the default values.
      # @see https://github.com/browserless/browserless/pkgs/container/chromium
      class BrowserlessStrategy < Strategy
-       # return [Response]
+       ##
+       # Executes a Browserless-backed request with the shared request policy.
+       #
+       # @return [Response] normalized request response
+       # @raise [RequestTimedOut] if the browser session exceeds the configured timeout
        def execute
-         Puppeteer.connect(browser_ws_endpoint:) do |browser|
-           PuppetCommander.new(ctx, browser).call
-         ensure
-           browser.disconnect
-         end
+         validate_request!
+         execute_browserless_request
+       rescue Puppeteer::TimeoutError => error
+         raise RequestTimedOut, error.message
        end

+       ##
+       # @return [String] the Browserless websocket endpoint with token query param
+       # @raise [ArgumentError] if a custom endpoint is configured without an API token
        def browser_ws_endpoint
          @browser_ws_endpoint ||= begin
-           api_token = ENV.fetch('BROWSERLESS_IO_API_TOKEN', '6R0W53R135510')
            ws_url = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', 'ws://127.0.0.1:3000')
+           api_token = browserless_api_token(ws_url)

            "#{ws_url}?token=#{api_token}"
          end
        end
+
+       private
+
+       def validate_request!
+         ctx.budget.consume!
+         ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
+       end
+
+       def execute_browserless_request
+         connect_with_timeout_support do |browser|
+           PuppetCommander.new(ctx, browser).call
+         ensure
+           browser.disconnect
+         end
+       end
+
+       def protocol_timeout_ms
+         ctx.policy.total_timeout_seconds * 1000
+       end
+
+       def connect_with_timeout_support(&)
+         connect_browserless(protocol_timeout: protocol_timeout_ms, &)
+       rescue ArgumentError => error
+         raise unless unsupported_protocol_timeout?(error)
+
+         connect_browserless(&)
+       end
+
+       def unsupported_protocol_timeout?(error)
+         error.message.include?('unknown keyword: :protocol_timeout')
+       end
+
+       def connect_browserless(protocol_timeout: nil, &)
+         connected = false
+
+         Puppeteer.connect(**browserless_connect_options(protocol_timeout)) do |browser|
+           connected = true
+           yield browser
+         end
+       rescue ArgumentError => error
+         handle_connection_error(error, connected:, protocol_timeout:)
+       rescue StandardError => error
+         handle_connection_error(error, connected:)
+       end
+
+       def browserless_connect_options(protocol_timeout)
+         { browser_ws_endpoint:, protocol_timeout: }.compact
+       end
+
+       def handle_connection_error(error, connected:, protocol_timeout: nil)
+         raise if connected || compatibility_timeout_error?(error, protocol_timeout:)
+
+         raise BrowserlessConnectionFailed, browserless_connection_message(error), cause: error
+       end
+
+       def compatibility_timeout_error?(error, protocol_timeout:)
+         protocol_timeout && unsupported_protocol_timeout?(error)
+       end
+
+       def browserless_connection_message(error)
+         base = "Browserless connection failed (#{error.class}: #{error.message})."
+         endpoint_hint = "Check BROWSERLESS_IO_WEBSOCKET_URL (currently #{browserless_websocket_url})."
+         token_hint = 'Check BROWSERLESS_IO_API_TOKEN and ensure it matches your Browserless TOKEN.'
+         local_hint = 'For local Browserless, confirm the service is running and reachable.'
+
+         if likely_authentication_error?(error)
+           "#{base} #{token_hint} #{endpoint_hint}"
+         else
+           "#{base} #{endpoint_hint} #{token_hint} #{local_hint}"
+         end
+       end
+
+       def likely_authentication_error?(error)
+         message = error.message.downcase
+         message.include?('unauthorized') || message.include?('forbidden') || message.include?('401')
+       end
+
+       def browserless_websocket_url
+         ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', 'ws://127.0.0.1:3000')
+       end
+
+       def browserless_api_token(ws_url)
+         ENV.fetch('BROWSERLESS_IO_API_TOKEN') do
+           return '6R0W53R135510' if ws_url == 'ws://127.0.0.1:3000'
+
+           raise BrowserlessConfigurationError,
+                 'BROWSERLESS_IO_API_TOKEN is required for custom Browserless endpoints. ' \
+                 'Set BROWSERLESS_IO_API_TOKEN or use ws://127.0.0.1:3000 for local defaults.'
+         end
+       end
      end
    end
  end
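The behavioral change worth flagging in this hunk: the well-known default token is now only a fallback for the default local endpoint, while custom endpoints must supply their own token. A sketch of browserless_api_token's two paths (`strategy` stands for a fresh BrowserlessStrategy instance in each scenario, since browser_ws_endpoint is memoized; the custom hostname is an example):

    ENV.delete('BROWSERLESS_IO_API_TOKEN')

    ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = 'ws://127.0.0.1:3000'
    strategy.browser_ws_endpoint # => "ws://127.0.0.1:3000?token=6R0W53R135510"

    ENV['BROWSERLESS_IO_WEBSOCKET_URL'] = 'wss://chrome.example.com'
    strategy.browser_ws_endpoint # raises BrowserlessConfigurationError,
    # rather than silently sending the public default token to a custom endpoint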
data/lib/html2rss/request_service/budget.rb
@@ -0,0 +1,39 @@
+ # frozen_string_literal: true
+
+ module Html2rss
+   class RequestService
+     ##
+     # Tracks how many outbound requests a single feed build may still perform.
+     class Budget
+       ##
+       # @param max_requests [Integer] the maximum number of requests allowed
+       def initialize(max_requests:)
+         unless max_requests.is_a?(Integer) && max_requests.positive?
+           raise ArgumentError, 'max_requests must be positive'
+         end
+
+         @remaining = max_requests
+         @mutex = Mutex.new
+       end
+
+       ##
+       # Consumes one request from the budget.
+       #
+       # @return [Integer] remaining request count after consumption
+       # @raise [RequestBudgetExceeded] if no requests remain
+       def consume!
+         @mutex.synchronize do
+           raise RequestBudgetExceeded, 'Request budget exhausted' if @remaining.zero?
+
+           @remaining -= 1
+         end
+       end
+
+       ##
+       # @return [Integer] requests still available
+       def remaining
+         @mutex.synchronize { @remaining }
+       end
+     end
+   end
+ end
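Both strategies above call ctx.budget.consume! before fetching, so one Budget instance caps the total outbound requests of a feed build. Usage follows directly from the class:

    budget = Html2rss::RequestService::Budget.new(max_requests: 2)
    budget.consume!  # => 1 (consume! returns the remaining count)
    budget.consume!  # => 0
    budget.remaining # => 0
    budget.consume!  # raises RequestBudgetExceeded, 'Request budget exhausted'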
data/lib/html2rss/request_service/context.rb
@@ -1,45 +1,101 @@
  # frozen_string_literal: true

- require 'addressable/uri'
-
  module Html2rss
    class RequestService
      ##
      # Holds information needed to send requests to websites.
      # To be passed down to the RequestService's strategies.
      class Context
-       SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
-
        ##
-       # @param url [String, Addressable::URI] the URL to request
+       # @param url [String, Html2rss::Url] the URL to request
        # @param headers [Hash] HTTP request headers
-       def initialize(url:, headers: {})
-         @url = Addressable::URI.parse(url)
-         assert_valid_url!
-
-         @headers = headers
+       # @param request [Hash] request specific options passed to strategies
+       # @param request_options [Hash] runtime request options
+       # @option request_options [Symbol] :relation why this request is being made
+       # @option request_options [String, Html2rss::Url, nil] :origin_url originating URL for same-origin checks
+       # @option request_options [Policy] :policy runtime request policy
+       # @option request_options [Budget] :budget shared request budget for the feed build
+       # @raise [ArgumentError] if policy or budget is explicitly nil
+       def initialize(url:, headers: {}, request: {}, **request_options)
+         @url = Html2rss::Url.from_absolute(url)
+         @headers = normalize_headers(headers).freeze
+         @request = normalize_request(request).freeze
+         assign_request_options(request_options)
        end

-       # @return [Addressable::URI] the parsed URL
+       # @return [Html2rss::Url] the parsed and normalized URL
        attr_reader :url

        # @return [Hash] the HTTP request headers
        attr_reader :headers

-       private
+       # @return [Hash] the request specific options
+       attr_reader :request
+
+       # @return [Hash] browserless specific options
+       def browserless = request.fetch(:browserless, {})
+
+       # @return [Hash, nil] preload options for browserless requests
+       def browserless_preload = browserless[:preload]
+
+       # @return [Symbol] the request relation
+       attr_reader :relation
+
+       # @return [Html2rss::Url] the initial URL for the feed build
+       attr_reader :origin_url
+
+       # @return [Policy] the runtime request policy
+       attr_reader :policy
+
+       # @return [Budget] the shared request budget
+       attr_reader :budget

        ##
-       # Validates the URL.
-       # @raise [InvalidUrl] if the URL is not valid
-       # @raise [UnsupportedUrlScheme] if the URL scheme is not supported
-       def assert_valid_url!
-         raise InvalidUrl, 'URL must be absolute' unless url.absolute?
-         raise InvalidUrl, 'URL must not contain an @ character' if url.to_s.include?('@')
+       # Builds a follow-up request context sharing headers, budget, and policy.
+       #
+       # @param url [String, Html2rss::Url] the follow-up URL
+       # @param relation [Symbol] why the follow-up is being made
+       # @param origin_url [String, Html2rss::Url] effective origin for same-origin checks
+       # @return [Context] derived request context
+       def follow_up(url:, relation:, origin_url: self.origin_url)
+         self.class.new(
+           url:,
+           headers:,
+           request:,
+           relation:,
+           origin_url:,
+           policy:,
+           budget:
+         )
+       end
+
+       private

-         return if SUPPORTED_URL_SCHEMES.include?(url.scheme)
+       def assign_request_options(request_options)
+         @relation = request_options.fetch(:relation, :initial)
+         @policy = request_options.fetch(:policy, Policy.default)
+         raise ArgumentError, 'policy must not be nil' if @policy.nil?
+
+         @origin_url = normalized_origin_url(request_options[:origin_url])
+         @budget = request_options.fetch(:budget) { Budget.new(max_requests: policy.max_requests) }
+         raise ArgumentError, 'budget must not be nil' if @budget.nil?
+       end
+
+       def normalized_origin_url(origin_url)
+         source = origin_url || @url
+         Html2rss::Url.from_absolute(source)
+       end
+
+       def normalize_headers(headers)
+         headers.to_h do |key, value|
+           [key.to_s, value]
+         end
+       end

-       raise UnsupportedUrlScheme,
-             "URL scheme '#{url.scheme}' is not supported"
+       def normalize_request(request)
+         normalized = HashUtil.deep_symbolize_keys(request, context: 'request')
+         HashUtil.assert_symbol_keys!(normalized, context: 'request')
+         normalized
        end
      end
    end
  end
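Context#follow_up is the piece that ties the new runtime objects together: a derived request keeps the same headers, policy, and (crucially) the same Budget instance, while changing only the URL and relation. A short sketch (URLs are examples, and :pagination is an illustrative relation symbol, not a value enumerated in this diff):

    ctx = Html2rss::RequestService::Context.new(
      url: 'https://example.com/articles',
      headers: { 'User-Agent' => 'html2rss' }
    )
    next_ctx = ctx.follow_up(url: 'https://example.com/articles?page=2',
                             relation: :pagination)
    next_ctx.budget.equal?(ctx.budget) # => true, one Budget per feed build
    next_ctx.relation                  # => :pagination
    next_ctx.origin_url                # the initial URL, kept for same-origin checks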