html2rss 0.16.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. checksums.yaml +4 -4
  2. data/README.md +48 -657
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +7 -4
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +120 -46
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'cgi'
4
+
5
module Html2rss
  module Rendering
    ##
    # Produces the HTML markup of a `<video>` element with a single `<source>`.
    #
    # The URL and the type are converted with `to_s` and HTML-escaped before
    # they are interpolated into the attribute values.
    class VideoRenderer
      ##
      # @param url [#to_s] address of the video resource
      # @param type [#to_s] value written to the `type` attribute of `<source>`
      def initialize(url:, type:)
        @type = type
        @url = url
      end

      ##
      # @return [String] a `<video>` element wrapping one `<source>` element
      def to_html
        source_url = CGI.escapeHTML(@url.to_s)
        source_type = CGI.escapeHTML(@type.to_s)

        # NOTE(review): the whitespace between the tags is part of the returned
        # string; confirm the indentation matches the packaged file.
        %(<video controls preload="none" referrerpolicy="no-referrer" crossorigin="anonymous" playsinline>
          <source src="#{source_url}" type="#{source_type}">
        </video>)
      end
    end
  end
end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  ##
  # Namespace holding the HTML renderers that produce rich content (images,
  # audio, video, embedded documents) for feed descriptions.
  #
  # The module itself defines no behavior; it only groups the renderer classes.
  #
  # @example
  #   Html2rss::Rendering::ImageRenderer.new(...).to_html
  #   Html2rss::Rendering::MediaRenderer.for(...)
  #
  # @see Html2rss::Rendering::DescriptionBuilder
  module Rendering; end
end
@@ -0,0 +1,128 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  ##
  # Frozen record of the runtime request controls (strategy, redirect limit,
  # request budget) together with the names of the controls that were
  # explicitly supplied.
  class RequestControls
    TOP_LEVEL_KEYS = %i[strategy].freeze
    REQUEST_KEYS = %i[max_redirects max_requests].freeze

    class << self
      ##
      # Extracts the controls from a raw config hash. `strategy` is read from
      # the top level; `max_redirects` and `max_requests` are read from the
      # nested `request` hash. Symbol and String keys are both understood.
      #
      # @param config [Hash<Symbol, Object>, Hash<String, Object>] raw config input
      # @return [RequestControls] request controls extracted from the config hash
      def from_config(config)
        new(
          explicit_keys: explicit_keys_for(config),
          strategy: value_for(config, :strategy),
          max_redirects: request_value_for(config, :max_redirects),
          max_requests: request_value_for(config, :max_requests)
        )
      end

      private

      # Names of all controls whose key is present in the config, top-level first.
      def explicit_keys_for(config)
        top_level = TOP_LEVEL_KEYS.select { |key| top_level_key?(config, key) }
        nested = REQUEST_KEYS.select { |key| request_key?(config, key) }
        top_level + nested
      end

      # Looks the key up as a Symbol first, then as a String; nil when absent.
      def value_for(config, key)
        [key, key.to_s].each do |candidate|
          return config[candidate] if config.key?(candidate)
        end

        nil
      end

      # Reads a key from the nested `request` hash; nil unless that value is a Hash.
      def request_value_for(config, key)
        request_config = value_for(config, :request)
        request_config.is_a?(Hash) ? value_for(request_config, key) : nil
      end

      # Whether the hash contains the key in Symbol or String form.
      def top_level_key?(config, key)
        [key, key.to_s].any? { |candidate| config.key?(candidate) }
      end

      # Whether the nested `request` hash exists and contains the key.
      def request_key?(config, key)
        request_config = value_for(config, :request)
        request_config.is_a?(Hash) && top_level_key?(request_config, key)
      end
    end

    ##
    # @param strategy [Symbol, nil] effective request strategy
    # @param max_redirects [Integer, nil] effective redirect limit
    # @param max_requests [Integer, nil] effective request budget
    # @param explicit_keys [Array<Symbol>] controls explicitly supplied by the caller
    def initialize(strategy: nil, max_redirects: nil, max_requests: nil, explicit_keys: [])
      @explicit_keys = explicit_keys.map(&:to_sym).uniq.freeze
      @strategy = strategy
      @max_redirects = max_redirects
      @max_requests = max_requests
      # The instance is immutable once built.
      freeze
    end

    ##
    # @return [Symbol, nil] effective request strategy
    attr_reader :strategy

    ##
    # @return [Integer, nil] effective redirect limit
    attr_reader :max_redirects

    ##
    # @return [Integer, nil] effective request budget
    attr_reader :max_requests

    ##
    # @param name [Symbol, String] request control name
    # @return [Boolean] whether the control was explicitly supplied
    def explicit?(name)
      explicit_keys.include?(name.to_sym)
    end

    ##
    # Returns a new instance carrying the given values and the same explicit keys.
    #
    # @param strategy [Symbol, nil] validated request strategy
    # @param max_redirects [Integer, nil] validated redirect limit
    # @param max_requests [Integer, nil] validated request budget
    # @return [RequestControls] controls updated with validated effective values
    def with_effective_values(strategy:, max_redirects:, max_requests:)
      self.class.new(
        strategy: strategy,
        max_redirects: max_redirects,
        max_requests: max_requests,
        explicit_keys: explicit_keys
      )
    end

    ##
    # Writes only the explicitly set controls into the given config hash.
    # Controls that were not explicit leave the hash untouched.
    #
    # @param config [Hash<Symbol, Object>] mutable config hash
    # @return [Hash<Symbol, Object>] the same hash with explicit controls written
    # @raise [ArgumentError] if an explicit request control must be written but
    #   `config[:request]` is neither nil nor a Hash
    def apply_to(config)
      config[:strategy] = strategy if explicit?(:strategy)
      { max_redirects:, max_requests: }.each do |key, value|
        apply_request_value(config, key, value)
      end
      config
    end

    private

    attr_reader :explicit_keys

    # Stores the value under config[:request][key] when the control is explicit.
    def apply_request_value(config, key, value)
      return unless explicit?(key)

      ensure_request_config!(config)
      config[:request][key] = value
    end

    # Guarantees config[:request] is a Hash, creating an empty one when it is nil.
    def ensure_request_config!(config)
      case config[:request]
      when nil then config[:request] = {}
      when Hash then nil
      else raise ArgumentError, 'request config must be a hash'
      end
    end
  end
end
@@ -31,23 +31,119 @@ module Html2rss
31
31
  # are aligned with the default values.
32
32
  # @see https://github.com/browserless/browserless/pkgs/container/chromium
33
33
  class BrowserlessStrategy < Strategy
34
- # return [Response]
34
+ ##
35
+ # Executes a Browserless-backed request with the shared request policy.
36
+ #
37
+ # @return [Response] normalized request response
38
+ # @raise [RequestTimedOut] if the browser session exceeds the configured timeout
35
39
  def execute
36
- Puppeteer.connect(browser_ws_endpoint:) do |browser|
37
- PuppetCommander.new(ctx, browser).call
38
- ensure
39
- browser.disconnect
40
- end
40
+ validate_request!
41
+ execute_browserless_request
42
+ rescue Puppeteer::TimeoutError => error
43
+ raise RequestTimedOut, error.message
41
44
  end
42
45
 
46
+ ##
47
+ # @return [String] the Browserless websocket endpoint with token query param
48
+ # @raise [ArgumentError] if a custom endpoint is configured without an API token
43
49
  def browser_ws_endpoint
44
50
  @browser_ws_endpoint ||= begin
45
- api_token = ENV.fetch('BROWSERLESS_IO_API_TOKEN', '6R0W53R135510')
46
51
  ws_url = ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', 'ws://127.0.0.1:3000')
52
+ api_token = browserless_api_token(ws_url)
47
53
 
48
54
  "#{ws_url}?token=#{api_token}"
49
55
  end
50
56
  end
57
+
58
+ private
59
+
60
+ def validate_request!
61
+ ctx.budget.consume!
62
+ ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
63
+ end
64
+
65
+ def execute_browserless_request
66
+ connect_with_timeout_support do |browser|
67
+ PuppetCommander.new(ctx, browser).call
68
+ ensure
69
+ browser.disconnect
70
+ end
71
+ end
72
+
73
+ def protocol_timeout_ms
74
+ ctx.policy.total_timeout_seconds * 1000
75
+ end
76
+
77
+ def connect_with_timeout_support(&)
78
+ connect_browserless(protocol_timeout: protocol_timeout_ms, &)
79
+ rescue ArgumentError => error
80
+ raise unless unsupported_protocol_timeout?(error)
81
+
82
+ connect_browserless(&)
83
+ end
84
+
85
+ def unsupported_protocol_timeout?(error)
86
+ error.message.include?('unknown keyword: :protocol_timeout')
87
+ end
88
+
89
+ def connect_browserless(protocol_timeout: nil, &)
90
+ connected = false
91
+
92
+ Puppeteer.connect(**browserless_connect_options(protocol_timeout)) do |browser|
93
+ connected = true
94
+ yield browser
95
+ end
96
+ rescue ArgumentError => error
97
+ handle_connection_error(error, connected:, protocol_timeout:)
98
+ rescue StandardError => error
99
+ handle_connection_error(error, connected:)
100
+ end
101
+
102
+ def browserless_connect_options(protocol_timeout)
103
+ { browser_ws_endpoint:, protocol_timeout: }.compact
104
+ end
105
+
106
+ def handle_connection_error(error, connected:, protocol_timeout: nil)
107
+ raise if connected || compatibility_timeout_error?(error, protocol_timeout:)
108
+
109
+ raise BrowserlessConnectionFailed, browserless_connection_message(error), cause: error
110
+ end
111
+
112
+ def compatibility_timeout_error?(error, protocol_timeout:)
113
+ protocol_timeout && unsupported_protocol_timeout?(error)
114
+ end
115
+
116
+ def browserless_connection_message(error)
117
+ base = "Browserless connection failed (#{error.class}: #{error.message})."
118
+ endpoint_hint = "Check BROWSERLESS_IO_WEBSOCKET_URL (currently #{browserless_websocket_url})."
119
+ token_hint = 'Check BROWSERLESS_IO_API_TOKEN and ensure it matches your Browserless TOKEN.'
120
+ local_hint = 'For local Browserless, confirm the service is running and reachable.'
121
+
122
+ if likely_authentication_error?(error)
123
+ "#{base} #{token_hint} #{endpoint_hint}"
124
+ else
125
+ "#{base} #{endpoint_hint} #{token_hint} #{local_hint}"
126
+ end
127
+ end
128
+
129
+ def likely_authentication_error?(error)
130
+ message = error.message.downcase
131
+ message.include?('unauthorized') || message.include?('forbidden') || message.include?('401')
132
+ end
133
+
134
+ def browserless_websocket_url
135
+ ENV.fetch('BROWSERLESS_IO_WEBSOCKET_URL', 'ws://127.0.0.1:3000')
136
+ end
137
+
138
+ def browserless_api_token(ws_url)
139
+ ENV.fetch('BROWSERLESS_IO_API_TOKEN') do
140
+ return '6R0W53R135510' if ws_url == 'ws://127.0.0.1:3000'
141
+
142
+ raise BrowserlessConfigurationError,
143
+ 'BROWSERLESS_IO_API_TOKEN is required for custom Browserless endpoints. ' \
144
+ 'Set BROWSERLESS_IO_API_TOKEN or use ws://127.0.0.1:3000 for local defaults.'
145
+ end
146
+ end
51
147
  end
52
148
  end
53
149
  end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class RequestService
    ##
    # Counter of the outbound requests a single feed build may still perform.
    # Reads and updates of the counter are guarded by a mutex.
    class Budget
      ##
      # @param max_requests [Integer] the maximum number of requests allowed
      # @raise [ArgumentError] unless max_requests is a positive Integer
      def initialize(max_requests:)
        acceptable = max_requests.is_a?(Integer) && max_requests.positive?
        raise ArgumentError, 'max_requests must be positive' unless acceptable

        @mutex = Mutex.new
        @remaining = max_requests
      end

      ##
      # Takes one request out of the budget.
      #
      # @return [Integer] remaining request count after consumption
      # @raise [RequestBudgetExceeded] if no requests remain
      def consume!
        @mutex.synchronize do
          raise RequestBudgetExceeded, 'Request budget exhausted' if @remaining.zero?

          @remaining -= 1
        end
      end

      ##
      # @return [Integer] requests still available
      def remaining = @mutex.synchronize { @remaining }
    end
  end
end
@@ -1,45 +1,89 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'addressable/uri'
4
-
5
3
  module Html2rss
6
4
  class RequestService
7
5
  ##
8
6
  # Holds information needed to send requests to websites.
9
7
  # To be passed down to the RequestService's strategies.
10
8
  class Context
11
- SUPPORTED_URL_SCHEMES = %w[http https].to_set.freeze
12
-
13
9
  ##
14
- # @param url [String, Addressable::URI] the URL to request
10
+ # @param url [String, Html2rss::Url] the URL to request
15
11
  # @param headers [Hash] HTTP request headers
16
- def initialize(url:, headers: {})
17
- @url = Addressable::URI.parse(url)
18
- assert_valid_url!
19
-
12
+ # @param request [Hash] request specific options passed to strategies
13
+ # @param request_options [Hash] runtime request options
14
+ # @option request_options [Symbol] :relation why this request is being made
15
+ # @option request_options [String, Html2rss::Url, nil] :origin_url originating URL for same-origin checks
16
+ # @option request_options [Policy] :policy runtime request policy
17
+ # @option request_options [Budget] :budget shared request budget for the feed build
18
+ # @raise [ArgumentError] if policy or budget is explicitly nil
19
+ def initialize(url:, headers: {}, request: {}, **request_options)
20
+ @url = Html2rss::Url.from_absolute(url)
20
21
  @headers = headers
22
+ @request = request.freeze
23
+ assign_request_options(request_options)
21
24
  end
22
25
 
23
- # @return [Addressable::URI] the parsed URL
26
+ # @return [Html2rss::Url] the parsed and normalized URL
24
27
  attr_reader :url
25
28
 
26
29
  # @return [Hash] the HTTP request headers
27
30
  attr_reader :headers
28
31
 
29
- private
32
+ # @return [Hash] the request specific options
33
+ attr_reader :request
34
+
35
+ # @return [Hash] browserless specific options
36
+ def browserless = request.fetch(:browserless, {})
37
+
38
+ # @return [Hash, nil] preload options for browserless requests
39
+ def browserless_preload = browserless[:preload]
40
+
41
+ # @return [Symbol] the request relation
42
+ attr_reader :relation
43
+
44
+ # @return [Html2rss::Url] the initial URL for the feed build
45
+ attr_reader :origin_url
46
+
47
+ # @return [Policy] the runtime request policy
48
+ attr_reader :policy
49
+
50
+ # @return [Budget] the shared request budget
51
+ attr_reader :budget
30
52
 
31
53
  ##
32
- # Validates the URL.
33
- # @raise [InvalidUrl] if the URL is not valid
34
- # @raise [UnsupportedUrlScheme] if the URL scheme is not supported
35
- def assert_valid_url!
36
- raise InvalidUrl, 'URL must be absolute' unless url.absolute?
37
- raise InvalidUrl, 'URL must not contain an @ character' if url.to_s.include?('@')
54
+ # Builds a follow-up request context sharing headers, budget, and policy.
55
+ #
56
+ # @param url [String, Html2rss::Url] the follow-up URL
57
+ # @param relation [Symbol] why the follow-up is being made
58
+ # @param origin_url [String, Html2rss::Url] effective origin for same-origin checks
59
+ # @return [Context] derived request context
60
+ def follow_up(url:, relation:, origin_url: self.origin_url)
61
+ self.class.new(
62
+ url:,
63
+ headers:,
64
+ request:,
65
+ relation:,
66
+ origin_url:,
67
+ policy:,
68
+ budget:
69
+ )
70
+ end
38
71
 
39
- return if SUPPORTED_URL_SCHEMES.include?(url.scheme)
72
+ private
73
+
74
+ def assign_request_options(request_options)
75
+ @relation = request_options.fetch(:relation, :initial)
76
+ @policy = request_options.fetch(:policy, Policy.default)
77
+ raise ArgumentError, 'policy must not be nil' if @policy.nil?
78
+
79
+ @origin_url = normalized_origin_url(request_options[:origin_url])
80
+ @budget = request_options.fetch(:budget) { Budget.new(max_requests: policy.max_requests) }
81
+ raise ArgumentError, 'budget must not be nil' if @budget.nil?
82
+ end
40
83
 
41
- raise UnsupportedUrlScheme,
42
- "URL scheme '#{url.scheme}' is not supported"
84
+ def normalized_origin_url(origin_url)
85
+ source = origin_url || @url
86
+ Html2rss::Url.from_absolute(source)
43
87
  end
44
88
  end
45
89
  end
@@ -2,6 +2,7 @@
2
2
 
3
3
  require 'faraday'
4
4
  require 'faraday/follow_redirects'
5
+ require 'faraday/gzip'
5
6
 
6
7
  module Html2rss
7
8
  class RequestService
@@ -9,15 +10,144 @@ module Html2rss
9
10
  # Strategy to use Faraday for the request.
10
11
  # @see https://rubygems.org/gems/faraday
11
12
  class FaradayStrategy < Strategy
12
- # return [Response]
13
+ ##
14
+ # Restores buffered streamed bytes so response middleware can process them.
15
+ class StreamingBodyMiddleware < Faraday::Middleware
16
+ STREAM_BUFFER_KEY = :html2rss_stream_buffer
17
+
18
+ def on_complete(env)
19
+ buffer = env.request.context&.delete(STREAM_BUFFER_KEY)
20
+ return if buffer.nil? || buffer.empty?
21
+
22
+ env.body = buffer
23
+ end
24
+ end
25
+
26
+ ##
27
+ # NOTE: Unlike BrowserlessStrategy, Faraday does not expose the remote IP after connect.
28
+ # SSRF protection here is pre-connection only (DNS resolution via Policy).
29
+ # A DNS rebinding attack between resolution and connect cannot be caught at this layer.
30
+ #
31
+ # Executes a request with runtime policy enforcement.
32
+ #
33
+ # @return [Response] normalized request response
13
34
  def execute
14
- request = Faraday.new(url: ctx.url, headers: ctx.headers) do |faraday|
15
- faraday.use Faraday::FollowRedirects::Middleware
35
+ deadline = request_deadline
36
+ response_guard, response = perform_request(deadline:)
37
+ response_guard.inspect_body!(response.body)
38
+ build_response(response)
39
+ rescue Faraday::TimeoutError, Timeout::Error => error
40
+ raise RequestTimedOut, error.message
41
+ end
42
+
43
+ private
44
+
45
+ def request_deadline
46
+ monotonic_now + ctx.policy.total_timeout_seconds
47
+ end
48
+
49
+ def perform_request(deadline:)
50
+ response_guard = ResponseGuard.new(policy: ctx.policy)
51
+ response = faraday_request(response_guard, deadline:, streaming_buffer: true)
52
+ response = retry_without_streaming(response_guard, deadline:) if retry_without_streaming?(response)
53
+ [response_guard, response]
54
+ end
55
+
56
+ def build_response(response)
57
+ Response.new(body: response.body, headers: response.headers, url: response_url(response),
58
+ status: response.status)
59
+ end
60
+
61
+ def validate_request!(consume_budget: true)
62
+ ctx.budget.consume! if consume_budget
63
+ ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
64
+ end
65
+
66
+ def faraday_request(response_guard, deadline:, streaming_buffer:, consume_budget: true)
67
+ validate_request!(consume_budget:)
68
+
69
+ client.get do |req|
70
+ apply_timeouts(req, deadline:)
71
+ buffer = prepare_stream_buffer(req) if streaming_buffer
72
+ req.options.on_data = on_data_callback(response_guard, buffer)
73
+ end
74
+ end
75
+
76
+ def retry_without_streaming(response_guard, deadline:)
77
+ faraday_request(response_guard, deadline:, streaming_buffer: false, consume_budget: false)
78
+ end
79
+
80
+ def client
81
+ @client ||= Faraday.new(url: ctx.url.to_s, headers: ctx.headers) do |faraday|
82
+ faraday.use Faraday::FollowRedirects::Middleware, limit: ctx.policy.max_redirects, callback: redirect_callback
83
+ faraday.request :gzip
84
+ faraday.use StreamingBodyMiddleware
16
85
  faraday.adapter Faraday.default_adapter
17
86
  end
18
- response = request.get
87
+ end
88
+
89
+ def apply_timeouts(request, deadline:)
90
+ remaining_timeout = remaining_timeout_seconds(deadline)
91
+ request.options.timeout = remaining_timeout
92
+ request.options.open_timeout = [ctx.policy.connect_timeout_seconds, remaining_timeout].min
93
+ request.options.read_timeout = [ctx.policy.read_timeout_seconds, remaining_timeout].min
94
+ end
95
+
96
+ def prepare_stream_buffer(request)
97
+ request.options.context ||= {}
98
+ request.options.context[StreamingBodyMiddleware::STREAM_BUFFER_KEY] = +''
99
+ end
100
+
101
+ def on_data_callback(response_guard, buffer)
102
+ proc do |chunk, total_bytes, env|
103
+ response_guard.inspect_chunk!(total_bytes:, headers: env&.response_headers)
104
+ buffer&.<< chunk
105
+ end
106
+ end
107
+
108
+ def remaining_timeout_seconds(deadline)
109
+ remaining = deadline - monotonic_now
110
+ raise RequestTimedOut, 'Request timed out' if remaining <= 0
111
+
112
+ remaining
113
+ end
114
+
115
+ def retry_without_streaming?(response)
116
+ return false if response.body.to_s.empty? == false
117
+ return false unless response_success?(response)
118
+
119
+ final_url = response.env&.url
120
+ return false unless final_url
121
+
122
+ final_url.to_s != ctx.url.to_s
123
+ end
124
+
125
+ def response_success?(response)
126
+ return true if response.status.nil?
127
+
128
+ response.status >= 200 && response.status < 300
129
+ end
130
+
131
+ def response_url(response)
132
+ return ctx.url unless (url = response.env&.url)
133
+
134
+ Html2rss::Url.from_absolute(url.to_s)
135
+ end
136
+
137
+ def redirect_callback
138
+ lambda do |old_env, new_env|
139
+ from_url = normalize_url(old_env[:url])
140
+ to_url = normalize_url(new_env[:url])
141
+ ctx.policy.validate_redirect!(from_url:, to_url:, origin_url: ctx.origin_url, relation: ctx.relation)
142
+ end
143
+ end
144
+
145
+ def normalize_url(url)
146
+ Html2rss::Url.from_absolute(url.to_s)
147
+ end
19
148
 
20
- Response.new(body: response.body, headers: response.headers)
149
+ def monotonic_now
150
+ Process.clock_gettime(Process::CLOCK_MONOTONIC)
21
151
  end
22
152
  end
23
153
  end