html2rss 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/README.md +60 -1
  3. data/lib/html2rss/articles/deduplicator.rb +1 -0
  4. data/lib/html2rss/auto_source/cleanup.rb +11 -0
  5. data/lib/html2rss/auto_source/scraper/html.rb +5 -0
  6. data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
  7. data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
  8. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
  9. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
  10. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
  11. data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
  12. data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
  13. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
  14. data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
  15. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
  16. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
  17. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
  18. data/lib/html2rss/auto_source/scraper.rb +19 -1
  19. data/lib/html2rss/auto_source.rb +4 -0
  20. data/lib/html2rss/blocked_surface.rb +1 -0
  21. data/lib/html2rss/category_extractor.rb +2 -2
  22. data/lib/html2rss/cli.rb +30 -6
  23. data/lib/html2rss/config/class_methods.rb +24 -35
  24. data/lib/html2rss/config/dynamic_params.rb +6 -4
  25. data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
  26. data/lib/html2rss/config/request_headers.rb +9 -3
  27. data/lib/html2rss/config/schema.rb +33 -1
  28. data/lib/html2rss/config/validator.rb +40 -2
  29. data/lib/html2rss/config.rb +19 -13
  30. data/lib/html2rss/error.rb +25 -0
  31. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  32. data/lib/html2rss/feed_pipeline.rb +127 -0
  33. data/lib/html2rss/hash_util.rb +101 -0
  34. data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
  35. data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
  36. data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
  37. data/lib/html2rss/html_extractor.rb +5 -0
  38. data/lib/html2rss/html_navigator.rb +8 -0
  39. data/lib/html2rss/json_feed_builder.rb +1 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +8 -3
  41. data/lib/html2rss/rendering/description_builder.rb +0 -1
  42. data/lib/html2rss/rendering/image_renderer.rb +17 -7
  43. data/lib/html2rss/rendering/media_renderer.rb +4 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
  45. data/lib/html2rss/rendering/video_renderer.rb +8 -3
  46. data/lib/html2rss/rendering.rb +11 -2
  47. data/lib/html2rss/request_controls.rb +16 -21
  48. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  49. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  50. data/lib/html2rss/request_service/context.rb +14 -2
  51. data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
  52. data/lib/html2rss/request_service/policy.rb +4 -0
  53. data/lib/html2rss/request_service/response.rb +9 -1
  54. data/lib/html2rss/request_service.rb +19 -0
  55. data/lib/html2rss/request_session/runtime_input.rb +16 -2
  56. data/lib/html2rss/request_session/runtime_policy.rb +7 -0
  57. data/lib/html2rss/request_session.rb +13 -9
  58. data/lib/html2rss/rss_builder/article.rb +22 -1
  59. data/lib/html2rss/rss_builder/channel.rb +11 -2
  60. data/lib/html2rss/rss_builder/enclosure.rb +15 -1
  61. data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
  62. data/lib/html2rss/rss_builder.rb +4 -0
  63. data/lib/html2rss/selectors/config.rb +1 -0
  64. data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
  65. data/lib/html2rss/selectors/extractors/href.rb +2 -0
  66. data/lib/html2rss/selectors/extractors/html.rb +1 -0
  67. data/lib/html2rss/selectors/extractors/static.rb +2 -1
  68. data/lib/html2rss/selectors/extractors/text.rb +1 -0
  69. data/lib/html2rss/selectors/extractors.rb +2 -1
  70. data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
  71. data/lib/html2rss/selectors/post_processors/base.rb +13 -7
  72. data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
  73. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
  74. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
  75. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
  76. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
  77. data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
  78. data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
  79. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
  80. data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
  81. data/lib/html2rss/selectors/post_processors/template.rb +3 -0
  82. data/lib/html2rss/selectors/post_processors.rb +5 -0
  83. data/lib/html2rss/selectors.rb +7 -0
  84. data/lib/html2rss/url.rb +27 -23
  85. data/lib/html2rss/version.rb +2 -1
  86. data/lib/html2rss.rb +15 -78
  87. data/schema/html2rss-config.schema.json +83 -1
  88. metadata +7 -2
@@ -0,0 +1,161 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ ##
6
+ # Main html2rss namespace.
7
+ module Html2rss
8
+ ##
9
+ # Request transport orchestration and strategies.
10
+ class RequestService
11
+ ##
12
+ # Maps html2rss request/response handling to the botasaurus-scrape-api contract.
13
+ class BotasaurusContract
14
+ # Default Botasaurus scrape options when no explicit config is provided.
15
+ DEFAULT_OPTIONS = {
16
+ navigation_mode: 'auto',
17
+ max_retries: 2,
18
+ headless: false
19
+ }.freeze
20
+
21
+ # Allowlisted request.botasaurus keys forwarded to upstream.
22
+ OPTION_KEYS = %i[
23
+ navigation_mode
24
+ max_retries
25
+ wait_for_selector
26
+ wait_timeout_seconds
27
+ block_images
28
+ block_images_and_css
29
+ wait_for_complete_page_load
30
+ headless
31
+ proxy
32
+ user_agent
33
+ window_size
34
+ lang
35
+ ].freeze
36
+
37
+ # Parsed Botasaurus response wrapper.
38
+ class ParsedResponse
39
+ # Fallback headers when upstream omits response headers.
40
+ DEFAULT_HEADERS = { 'content-type' => 'text/html' }.freeze
41
+
42
+ # @param payload [Hash{String => Object}] parsed Botasaurus response payload
43
+ # @param transport_status [Integer] HTTP status returned by Botasaurus
44
+ def initialize(payload:, transport_status:)
45
+ @payload = payload
46
+ @transport_status = transport_status
47
+ end
48
+
49
+ # @return [Boolean] true when upstream classified request as challenge blocked
50
+ def challenge_block? = error_category == 'challenge_block'
51
+
52
+ # @return [Boolean] true when upstream returned non-200 or an error payload
53
+ def upstream_failure?
54
+ status != 200 || error_message?
55
+ end
56
+
57
+ # @return [String] normalized challenge error message
58
+ def challenge_message
59
+ error || 'Botasaurus challenge block detected.'
60
+ end
61
+
62
+ # @return [String] actionable upstream failure summary
63
+ def upstream_failure_message
64
+ details = ["status=#{status}"]
65
+ details << "error_category=#{error_category}" if error_category
66
+ details << "error=#{error}" if error
67
+ details << "request_id=#{request_id}" if request_id
68
+ "Botasaurus scrape failed (#{details.join(', ')})."
69
+ end
70
+
71
+ # @return [String] rendered HTML body from Botasaurus
72
+ # @raise [BotasaurusConnectionFailed] when html is missing
73
+ def html
74
+ value = payload['html']
75
+ raise BotasaurusConnectionFailed, "Botasaurus response missing required 'html' field" if value.nil?
76
+
77
+ value.to_s
78
+ end
79
+
80
+ # @return [Hash{String => String}] normalized response headers
81
+ def headers
82
+ raw_headers = payload['headers']
83
+ return DEFAULT_HEADERS.dup unless raw_headers.is_a?(Hash) && raw_headers.any?
84
+
85
+ raw_headers.to_h { |key, value| [key.to_s, value.to_s] }
86
+ end
87
+
88
+ # @return [Integer] resolved status code (payload status_code or transport status)
89
+ def status
90
+ status_code = payload['status_code']
91
+ status_code.is_a?(Integer) ? status_code : transport_status
92
+ end
93
+
94
+ # @return [String, nil] final URL reported by upstream
95
+ def final_url = payload['final_url']
96
+
97
+ private
98
+
99
+ attr_reader :payload, :transport_status
100
+
101
+ def error = payload['error']
102
+
103
+ def request_id = payload['request_id']
104
+
105
+ def error_category = payload['error_category']
106
+
107
+ def error_message?
108
+ value = error
109
+ value.is_a?(String) ? !value.empty? : !value.nil?
110
+ end
111
+ end
112
+
113
+ ##
114
+ # @param url [Html2rss::Url] canonical URL to scrape
115
+ # @param options [Hash] validated request.botasaurus options
116
+ # @option options [String] :navigation_mode
117
+ # @option options [Integer] :max_retries
118
+ # @option options [String] :wait_for_selector
119
+ # @option options [Integer] :wait_timeout_seconds
120
+ # @option options [Boolean] :block_images
121
+ # @option options [Boolean] :block_images_and_css
122
+ # @option options [Boolean] :wait_for_complete_page_load
123
+ # @option options [Boolean] :headless
124
+ # @option options [String] :proxy
125
+ # @option options [String] :user_agent
126
+ # @option options [Array<Integer>] :window_size
127
+ # @option options [String] :lang
128
+ def initialize(url:, options: {})
129
+ @url = url
130
+ @options = options
131
+ end
132
+
133
+ # @return [Hash] payload for POST /scrape
134
+ def request_payload
135
+ DEFAULT_OPTIONS.merge(filtered_options).merge(url: url.to_s)
136
+ end
137
+
138
+ # @param transport_response [Faraday::Response] upstream HTTP response
139
+ # @return [ParsedResponse]
140
+ # @raise [BotasaurusConnectionFailed] when payload is not valid JSON object
141
+ def parse_response(transport_response)
142
+ payload = JSON.parse(transport_response.body.to_s)
143
+ raise BotasaurusConnectionFailed, 'Botasaurus response must be a JSON object' unless payload.is_a?(Hash)
144
+
145
+ ParsedResponse.new(payload:, transport_status: transport_response.status)
146
+ rescue JSON::ParserError => error
147
+ raise BotasaurusConnectionFailed, "Botasaurus response JSON parse failed: #{error.message}"
148
+ end
149
+
150
+ private
151
+
152
+ attr_reader :url, :options
153
+
154
+ def filtered_options
155
+ OPTION_KEYS.each_with_object({}) do |key, normalized|
156
+ normalized[key] = options[key] if options.key?(key)
157
+ end
158
+ end
159
+ end
160
+ end
161
+ end
@@ -0,0 +1,98 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'faraday'
4
+ require 'json'
5
+
6
+ module Html2rss
7
+ class RequestService
8
+ ##
9
+ # Strategy to delegate fetching to a Botasaurus scrape API.
10
+ class BotasaurusStrategy < Strategy
11
+ ##
12
+ # Executes a Botasaurus-backed request with shared request policy guards.
13
+ #
14
+ # @return [Response] normalized request response
15
+ # @raise [BotasaurusConfigurationError] when BOTASAURUS_SCRAPER_URL is missing or invalid
16
+ # @raise [BotasaurusConnectionFailed] when Botasaurus cannot be reached or returns an invalid payload
17
+ # @raise [RequestTimedOut] when the Botasaurus request exceeds configured timeout
18
+ def execute
19
+ validate_request!
20
+ transport_response = client.post('/scrape', JSON.generate(contract.request_payload), content_type_header)
21
+ parsed_response = contract.parse_response(transport_response)
22
+ raise_if_challenge_blocked!(parsed_response)
23
+ raise_if_upstream_failed!(parsed_response)
24
+ build_response(parsed_response)
25
+ rescue Faraday::TimeoutError, Timeout::Error => error
26
+ raise RequestTimedOut, error.message
27
+ rescue Faraday::ConnectionFailed, Faraday::SSLError => error
28
+ raise BotasaurusConnectionFailed, "Botasaurus connection failed: #{error.message}"
29
+ end
30
+
31
+ private
32
+
33
+ def validate_request!
34
+ ctx.budget.consume!
35
+ ctx.policy.validate_request!(url: ctx.url, origin_url: ctx.origin_url, relation: ctx.relation)
36
+ end
37
+
38
+ def build_response(parsed_response)
39
+ body = parsed_response.html
40
+ ResponseGuard.new(policy: ctx.policy).inspect_body!(body)
41
+
42
+ Response.new(
43
+ body:,
44
+ headers: parsed_response.headers,
45
+ url: response_url(parsed_response.final_url),
46
+ status: parsed_response.status
47
+ )
48
+ end
49
+
50
+ def raise_if_challenge_blocked!(parsed_response)
51
+ return unless parsed_response.challenge_block?
52
+
53
+ raise BlockedSurfaceDetected, "Blocked surface detected: #{parsed_response.challenge_message}"
54
+ end
55
+
56
+ def raise_if_upstream_failed!(parsed_response)
57
+ return unless parsed_response.upstream_failure?
58
+
59
+ raise BotasaurusConnectionFailed, parsed_response.upstream_failure_message
60
+ end
61
+
62
+ def response_url(final_url)
63
+ return ctx.url if final_url.nil?
64
+
65
+ Html2rss::Url.from_absolute(final_url)
66
+ rescue ArgumentError
67
+ ctx.url
68
+ end
69
+
70
+ def contract
71
+ @contract ||= BotasaurusContract.new(url: ctx.url, options: ctx.request.fetch(:botasaurus, {}))
72
+ end
73
+
74
+ def client
75
+ @client ||= Faraday.new(url: scraper_base_url.to_s, request: request_options)
76
+ end
77
+
78
+ def request_options
79
+ { timeout: ctx.policy.total_timeout_seconds }
80
+ end
81
+
82
+ def content_type_header
83
+ { 'Content-Type' => 'application/json' }
84
+ end
85
+
86
+ def scraper_base_url
87
+ @scraper_base_url ||= begin
88
+ configured = ENV.fetch('BOTASAURUS_SCRAPER_URL') do
89
+ raise BotasaurusConfigurationError, 'BOTASAURUS_SCRAPER_URL is required for strategy=botasaurus.'
90
+ end
91
+ Html2rss::Url.for_channel(configured)
92
+ rescue ArgumentError => error
93
+ raise BotasaurusConfigurationError, "BOTASAURUS_SCRAPER_URL is invalid: #{error.message}"
94
+ end
95
+ end
96
+ end
97
+ end
98
+ end
@@ -18,8 +18,8 @@ module Html2rss
18
18
  # @raise [ArgumentError] if policy or budget is explicitly nil
19
19
  def initialize(url:, headers: {}, request: {}, **request_options)
20
20
  @url = Html2rss::Url.from_absolute(url)
21
- @headers = headers
22
- @request = request.freeze
21
+ @headers = normalize_headers(headers).freeze
22
+ @request = normalize_request(request).freeze
23
23
  assign_request_options(request_options)
24
24
  end
25
25
 
@@ -85,6 +85,18 @@ module Html2rss
85
85
  source = origin_url || @url
86
86
  Html2rss::Url.from_absolute(source)
87
87
  end
88
+
89
+ def normalize_headers(headers)
90
+ headers.to_h do |key, value|
91
+ [key.to_s, value]
92
+ end
93
+ end
94
+
95
+ def normalize_request(request)
96
+ normalized = HashUtil.deep_symbolize_keys(request, context: 'request')
97
+ HashUtil.assert_symbol_keys!(normalized, context: 'request')
98
+ normalized
99
+ end
88
100
  end
89
101
  end
90
102
  end
@@ -13,8 +13,11 @@ module Html2rss
13
13
  ##
14
14
  # Restores buffered streamed bytes so response middleware can process them.
15
15
  class StreamingBodyMiddleware < Faraday::Middleware
16
+ # Request-context key used to store streamed chunks before middleware completion.
16
17
  STREAM_BUFFER_KEY = :html2rss_stream_buffer
17
18
 
19
+ # @param env [Faraday::Env] completed response environment
20
+ # @return [void]
18
21
  def on_complete(env)
19
22
  buffer = env.request.context&.delete(STREAM_BUFFER_KEY)
20
23
  return if buffer.nil? || buffer.empty?
@@ -24,13 +27,12 @@ module Html2rss
24
27
  end
25
28
 
26
29
  ##
27
- # NOTE: Unlike BrowserlessStrategy, Faraday does not expose the remote IP after connect.
28
- # SSRF protection here is pre-connection only (DNS resolution via Policy).
29
- # A DNS rebinding attack between resolution and connect cannot be caught at this layer.
30
- #
31
30
  # Executes a request with runtime policy enforcement.
32
31
  #
33
32
  # @return [Response] normalized request response
33
+ # @note Unlike BrowserlessStrategy, Faraday does not expose the remote IP after connect.
34
+ # SSRF protection here is pre-connection only (DNS resolution via Policy).
35
+ # A DNS rebinding attack between resolution and connect cannot be caught at this layer.
34
36
  def execute
35
37
  deadline = request_deadline
36
38
  response_guard, response = perform_request(deadline:)
@@ -10,7 +10,9 @@ module Html2rss
10
10
  # Describes the runtime request envelope for a single feed build.
11
11
  class Policy # rubocop:disable Metrics/ClassLength
12
12
  MAX_REQUESTS_CEILING = 10
13
+ # Hostnames treated as local/private surfaces.
13
14
  LOCAL_HOSTS = %w[localhost localhost.localdomain metadata.google.internal].to_set.freeze
15
+ # IP ranges blocked when private networks are disabled.
14
16
  BLOCKED_IP_RANGES = [
15
17
  IPAddr.new('0.0.0.0/8'),
16
18
  IPAddr.new('10.0.0.0/8'),
@@ -26,6 +28,7 @@ module Html2rss
26
28
  IPAddr.new('ff00::/8')
27
29
  ].freeze
28
30
 
31
+ # Default policy values used when request controls are not explicitly set.
29
32
  DEFAULTS = {
30
33
  connect_timeout_seconds: 5,
31
34
  read_timeout_seconds: 10,
@@ -243,6 +246,7 @@ module Html2rss
243
246
  end
244
247
  end
245
248
 
249
+ # Shared immutable policy instance used for default request execution.
246
250
  Policy::DEFAULT_POLICY = Policy.new
247
251
  end
248
252
  end
@@ -17,6 +17,7 @@ module Html2rss
17
17
 
18
18
  headers = headers.dup
19
19
  headers.transform_keys!(&:to_s)
20
+ HashUtil.assert_string_keys!(headers, context: 'response headers', deep: false)
20
21
 
21
22
  @headers = headers
22
23
  @status = status
@@ -26,7 +27,7 @@ module Html2rss
26
27
  # @return [String] the raw body of the response
27
28
  attr_reader :body
28
29
 
29
- # @return [Hash<String, Object>] the headers of the response
30
+ # @return [Hash{String => Object}] the headers of the response
30
31
  attr_reader :headers
31
32
 
32
33
  # @return [Integer, nil] the HTTP status code when known
@@ -35,8 +36,13 @@ module Html2rss
35
36
  # @return [Html2rss::Url] the URL of the response
36
37
  attr_reader :url
37
38
 
39
+ # @return [String] normalized content type header value
38
40
  def content_type = header('content-type').to_s
41
+
42
+ # @return [Boolean] whether response content is JSON
39
43
  def json_response? = content_type.include?('application/json')
44
+
45
+ # @return [Boolean] whether response content is HTML
40
46
  def html_response? = content_type.include?('text/html')
41
47
 
42
48
  ##
@@ -57,6 +63,8 @@ module Html2rss
57
63
 
58
64
  private
59
65
 
66
+ # @param name [String] canonical header name
67
+ # @return [Object, nil] header value when present
60
68
  def header(name)
61
69
  headers.fetch(name) do
62
70
  headers.find { |key, _value| key.casecmp?(name) }&.last
@@ -10,18 +10,34 @@ module Html2rss
10
10
  class RequestService
11
11
  include Singleton
12
12
 
13
+ # Raised when an unknown request strategy is requested.
13
14
  class UnknownStrategy < Html2rss::Error; end
15
+ # Raised when a URL cannot be parsed or validated.
14
16
  class InvalidUrl < Html2rss::Error; end
17
+ # Raised when a URL uses an unsupported scheme.
15
18
  class UnsupportedUrlScheme < Html2rss::Error; end
19
+ # Raised when a response type cannot be parsed.
16
20
  class UnsupportedResponseContentType < Html2rss::Error; end
21
+ # Raised when request limits are exceeded.
17
22
  class RequestBudgetExceeded < Html2rss::Error; end
23
+ # Raised when policy denies private-network access.
18
24
  class PrivateNetworkDenied < Html2rss::Error; end
25
+ # Raised when cross-origin follow-up requests are denied.
19
26
  class CrossOriginFollowUpDenied < Html2rss::Error; end
27
+ # Raised when a response exceeds configured size limits.
20
28
  class ResponseTooLarge < Html2rss::Error; end
29
+ # Raised when blocked content surfaces are detected.
21
30
  class BlockedSurfaceDetected < Html2rss::Error; end
31
+ # Raised when a request times out.
22
32
  class RequestTimedOut < Html2rss::Error; end
33
+ # Raised when Browserless configuration is missing or invalid.
23
34
  class BrowserlessConfigurationError < Html2rss::Error; end
35
+ # Raised when Browserless cannot be reached.
24
36
  class BrowserlessConnectionFailed < Html2rss::Error; end
37
+ # Raised when Botasaurus configuration is missing or invalid.
38
+ class BotasaurusConfigurationError < Html2rss::Error; end
39
+ # Raised when Botasaurus cannot be reached or returns invalid payloads.
40
+ class BotasaurusConnectionFailed < Html2rss::Error; end
25
41
 
26
42
  class << self
27
43
  extend Forwardable
@@ -40,6 +56,7 @@ module Html2rss
40
56
  def initialize
41
57
  @strategies = {
42
58
  faraday: FaradayStrategy,
59
+ botasaurus: BotasaurusStrategy,
43
60
  browserless: BrowserlessStrategy
44
61
  }
45
62
  @default_strategy_name = :faraday
@@ -51,6 +68,7 @@ module Html2rss
51
68
  ##
52
69
  # Sets the default strategy.
53
70
  # @param strategy [Symbol] the name of the strategy
71
+ # @return [Symbol] the selected default strategy name
54
72
  # @raise [UnknownStrategy] if the strategy is not registered
55
73
  def default_strategy_name=(strategy)
56
74
  raise UnknownStrategy unless strategy_registered?(strategy)
@@ -65,6 +83,7 @@ module Html2rss
65
83
  # Registers a new strategy.
66
84
  # @param name [Symbol] the name of the strategy
67
85
  # @param strategy_class [Class] the class implementing the strategy
86
+ # @return [Class] the registered strategy class
68
87
  # @raise [ArgumentError] if strategy_class is not a Class
69
88
  def register_strategy(name, strategy_class)
70
89
  unless strategy_class.is_a?(Class)
@@ -26,8 +26,8 @@ module Html2rss
26
26
  # @param request_policy [RequestService::Policy] request policy for the session
27
27
  def initialize(url:, headers:, request:, strategy:, request_policy:)
28
28
  @url = Html2rss::Url.from_absolute(url)
29
- @headers = headers.freeze
30
- @request = request.freeze
29
+ @headers = normalize_headers(headers).freeze
30
+ @request = normalize_request(request).freeze
31
31
  @strategy = strategy
32
32
  @request_policy = request_policy
33
33
  freeze
@@ -52,6 +52,20 @@ module Html2rss
52
52
  ##
53
53
  # @return [RequestService::Policy] policy derived from the runtime request inputs
54
54
  attr_reader :request_policy
55
+
56
+ private
57
+
58
+ def normalize_headers(headers)
59
+ headers.to_h do |key, value|
60
+ [key.to_s, value]
61
+ end
62
+ end
63
+
64
+ def normalize_request(request)
65
+ normalized = HashUtil.deep_symbolize_keys(request, context: 'request')
66
+ HashUtil.assert_symbol_keys!(normalized, context: 'request')
67
+ normalized
68
+ end
55
69
  end
56
70
  end
57
71
  end
@@ -29,9 +29,16 @@ module Html2rss
29
29
  def baseline_request_budget_for(config)
30
30
  1 + pagination_follow_up_budget_for(config) +
31
31
  known_auto_source_follow_up_budget_for(config) +
32
+ auto_strategy_fallback_budget_for(config) +
32
33
  browserless_preload_budget_for(config)
33
34
  end
34
35
 
36
+ def auto_strategy_fallback_budget_for(config)
37
+ return 0 unless config.strategy == :auto
38
+
39
+ [FeedPipeline::AutoFallback::CHAIN.size - 1, 0].max
40
+ end
41
+
35
42
  def pagination_follow_up_budget_for(config)
36
43
  [config.selectors&.dig(:items, :pagination, :max_pages).to_i - 1, 0].max
37
44
  end
@@ -9,16 +9,20 @@ module Html2rss
9
9
  # Builds a request session from translated runtime request inputs.
10
10
  #
11
11
  # @param runtime_input [RuntimeInput] translated runtime request inputs
12
+ # @param budget [RequestService::Budget, nil] optional shared budget for multi-attempt runs
12
13
  # @param logger [Logger] logger used for operational warnings
13
14
  # @return [RequestSession] configured request session
14
- def from_runtime_input(runtime_input, logger: Html2rss::Log)
15
+ def from_runtime_input(runtime_input, budget: nil, logger: Html2rss::Log) # rubocop:disable Metrics/MethodLength
16
+ context_options = {
17
+ url: runtime_input.url,
18
+ headers: runtime_input.headers,
19
+ request: runtime_input.request,
20
+ policy: runtime_input.request_policy
21
+ }
22
+ context_options[:budget] = budget unless budget.nil?
23
+
15
24
  new(
16
- context: RequestService::Context.new(
17
- url: runtime_input.url,
18
- headers: runtime_input.headers,
19
- request: runtime_input.request,
20
- policy: runtime_input.request_policy
21
- ),
25
+ context: RequestService::Context.new(**context_options),
22
26
  strategy: runtime_input.strategy,
23
27
  logger:
24
28
  )
@@ -81,7 +85,7 @@ module Html2rss
81
85
  end
82
86
 
83
87
  ##
84
- # @param url [String, Html2rss::Url] url to query
88
+ # @param url [String, Html2rss::Url] follow-up target URL for the request
85
89
  # @return [Boolean] whether the url was already visited in this session
86
90
  def visited?(url)
87
91
  visited_urls.include?(normalize_url(url))
@@ -90,7 +94,7 @@ module Html2rss
90
94
  ##
91
95
  # Records a visited url in the session.
92
96
  #
93
- # @param url [String, Html2rss::Url] url to track
97
+ # @param url [String, Html2rss::Url] URL used to update relation tracking state
94
98
  # @return [Set<Html2rss::Url>] visited urls
95
99
  def remember!(url)
96
100
  visited_urls.add(normalize_url(url))
@@ -13,10 +13,23 @@ module Html2rss
13
13
  include Enumerable
14
14
  include Comparable
15
15
 
16
+ # Allowed article attributes accepted by the value object constructor.
16
17
  PROVIDED_KEYS = %i[id title description url image author guid published_at enclosures categories scraper].freeze
18
+ # Separator used to build deterministic deduplication fingerprints.
17
19
  DEDUP_FINGERPRINT_SEPARATOR = '#!/'
18
20
 
19
- # @param options [Hash<Symbol, String>]
21
+ # @param options [Hash{Symbol => String}]
22
+ # @option options [String] :id stable article identifier
23
+ # @option options [String] :title article title
24
+ # @option options [String] :description article description/content
25
+ # @option options [String, Html2rss::Url] :url canonical article URL
26
+ # @option options [String, Html2rss::Url] :image image URL for fallback enclosure rendering
27
+ # @option options [String] :author author name
28
+ # @option options [String] :guid explicit GUID override
29
+ # @option options [String, Time, DateTime] :published_at publication timestamp
30
+ # @option options [Array<Hash{Symbol => Object}>] :enclosures enclosure attribute hashes
31
+ # @option options [Array<String>] :categories category labels
32
+ # @option options [Class] :scraper scraper class that produced the article
20
33
  def initialize(**options)
21
34
  @to_h = {}
22
35
  options.each_pair { |key, value| @to_h[key] = value.freeze if value }
@@ -41,10 +54,13 @@ module Html2rss
41
54
  PROVIDED_KEYS.each { |key| yield(key, public_send(key)) }
42
55
  end
43
56
 
57
+ # @return [String, nil] stable article identifier
44
58
  def id = blank_string_to_nil(@to_h[:id])
45
59
 
60
+ # @return [String, nil] article title
46
61
  def title = blank_string_to_nil(@to_h[:title])
47
62
 
63
+ # @return [String] rendered article description
48
64
  def description
49
65
  @description ||= Rendering::DescriptionBuilder.new(
50
66
  base: @to_h[:description],
@@ -82,6 +98,7 @@ module Html2rss
82
98
  dedup_from_url || dedup_from_id || dedup_from_guid || hash
83
99
  end
84
100
 
101
+ # @return [Array<Html2rss::RssBuilder::Enclosure>] normalized enclosure objects
85
102
  def enclosures
86
103
  @enclosures ||= Array(@to_h[:enclosures])
87
104
  .map { |enclosure| Html2rss::RssBuilder::Enclosure.new(**enclosure) }
@@ -101,6 +118,7 @@ module Html2rss
101
118
  end
102
119
  end
103
120
 
121
+ # @return [Array<String>] normalized, unique category names
104
122
  def categories
105
123
  @categories ||= @to_h[:categories].dup.to_a.tap do |categories|
106
124
  categories.map! { |category| category.to_s.strip }
@@ -119,10 +137,13 @@ module Html2rss
119
137
  nil
120
138
  end
121
139
 
140
+ # @return [Class, nil] scraper class that produced this article
122
141
  def scraper
123
142
  @to_h[:scraper]
124
143
  end
125
144
 
145
+ # @param other [Object] value compared against this article
146
+ # @return [Integer, nil] comparison result for compatible Article values
126
147
  def <=>(other)
127
148
  return nil unless other.is_a?(Article)
128
149
 
@@ -7,24 +7,28 @@ module Html2rss
7
7
  # 1. the HTML document's <head>.
8
8
  # 2. the HTTP response
9
9
  class Channel
10
+ # Fallback RSS ttl (in minutes) when no cache directives are present.
10
11
  DEFAULT_TTL_IN_MINUTES = 360
12
+ # Description template used when no explicit or discovered description exists.
11
13
  DEFAULT_DESCRIPTION_TEMPLATE = 'Latest items from %<url>s'
12
14
 
13
15
  ##
14
- #
15
16
  # @param response [Html2rss::RequestService::Response]
16
- # @param overrides [Hash<Symbol, String>] - Optional, overrides for any channel attribute
17
+ # @param overrides [Hash{Symbol => String}] optional overrides for channel attributes
17
18
  def initialize(response, overrides: {})
18
19
  @response = response
19
20
  @overrides = overrides
20
21
  end
21
22
 
23
+ # @return [String] channel title derived from overrides, document title, or URL
22
24
  def title
23
25
  @title ||= fetch_title
24
26
  end
25
27
 
28
+ # @return [Html2rss::Url] canonical channel URL
26
29
  def url = @url ||= Html2rss::Url.from_absolute(@response.url)
27
30
 
31
+ # @return [String] channel description text
28
32
  def description
29
33
  return overrides[:description] unless overrides[:description].to_s.empty?
30
34
 
@@ -35,6 +39,7 @@ module Html2rss
35
39
  description
36
40
  end
37
41
 
42
+ # @return [Integer] cache time-to-live in minutes
38
43
  def ttl
39
44
  return overrides[:ttl] if overrides[:ttl]
40
45
 
@@ -45,6 +50,7 @@ module Html2rss
45
50
  DEFAULT_TTL_IN_MINUTES
46
51
  end
47
52
 
53
+ # @return [String, nil] ISO-like language code when available
48
54
  def language
49
55
  return overrides[:language] if overrides[:language]
50
56
 
@@ -57,6 +63,7 @@ module Html2rss
57
63
  parsed_body['lang'] || parsed_body.at_css('[lang]')&.[]('lang')
58
64
  end
59
65
 
66
+ # @return [String, nil] channel author metadata
60
67
  def author
61
68
  return overrides[:author] if overrides[:author]
62
69
 
@@ -65,8 +72,10 @@ module Html2rss
65
72
  parsed_body.at_css('meta[name="author"]')&.[]('content')
66
73
  end
67
74
 
75
+ # @return [String, Time] source last-modified timestamp or current time fallback
68
76
  def last_build_date = headers['last-modified'] || Time.now
69
77
 
78
+ # @return [Html2rss::Url, nil] channel image URL
70
79
  def image
71
80
  return overrides[:image] if overrides[:image]
72
81