html2rss 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -1
  3. data/lib/html2rss/articles/deduplicator.rb +1 -0
  4. data/lib/html2rss/auto_source/cleanup.rb +11 -0
  5. data/lib/html2rss/auto_source/scraper/html.rb +5 -0
  6. data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
  7. data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
  8. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
  9. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
  10. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
  11. data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
  12. data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
  13. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
  14. data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
  15. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
  16. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
  17. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
  18. data/lib/html2rss/auto_source/scraper.rb +19 -1
  19. data/lib/html2rss/auto_source.rb +4 -0
  20. data/lib/html2rss/blocked_surface.rb +1 -0
  21. data/lib/html2rss/category_extractor.rb +2 -2
  22. data/lib/html2rss/cli.rb +30 -6
  23. data/lib/html2rss/config/class_methods.rb +24 -35
  24. data/lib/html2rss/config/dynamic_params.rb +6 -4
  25. data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
  26. data/lib/html2rss/config/request_headers.rb +9 -3
  27. data/lib/html2rss/config/schema.rb +33 -1
  28. data/lib/html2rss/config/validator.rb +40 -2
  29. data/lib/html2rss/config.rb +19 -13
  30. data/lib/html2rss/error.rb +25 -0
  31. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  32. data/lib/html2rss/feed_pipeline.rb +127 -0
  33. data/lib/html2rss/hash_util.rb +101 -0
  34. data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
  35. data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
  36. data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
  37. data/lib/html2rss/html_extractor.rb +5 -0
  38. data/lib/html2rss/html_navigator.rb +8 -0
  39. data/lib/html2rss/json_feed_builder.rb +1 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +8 -3
  41. data/lib/html2rss/rendering/description_builder.rb +0 -1
  42. data/lib/html2rss/rendering/image_renderer.rb +17 -7
  43. data/lib/html2rss/rendering/media_renderer.rb +4 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
  45. data/lib/html2rss/rendering/video_renderer.rb +8 -3
  46. data/lib/html2rss/rendering.rb +11 -2
  47. data/lib/html2rss/request_controls.rb +16 -21
  48. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  49. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  50. data/lib/html2rss/request_service/context.rb +14 -2
  51. data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
  52. data/lib/html2rss/request_service/policy.rb +4 -0
  53. data/lib/html2rss/request_service/response.rb +9 -1
  54. data/lib/html2rss/request_service.rb +19 -0
  55. data/lib/html2rss/request_session/runtime_input.rb +16 -2
  56. data/lib/html2rss/request_session/runtime_policy.rb +7 -0
  57. data/lib/html2rss/request_session.rb +13 -9
  58. data/lib/html2rss/rss_builder/article.rb +22 -1
  59. data/lib/html2rss/rss_builder/channel.rb +11 -2
  60. data/lib/html2rss/rss_builder/enclosure.rb +15 -1
  61. data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
  62. data/lib/html2rss/rss_builder.rb +4 -0
  63. data/lib/html2rss/selectors/config.rb +1 -0
  64. data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
  65. data/lib/html2rss/selectors/extractors/href.rb +2 -0
  66. data/lib/html2rss/selectors/extractors/html.rb +1 -0
  67. data/lib/html2rss/selectors/extractors/static.rb +2 -1
  68. data/lib/html2rss/selectors/extractors/text.rb +1 -0
  69. data/lib/html2rss/selectors/extractors.rb +2 -1
  70. data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
  71. data/lib/html2rss/selectors/post_processors/base.rb +13 -7
  72. data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
  73. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
  74. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
  75. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
  76. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
  77. data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
  78. data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
  79. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
  80. data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
  81. data/lib/html2rss/selectors/post_processors/template.rb +3 -0
  82. data/lib/html2rss/selectors/post_processors.rb +5 -0
  83. data/lib/html2rss/selectors.rb +7 -0
  84. data/lib/html2rss/url.rb +27 -23
  85. data/lib/html2rss/version.rb +2 -1
  86. data/lib/html2rss.rb +15 -78
  87. data/schema/html2rss-config.schema.json +83 -1
  88. metadata +7 -2
@@ -6,6 +6,7 @@ module Html2rss
6
6
  # Normalizes HTTP headers for outgoing requests.
7
7
  # Ensures a browser-like baseline while respecting caller overrides.
8
8
  class RequestHeaders
9
+ # Browser-like default `Accept` header value.
9
10
  DEFAULT_ACCEPT = %w[
10
11
  text/html
11
12
  application/xhtml+xml
@@ -16,6 +17,7 @@ module Html2rss
16
17
  */*;q=0.8
17
18
  ].join(',')
18
19
 
20
+ # Browser-like default `User-Agent` header value.
19
21
  DEFAULT_USER_AGENT = [
20
22
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
21
23
  'AppleWebKit/537.36 (KHTML, like Gecko)',
@@ -23,6 +25,7 @@ module Html2rss
23
25
  'Safari/537.36'
24
26
  ].join(' ')
25
27
 
28
+ # Baseline browser-like header set used for outbound requests.
26
29
  DEFAULT_HEADERS = {
27
30
  'Accept' => DEFAULT_ACCEPT,
28
31
  'Cache-Control' => 'max-age=0',
@@ -37,7 +40,7 @@ module Html2rss
37
40
 
38
41
  class << self
39
42
  ##
40
- # @return [Hash<String, String>] the unmodified default header set
43
+ # @return [Hash{String => String}] the unmodified default header set
41
44
  def browser_defaults
42
45
  DEFAULT_HEADERS.dup
43
46
  end
@@ -48,12 +51,15 @@ module Html2rss
48
51
  # @param headers [Hash, nil] caller provided headers
49
52
  # @param channel_language [String, nil] language defined on the channel
50
53
  # @param url [String] request URL used to infer the Host header
51
- # @return [Hash<String, String>] normalized HTTP headers
54
+ # @return [Hash{String => String}] normalized HTTP headers
52
55
  def normalize(headers, channel_language:, url:)
53
56
  new(headers || {}, channel_language:, url:).to_h
54
57
  end
55
58
  end
56
59
 
60
+ # @param headers [Hash{String, Symbol => String}] caller-provided headers
61
+ # @param channel_language [String, nil] channel language hint for Accept-Language
62
+ # @param url [String, Html2rss::Url, nil] request URL used to infer Host
57
63
  def initialize(headers, channel_language:, url:)
58
64
  @headers = headers
59
65
  @channel_language = channel_language
@@ -61,7 +67,7 @@ module Html2rss
61
67
  end
62
68
 
63
69
  ##
64
- # @return [Hash<String, String>] normalized HTTP headers
70
+ # @return [Hash{String => String}] normalized HTTP headers
65
71
  def to_h
66
72
  defaults = DEFAULT_HEADERS.dup
67
73
  normalized = normalize_custom_headers(headers)
@@ -7,12 +7,13 @@ module Html2rss
7
7
  module Schema
8
8
  module_function
9
9
 
10
+ # Canonical filename for the exported config JSON schema artifact.
10
11
  SCHEMA_FILENAME = 'html2rss-config.schema.json'
11
12
 
12
13
  ##
13
14
  # Returns the exported configuration JSON Schema.
14
15
  #
15
- # @return [Hash<String, Object>] JSON Schema represented as a Ruby hash
16
+ # @return [Hash{String => Object}] JSON Schema represented as a Ruby hash
16
17
  def json_schema
17
18
  load_json_schema_extension!
18
19
  Builder.call
@@ -38,6 +39,7 @@ module Html2rss
38
39
  File.expand_path("../../../schema/#{SCHEMA_FILENAME}", __dir__)
39
40
  end
40
41
 
42
+ # @return [void]
41
43
  def load_json_schema_extension!
42
44
  require 'dry/schema/extensions/json_schema'
43
45
  Dry::Schema.load_extensions(:json_schema)
@@ -48,11 +50,13 @@ module Html2rss
48
50
  # client-facing overlays.
49
51
  class Builder
50
52
  class << self
53
+ # @return [Hash{String => Object}] fully assembled JSON schema hash
51
54
  def call
52
55
  new.call
53
56
  end
54
57
  end
55
58
 
59
+ # @return [Hash{String => Object}] fully assembled JSON schema hash
56
60
  def call
57
61
  schema = validator_schema
58
62
  apply_top_level(schema)
@@ -76,6 +80,7 @@ module Html2rss
76
80
 
77
81
  def assign_properties(properties)
78
82
  properties.merge!(
83
+ strategy: Components.strategy,
79
84
  headers: Components.headers,
80
85
  stylesheets: Components.stylesheets,
81
86
  auto_source: Components.auto_source,
@@ -90,6 +95,15 @@ module Html2rss
90
95
  module Components
91
96
  module_function
92
97
 
98
+ # @return [Hash{Symbol => Object}] schema fragment for strategy selection
99
+ def strategy
100
+ {
101
+ type: 'string',
102
+ not: { type: 'null' }
103
+ }
104
+ end
105
+
106
+ # @return [Hash{Symbol => Object}] schema fragment for headers
93
107
  def headers
94
108
  {
95
109
  type: 'object',
@@ -98,6 +112,7 @@ module Html2rss
98
112
  }
99
113
  end
100
114
 
115
+ # @return [Hash{Symbol => Object}] schema fragment for stylesheet definitions
101
116
  def stylesheets
102
117
  {
103
118
  type: 'array',
@@ -106,12 +121,14 @@ module Html2rss
106
121
  }
107
122
  end
108
123
 
124
+ # @return [Hash{Symbol => Object}] schema fragment for auto_source configuration
109
125
  def auto_source
110
126
  schema = Html2rss::AutoSource::Config.json_schema(loose: true)
111
127
  schema[:default] = DeepStringifier.call(Html2rss::AutoSource::DEFAULT_CONFIG)
112
128
  schema
113
129
  end
114
130
 
131
+ # @return [Hash{Symbol => Object}] schema fragment for selectors configuration
115
132
  def selectors
116
133
  Selectors.schema
117
134
  end
@@ -122,8 +139,10 @@ module Html2rss
122
139
  module Selectors
123
140
  module_function
124
141
 
142
+ # Pattern used for dynamic selector keys excluding reserved selector names.
125
143
  RESERVED_SELECTOR_PATTERN = '^(?!items$|enclosure$|guid$|categories$).+$'
126
144
 
145
+ # @return [Hash{Symbol => Object}] schema fragment for selectors root object
127
146
  def schema
128
147
  {
129
148
  type: 'object',
@@ -135,6 +154,7 @@ module Html2rss
135
154
  end
136
155
 
137
156
  # rubocop:disable Layout/LineLength
157
+ # @return [Hash{Symbol => Object}] schema map for reserved selector properties
138
158
  def selector_properties
139
159
  {
140
160
  items: items_schema,
@@ -145,22 +165,26 @@ module Html2rss
145
165
  end
146
166
  # rubocop:enable Layout/LineLength
147
167
 
168
+ # @return [Hash{String => Object}] schema map for dynamic selector keys
148
169
  def pattern_properties
149
170
  { RESERVED_SELECTOR_PATTERN => dynamic_selector_schema }
150
171
  end
151
172
 
173
+ # @return [Hash{Symbol => Object}] schema fragment for dynamic selector entries
152
174
  def dynamic_selector_schema
153
175
  Html2rss::Selectors::Config::Selector.new.schema.json_schema(loose: true).merge(
154
176
  description: 'Dynamic selector definition keyed by attribute name.'
155
177
  )
156
178
  end
157
179
 
180
+ # @return [Hash{Symbol => Object}] schema fragment for `items` selector configuration
158
181
  def items_schema
159
182
  Html2rss::Selectors::Config::Items.new.schema.json_schema(loose: true).merge(
160
183
  description: 'Defines the items selector and optional enhancement settings.'
161
184
  )
162
185
  end
163
186
 
187
+ # @return [Hash{Symbol => Object}] schema fragment for `enclosure` selector configuration
164
188
  def enclosure_schema
165
189
  Html2rss::Selectors::Config::Enclosure.new.schema.json_schema(loose: true).merge(
166
190
  description: 'Describes enclosure extraction settings.'
@@ -170,6 +194,8 @@ module Html2rss
170
194
  # JSON Schema can enforce non-empty reference arrays, while runtime
171
195
  # validation remains authoritative for checking that each entry points
172
196
  # to an existing sibling selector key.
197
+ # @param description [String] human-readable description for the reference field
198
+ # @return [Hash{Symbol => Object}] JSON schema fragment for selector references
173
199
  def reference_array(description)
174
200
  {
175
201
  type: 'array',
@@ -188,17 +214,23 @@ module Html2rss
188
214
  module DeepStringifier
189
215
  module_function
190
216
 
217
+ # @param object [Hash, Array, Object] nested data to normalize
218
+ # @return [Hash, Array, Object] deep copy with stringified hash keys
191
219
  def call(object)
192
220
  case object
193
221
  when Hash
194
222
  stringify_hash(object)
195
223
  when Array
196
224
  object.map { |value| call(value) }
225
+ when Symbol
226
+ object.to_s
197
227
  else
198
228
  object
199
229
  end
200
230
  end
201
231
 
232
+ # @param object [Hash{Object => Object}] hash whose keys should become strings
233
+ # @return [Hash{String => Object}] hash with recursively normalized values
202
234
  def stringify_hash(object)
203
235
  object.to_h { |key, value| [key.to_s, call(value)] }
204
236
  end
@@ -6,11 +6,17 @@ module Html2rss
6
6
  class Config
7
7
  # Validates the configuration hash using Dry::Validation.
8
8
  # The configuration options adhere to the documented schema in README.md.
9
- class Validator < Dry::Validation::Contract
9
+ class Validator < Dry::Validation::Contract # rubocop:disable Metrics/ClassLength
10
+ # URI format used for channel URL validation.
10
11
  URI_REGEXP = Url::URI_REGEXP
12
+ # Allowed stylesheet MIME types.
11
13
  STYLESHEET_TYPES = RssBuilder::Stylesheet::TYPES
14
+ # Optional language/region format (`en` or `en-US`).
12
15
  LANGUAGE_FORMAT_REGEX = /\A[a-z]{2}(-[A-Z]{2})?\z/
16
+ # Baseline strategy enum exported in static schema artifacts.
17
+ BASE_STRATEGY_OPTIONS = ([:auto] + Html2rss::RequestService.strategy_names.map(&:to_sym)).uniq.freeze
13
18
 
19
+ # Contract for the top-level `channel` section.
14
20
  ChannelConfig = Dry::Schema.Params do
15
21
  required(:url).filled(:string, format?: URI_REGEXP)
16
22
  optional(:title).maybe(:string)
@@ -20,41 +26,66 @@ module Html2rss
20
26
  optional(:time_zone).maybe(:string)
21
27
  end
22
28
 
29
+ # Contract for a stylesheet entry in `stylesheets`.
23
30
  StylesheetConfig = Dry::Schema.Params do
24
31
  required(:href).filled(:string)
25
32
  required(:type).filled(:string, included_in?: STYLESHEET_TYPES)
26
33
  optional(:media).maybe(:string)
27
34
  end
28
35
 
36
+ # Contract for Browserless click-preload options.
29
37
  BrowserlessPreloadClickSelectorConfig = Dry::Schema.Params do
30
38
  required(:selector).filled(:string)
31
39
  optional(:max_clicks).filled(:integer, gt?: 0)
32
40
  optional(:wait_after_ms).filled(:integer, gteq?: 0)
33
41
  end
34
42
 
43
+ # Contract for Browserless scroll-preload options.
35
44
  BrowserlessPreloadScrollConfig = Dry::Schema.Params do
36
45
  optional(:iterations).filled(:integer, gt?: 0)
37
46
  optional(:wait_after_ms).filled(:integer, gteq?: 0)
38
47
  end
39
48
 
49
+ # Contract for Browserless preload orchestration options.
40
50
  BrowserlessPreloadConfig = Dry::Schema.Params do
41
51
  optional(:wait_after_ms).filled(:integer, gteq?: 0)
42
52
  optional(:click_selectors).array(BrowserlessPreloadClickSelectorConfig)
43
53
  optional(:scroll_down).hash(BrowserlessPreloadScrollConfig)
44
54
  end
45
55
 
56
+ # Contract for Browserless-specific request options.
46
57
  BrowserlessRequestConfig = Dry::Schema.Params do
47
58
  optional(:preload).hash(BrowserlessPreloadConfig)
48
59
  end
49
60
 
61
+ # Contract for Botasaurus-specific request options.
62
+ BotasaurusRequestConfig = Dry::Schema.Params do
63
+ config.validate_keys = true
64
+
65
+ optional(:navigation_mode).filled(:string, included_in?: %w[auto get google_get google_get_bypass])
66
+ optional(:max_retries).filled(:integer, gteq?: 0, lteq?: 3)
67
+ optional(:wait_for_selector).maybe(:string)
68
+ optional(:wait_timeout_seconds).filled(:integer, gt?: 0)
69
+ optional(:block_images).filled(:bool)
70
+ optional(:block_images_and_css).filled(:bool)
71
+ optional(:wait_for_complete_page_load).filled(:bool)
72
+ optional(:headless).filled(:bool)
73
+ optional(:proxy).filled(:string)
74
+ optional(:user_agent).filled(:string)
75
+ optional(:window_size).value(:array, min_size?: 2, max_size?: 2).each(:integer, gt?: 0)
76
+ optional(:lang).filled(:string)
77
+ end
78
+
79
+ # Contract for the top-level `request` section.
50
80
  RequestConfig = Dry::Schema.Params do
51
81
  optional(:max_redirects).filled(:integer, gteq?: 0)
52
82
  optional(:max_requests).filled(:integer, gt?: 0)
53
83
  optional(:browserless).hash(BrowserlessRequestConfig)
84
+ optional(:botasaurus).hash(BotasaurusRequestConfig)
54
85
  end
55
86
 
56
87
  params do
57
- required(:strategy).filled(:symbol)
88
+ optional(:strategy).filled(:symbol)
58
89
  required(:channel).hash(ChannelConfig)
59
90
  optional(:headers).hash
60
91
  optional(:stylesheets).array(StylesheetConfig)
@@ -76,6 +107,13 @@ module Html2rss
76
107
  base.failure(value) if value
77
108
  end
78
109
 
110
+ rule(:strategy) do
111
+ next if value.nil?
112
+ next if value == :auto || Html2rss::RequestService.strategy_registered?(value)
113
+
114
+ key.failure("must be one of: #{BASE_STRATEGY_OPTIONS.join(', ')}")
115
+ end
116
+
79
117
  # Ensure at least one of :selectors or :auto_source is present.
80
118
  rule(:selectors, :auto_source) do
81
119
  unless values.key?(:selectors) || values.key?(:auto_source)
@@ -11,6 +11,7 @@ module Html2rss
11
11
  #
12
12
  # Configuration is validated during initialization.
13
13
  class Config
14
+ # Raised when a configuration hash fails runtime validation.
14
15
  class InvalidConfig < Html2rss::Error; end
15
16
  extend ClassMethods
16
17
 
@@ -19,7 +20,7 @@ module Html2rss
19
20
  #
20
21
  # Processes deprecated attributes, applies default values, and validates the configuration.
21
22
  #
22
- # @param config [Hash<Symbol, Object>] the configuration hash.
23
+ # @param config [Hash{Symbol => Object}] the configuration hash.
23
24
  # @raise [InvalidConfig] if the configuration fails validation.
24
25
  def initialize(config)
25
26
  @request_controls = RequestControls.from_config(config)
@@ -34,9 +35,13 @@ module Html2rss
34
35
  )
35
36
  end
36
37
 
38
+ # @return [Symbol, nil] selected request strategy
37
39
  def strategy = request_controls.strategy
40
+ # @return [Integer, nil] configured redirect budget
38
41
  def max_redirects = request_controls.max_redirects
42
+ # @return [Integer, nil] configured request budget
39
43
  def max_requests = request_controls.max_requests
44
+ # @return [Array<Hash>] stylesheet definitions
40
45
  def stylesheets = config[:stylesheets]
41
46
 
42
47
  ##
@@ -49,14 +54,21 @@ module Html2rss
49
54
  # @return [Html2rss::RequestControls] request controls with provenance
50
55
  attr_reader :request_controls
51
56
 
57
+ # @return [Hash{String => String}] normalized HTTP headers
52
58
  def headers = config[:headers]
59
+ # @return [Hash{Symbol => Object}] channel configuration
53
60
  def channel = config[:channel]
61
+ # @return [String] source channel URL
54
62
  def url = config.dig(:channel, :url)
63
+ # @return [String, nil] configured channel time zone
55
64
  def time_zone = config.dig(:channel, :time_zone)
56
65
 
66
+ # @return [Hash{Symbol => Object}] request envelope configuration
57
67
  def request = config[:request]
58
68
 
69
+ # @return [Hash{Symbol => Object}, nil] selectors configuration
59
70
  def selectors = config[:selectors]
71
+ # @return [Hash{Symbol => Object}, nil] auto-source configuration
60
72
  def auto_source = config[:auto_source]
61
73
 
62
74
  private
@@ -66,8 +78,8 @@ module Html2rss
66
78
  # Normalizes raw config input before validation.
67
79
  class Preparer
68
80
  ##
69
- # @param config [Hash<Symbol, Object>] raw config input
70
- # @return [Hash<Symbol, Object>] config with defaults and deprecations applied
81
+ # @param config [Hash{Symbol => Object}] raw config input
82
+ # @return [Hash{Symbol => Object}] config with defaults and deprecations applied
71
83
  def call(config)
72
84
  config = config.dup if config.frozen?
73
85
 
@@ -82,7 +94,7 @@ module Html2rss
82
94
  private
83
95
 
84
96
  def handle_deprecated_channel_attributes(config)
85
- { strategy: RequestService.default_strategy_name, headers: {} }.each_pair do |key, default_value|
97
+ { strategy: Config.default_strategy_name, headers: {} }.each_pair do |key, default_value|
86
98
  if !config[key] && (value = config.dig(:channel, key))
87
99
  Log.warn("The `channel.#{key}` key is deprecated. Please move the definition of `#{key}` to the top level.")
88
100
  config[key] = value
@@ -95,21 +107,15 @@ module Html2rss
95
107
  end
96
108
 
97
109
  def apply_default_config(config)
98
- deep_merge(Config.default_config, config)
110
+ HashUtil.deep_merge(Config.default_config, config)
99
111
  end
100
112
 
101
113
  def apply_default_selectors_config(config)
102
- deep_merge({ selectors: Selectors::DEFAULT_CONFIG }, config)
114
+ HashUtil.deep_merge({ selectors: Selectors::DEFAULT_CONFIG }, config)
103
115
  end
104
116
 
105
117
  def apply_default_auto_source_config(config)
106
- deep_merge({ auto_source: Html2rss::AutoSource::DEFAULT_CONFIG }, config)
107
- end
108
-
109
- def deep_merge(base_config, override_config)
110
- base_config.merge(override_config) do |_key, oldval, newval|
111
- oldval.is_a?(Hash) && newval.is_a?(Hash) ? deep_merge(oldval, newval) : newval
112
- end
118
+ HashUtil.deep_merge({ auto_source: Html2rss::AutoSource::DEFAULT_CONFIG }, config)
113
119
  end
114
120
  end
115
121
 
@@ -3,4 +3,29 @@
3
3
  module Html2rss
4
4
  # The Html2rss::Error base class.
5
5
  class Error < StandardError; end
6
+
7
+ # Raised when auto fallback exhausts all concrete tiers and extractors find no feed items.
8
+ class NoFeedItemsExtracted < Error
9
+ ##
10
+ # @param attempts [Array<Hash{Symbol => Object}>] tier attempt diagnostics
11
+ def initialize(attempts:)
12
+ @attempts = attempts
13
+ super(build_message)
14
+ end
15
+
16
+ # @return [Array<Hash{Symbol => Object}>] tier attempt diagnostics
17
+ attr_reader :attempts
18
+
19
+ private
20
+
21
+ def build_message
22
+ summaries = attempts.map do |attempt|
23
+ details = attempt[:items_count].nil? ? "#{attempt[:error_class]} error" : "#{attempt[:items_count]} items"
24
+ "#{attempt[:strategy]} (#{details})"
25
+ end.join(', ')
26
+
27
+ "No feed items extracted after auto fallback across strategies: #{summaries}. " \
28
+ 'Try a more specific listing URL or provide explicit selectors.'
29
+ end
30
+ end
6
31
  end
@@ -0,0 +1,127 @@
1
+ # frozen_string_literal: true
2
+
3
+ ##
4
+ # The Html2rss namespace.
5
+ module Html2rss
6
+ ##
7
+ # Coordinates feed generation pipeline stages.
8
+ class FeedPipeline
9
+ # Retries feed extraction across concrete request strategies for :auto mode.
10
+ class AutoFallback
11
+ # Ordered list of concrete request strategies attempted by auto mode.
12
+ CHAIN = %i[faraday botasaurus browserless].freeze
13
+
14
+ # Error classes that should abort auto fallback immediately.
15
+ NON_FALLBACK_ERRORS = [
16
+ RequestService::UnknownStrategy,
17
+ RequestService::InvalidUrl,
18
+ RequestService::UnsupportedUrlScheme,
19
+ RequestService::UnsupportedResponseContentType,
20
+ RequestService::RequestBudgetExceeded,
21
+ RequestService::PrivateNetworkDenied,
22
+ RequestService::CrossOriginFollowUpDenied,
23
+ RequestService::ResponseTooLarge,
24
+ RequestService::BrowserlessConfigurationError
25
+ ].freeze
26
+
27
+ ##
28
+ # @param strategies [Array<Symbol>] ordered concrete strategies for fallback
29
+ # @param budget [RequestService::Budget] shared request budget across retries
30
+ # @param session_for [Proc] request session factory proc
31
+ # @param articles_for [Proc] article extraction proc
32
+ # @return [void]
33
+ def initialize(strategies:, budget:, session_for:, articles_for:)
34
+ @strategies = strategies
35
+ @budget = budget
36
+ @session_for = session_for
37
+ @articles_for = articles_for
38
+ end
39
+
40
+ ##
41
+ # @return [Hash{Symbol => Object}] pipeline state containing :response and :articles
42
+ def call
43
+ state, attempts = run_attempts
44
+ return state if state
45
+
46
+ finalize_failure(attempts:)
47
+ end
48
+
49
+ private
50
+
51
+ attr_reader :strategies, :budget, :session_for, :articles_for
52
+
53
+ def run_attempts
54
+ state = { result: nil, attempts: [] }
55
+ strategies.each_with_index do |strategy, index|
56
+ run_attempt_for(strategy:, next_strategy: strategies[index + 1], state:)
57
+ break if state.fetch(:result)
58
+ end
59
+ [state.fetch(:result), state.fetch(:attempts)]
60
+ end
61
+
62
+ def run_attempt_for(strategy:, next_strategy:, state:)
63
+ result, attempts = attempt(
64
+ strategy:,
65
+ next_strategy:,
66
+ state: { attempts: state.fetch(:attempts) }
67
+ )
68
+ state[:result] = result
69
+ state[:attempts] = attempts
70
+ end
71
+
72
+ def attempt(strategy:, next_strategy:, state:)
73
+ request_session = session_for.call(strategy:, budget:)
74
+ response, state = fetch_response(
75
+ request_session:,
76
+ strategy:,
77
+ next_strategy:,
78
+ state:
79
+ )
80
+ return [nil, state.fetch(:attempts)] unless response
81
+
82
+ process_response(response:, strategy:, next_strategy:, request_session:, state:)
83
+ end
84
+
85
+ def fetch_response(request_session:, strategy:, next_strategy:, state:)
86
+ [request_session.fetch_initial_response, state]
87
+ rescue *NON_FALLBACK_ERRORS
88
+ raise
89
+ rescue StandardError => error
90
+ state[:attempts] << { strategy:, items_count: nil, error_class: error.class.name }
91
+ log_warn_fallback_error(strategy:, next_strategy:, error:) if next_strategy
92
+ Log.debug("#{self.class}: strategy=#{strategy} error=#{error.class}: #{error.message}")
93
+ [nil, state]
94
+ end
95
+
96
+ def process_response(response:, strategy:, next_strategy:, request_session:, state:)
97
+ articles = articles_for.call(response:, request_session:)
98
+ items_count = articles.size
99
+ state[:attempts] << { strategy:, items_count:, error_class: nil }
100
+ Log.debug("#{self.class}: strategy=#{strategy} items=#{items_count}")
101
+ return success_state(response:, strategy:, articles:, state:) if items_count.positive?
102
+
103
+ log_info_fallback_zero_items(strategy:, next_strategy:) if next_strategy
104
+ [nil, state.fetch(:attempts)]
105
+ end
106
+
107
+ def success_state(response:, strategy:, articles:, state:)
108
+ if state.fetch(:attempts).size > 1
109
+ Log.info("#{self.class}: auto selected strategy=#{strategy} after attempts=#{state.fetch(:attempts).size}")
110
+ end
111
+ [{ response:, articles: }, state.fetch(:attempts)]
112
+ end
113
+
114
+ def finalize_failure(attempts:)
115
+ raise NoFeedItemsExtracted.new(attempts:)
116
+ end
117
+
118
+ def log_warn_fallback_error(strategy:, next_strategy:, error:)
119
+ Log.warn("#{self.class}: auto fallback #{strategy} -> #{next_strategy} after error=#{error.class}")
120
+ end
121
+
122
+ def log_info_fallback_zero_items(strategy:, next_strategy:)
123
+ Log.info("#{self.class}: auto fallback #{strategy} -> #{next_strategy} after zero extracted items")
124
+ end
125
+ end
126
+ end
127
+ end