html2rss 0.18.0 → 0.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -1
  3. data/lib/html2rss/articles/deduplicator.rb +1 -0
  4. data/lib/html2rss/auto_source/cleanup.rb +11 -0
  5. data/lib/html2rss/auto_source/scraper/html.rb +5 -0
  6. data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
  7. data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
  8. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
  9. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
  10. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
  11. data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
  12. data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
  13. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
  14. data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
  15. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
  16. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
  17. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
  18. data/lib/html2rss/auto_source/scraper.rb +19 -1
  19. data/lib/html2rss/auto_source.rb +4 -0
  20. data/lib/html2rss/blocked_surface.rb +1 -0
  21. data/lib/html2rss/category_extractor.rb +2 -2
  22. data/lib/html2rss/cli.rb +30 -6
  23. data/lib/html2rss/config/class_methods.rb +24 -35
  24. data/lib/html2rss/config/dynamic_params.rb +6 -4
  25. data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
  26. data/lib/html2rss/config/request_headers.rb +9 -3
  27. data/lib/html2rss/config/schema.rb +33 -1
  28. data/lib/html2rss/config/validator.rb +40 -2
  29. data/lib/html2rss/config.rb +19 -13
  30. data/lib/html2rss/error.rb +25 -0
  31. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  32. data/lib/html2rss/feed_pipeline.rb +127 -0
  33. data/lib/html2rss/hash_util.rb +101 -0
  34. data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
  35. data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
  36. data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
  37. data/lib/html2rss/html_extractor.rb +5 -0
  38. data/lib/html2rss/html_navigator.rb +8 -0
  39. data/lib/html2rss/json_feed_builder.rb +1 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +8 -3
  41. data/lib/html2rss/rendering/description_builder.rb +0 -1
  42. data/lib/html2rss/rendering/image_renderer.rb +17 -7
  43. data/lib/html2rss/rendering/media_renderer.rb +4 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
  45. data/lib/html2rss/rendering/video_renderer.rb +8 -3
  46. data/lib/html2rss/rendering.rb +11 -2
  47. data/lib/html2rss/request_controls.rb +16 -21
  48. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  49. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  50. data/lib/html2rss/request_service/context.rb +14 -2
  51. data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
  52. data/lib/html2rss/request_service/policy.rb +4 -0
  53. data/lib/html2rss/request_service/response.rb +9 -1
  54. data/lib/html2rss/request_service.rb +19 -0
  55. data/lib/html2rss/request_session/runtime_input.rb +16 -2
  56. data/lib/html2rss/request_session/runtime_policy.rb +7 -0
  57. data/lib/html2rss/request_session.rb +13 -9
  58. data/lib/html2rss/rss_builder/article.rb +22 -1
  59. data/lib/html2rss/rss_builder/channel.rb +11 -2
  60. data/lib/html2rss/rss_builder/enclosure.rb +15 -1
  61. data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
  62. data/lib/html2rss/rss_builder.rb +4 -0
  63. data/lib/html2rss/selectors/config.rb +1 -0
  64. data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
  65. data/lib/html2rss/selectors/extractors/href.rb +2 -0
  66. data/lib/html2rss/selectors/extractors/html.rb +1 -0
  67. data/lib/html2rss/selectors/extractors/static.rb +2 -1
  68. data/lib/html2rss/selectors/extractors/text.rb +1 -0
  69. data/lib/html2rss/selectors/extractors.rb +2 -1
  70. data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
  71. data/lib/html2rss/selectors/post_processors/base.rb +13 -7
  72. data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
  73. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
  74. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
  75. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
  76. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
  77. data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
  78. data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
  79. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
  80. data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
  81. data/lib/html2rss/selectors/post_processors/template.rb +3 -0
  82. data/lib/html2rss/selectors/post_processors.rb +5 -0
  83. data/lib/html2rss/selectors.rb +7 -0
  84. data/lib/html2rss/url.rb +27 -23
  85. data/lib/html2rss/version.rb +2 -1
  86. data/lib/html2rss.rb +15 -78
  87. data/schema/html2rss-config.schema.json +83 -1
  88. metadata +7 -2
@@ -7,9 +7,16 @@ module Html2rss
7
7
  ##
8
8
  # Determines whether a WordPress page can safely be mapped to a posts query.
9
9
  class PageScope
10
+ # Canonical path segment for category archives.
10
11
  CATEGORY_SEGMENT = 'category'
12
+ # Canonical path segment for tag archives.
11
13
  TAG_SEGMENT = 'tag'
14
+ # Canonical path segment for author archives.
12
15
  AUTHOR_SEGMENT = 'author'
16
+ # Canonical path segment for paginated archives.
17
+ PAGE_SEGMENT = 'page'
18
+ # Canonical query key used for paginated archives.
19
+ PAGED_QUERY_KEY = 'paged'
13
20
 
14
21
  ##
15
22
  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
@@ -20,7 +27,7 @@ module Html2rss
20
27
  end
21
28
 
22
29
  ##
23
- # @param query [Hash<String, String>] scoped query params for the posts endpoint
30
+ # @param query [Hash{String => String}] scoped query params for the posts endpoint
24
31
  # @param fetchable [Boolean] whether a posts follow-up is safe for this page
25
32
  # @param reason [Symbol] classification of the resolved page scope
26
33
  def initialize(query:, fetchable:, reason:)
@@ -31,7 +38,7 @@ module Html2rss
31
38
  end
32
39
 
33
40
  ##
34
- # @return [Hash<String, String>] query params to apply to the posts request
41
+ # @return [Hash{String => String}] query params to apply to the posts request
35
42
  attr_reader :query
36
43
 
37
44
  ##
@@ -58,11 +65,13 @@ module Html2rss
58
65
  ##
59
66
  # @return [PageScope] derived page scope
60
67
  def call
61
- category_scope ||
62
- tag_scope ||
63
- author_scope ||
64
- date_scope ||
65
- fallback_scope
68
+ scope = category_scope ||
69
+ tag_scope ||
70
+ author_scope ||
71
+ date_scope ||
72
+ fallback_scope
73
+
74
+ apply_pagination(scope)
66
75
  end
67
76
 
68
77
  private
@@ -103,6 +112,17 @@ module Html2rss
103
112
  PageScope.new(query: {}, fetchable: true, reason: :unscoped)
104
113
  end
105
114
 
115
+ def apply_pagination(scope)
116
+ page = archive_page_number
117
+ return scope unless scope.fetchable? && page
118
+
119
+ PageScope.new(
120
+ query: scope.query.merge('page' => page.to_s),
121
+ fetchable: scope.fetchable?,
122
+ reason: scope.reason
123
+ )
124
+ end
125
+
106
126
  def scoped_scope(query)
107
127
  return unknown_archive_scope if query.values.any?(&:nil?)
108
128
 
@@ -166,8 +186,12 @@ module Html2rss
166
186
  @path_segments ||= canonical_or_current_url.path_segments
167
187
  end
168
188
 
189
+ def scoped_path_segments
190
+ @scoped_path_segments ||= paginated_path? ? path_segments[0...-2] : path_segments
191
+ end
192
+
169
193
  def leading_path_segment
170
- path_segments.first
194
+ scoped_path_segments.first
171
195
  end
172
196
 
173
197
  def date_archive_path?
@@ -202,16 +226,33 @@ module Html2rss
202
226
  end
203
227
 
204
228
  def date_archive_segments
205
- year_index = path_segments.find_index { _1.match?(/\A\d{4}\z/) }
229
+ year_index = scoped_path_segments.find_index { _1.match?(/\A\d{4}\z/) }
206
230
  return unless year_index
207
231
 
208
- segments = path_segments.drop(year_index)
232
+ segments = scoped_path_segments.drop(year_index)
209
233
  return unless segments.length.between?(1, 3)
210
234
  return unless archive_segment_shape?(segments)
211
235
 
212
236
  segments
213
237
  end
214
238
 
239
+ def paginated_path?
240
+ return false if path_segments.length < 2
241
+ return false unless path_segments[-2] == PAGE_SEGMENT
242
+
243
+ !parse_positive_integer(path_segments[-1]).nil?
244
+ end
245
+
246
+ def archive_page_number
247
+ parse_positive_integer(canonical_or_current_url.query_values[PAGED_QUERY_KEY]) || path_page_number
248
+ end
249
+
250
+ def path_page_number
251
+ return unless paginated_path?
252
+
253
+ parse_positive_integer(path_segments[-1])
254
+ end
255
+
215
256
  def archive_segment_shape?(segments)
216
257
  month = segments[1]
217
258
  day = segments[2]
@@ -253,6 +294,15 @@ module Html2rss
253
294
 
254
295
  number
255
296
  end
297
+
298
+ def parse_positive_integer(value)
299
+ return nil unless value.to_s.match?(/\A\d+\z/)
300
+
301
+ number = value.to_i
302
+ return nil if number < 1
303
+
304
+ number
305
+ end
256
306
  end
257
307
  end
258
308
  end
@@ -7,13 +7,14 @@ module Html2rss
7
7
  ##
8
8
  # Resolves the WordPress posts endpoint for a given page and scope.
9
9
  class PostsEndpoint
10
+ # REST API collection path for posts resources.
10
11
  POSTS_PATH = 'wp/v2/posts'
11
12
 
12
13
  ##
13
14
  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
14
15
  # @param page_url [Html2rss::Url] canonical page URL
15
16
  # @param page_scope [Html2rss::AutoSource::Scraper::WordpressApi::PageScope] derived page scope
16
- # @param posts_query [Hash<String, String>] query params for the posts request
17
+ # @param posts_query [Hash{String => String}] query params for the posts request
17
18
  # @param logger [Logger] logger used for operational warnings
18
19
  # @return [Html2rss::Url, nil] resolved posts endpoint or nil when unavailable
19
20
  def self.resolve(parsed_body:, page_url:, page_scope:, posts_query:, logger: Html2rss::Log)
@@ -24,7 +25,7 @@ module Html2rss
24
25
  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
25
26
  # @param page_url [Html2rss::Url] canonical page URL
26
27
  # @param page_scope [Html2rss::AutoSource::Scraper::WordpressApi::PageScope] derived page scope
27
- # @param posts_query [Hash<String, String>] query params for the posts request
28
+ # @param posts_query [Hash{String => String}] query params for the posts request
28
29
  # @param logger [Logger] logger used for operational warnings
29
30
  def initialize(parsed_body:, page_url:, page_scope:, posts_query:, logger:)
30
31
  @parsed_body = parsed_body
@@ -10,9 +10,18 @@ module Html2rss
10
10
  class WordpressApi # rubocop:disable Metrics/ClassLength
11
11
  include Enumerable
12
12
 
13
+ # Selector for WordPress API discovery link tags.
13
14
  API_LINK_SELECTOR = 'link[rel="https://api.w.org/"][href]'
15
+ # Selector for canonical link tags used for scope normalization.
14
16
  CANONICAL_LINK_SELECTOR = 'link[rel="canonical"][href]'
17
+ # Fields requested from the WordPress posts endpoint.
15
18
  POSTS_FIELDS = %w[id title excerpt content link date categories].freeze
19
+ # Baseline query sent to WordPress posts API follow-ups.
20
+ POSTS_QUERY_DEFAULTS = {
21
+ '_fields' => POSTS_FIELDS.join(','),
22
+ 'per_page' => '100'
23
+ }.freeze
24
+ # @return [Symbol] scraper config key
16
25
  def self.options_key = :wordpress_api
17
26
 
18
27
  ##
@@ -29,6 +38,7 @@ module Html2rss
29
38
  # @param url [String, Html2rss::Url] canonical page URL
30
39
  # @param request_session [Html2rss::RequestSession, nil] shared request session for follow-up fetches
31
40
  # @param _opts [Hash] unused scraper-specific options
41
+ # @option _opts [Object] :_reserved reserved for future scraper-specific options
32
42
  # @return [void]
33
43
  def initialize(parsed_body, url:, request_session: nil, **_opts)
34
44
  @parsed_body = parsed_body
@@ -40,7 +50,7 @@ module Html2rss
40
50
  ##
41
51
  # Yields article hashes from the WordPress posts API.
42
52
  #
43
- # @yieldparam article [Hash<Symbol, Object>] normalized article hash
53
+ # @yieldparam article [Hash{Symbol => Object}] normalized article hash
44
54
  # @return [Enumerator, void] enumerator when no block is given
45
55
  def each
46
56
  return enum_for(:each) unless block_given?
@@ -94,7 +104,7 @@ module Html2rss
94
104
  end
95
105
 
96
106
  def article_id(_post, article_url)
97
- root_path_query_id(article_url) || string(article_url.path) || article_url.to_s
107
+ root_path_query_id(article_url) || present_string(article_url.path) || article_url.to_s
98
108
  end
99
109
 
100
110
  def article_title(post)
@@ -106,11 +116,11 @@ module Html2rss
106
116
  end
107
117
 
108
118
  def article_published_at(post)
109
- string(post[:date])
119
+ present_string(post[:date])
110
120
  end
111
121
 
112
122
  def article_categories(post)
113
- Array(post[:categories]).filter_map { |value| string(value) }
123
+ Array(post[:categories]).filter_map { |value| present_string(value) }
114
124
  end
115
125
 
116
126
  def article_attributes(post, article_url)
@@ -125,7 +135,7 @@ module Html2rss
125
135
  end
126
136
 
127
137
  def absolute_link(link)
128
- value = string(link)
138
+ value = present_string(link)
129
139
  return unless value
130
140
 
131
141
  Html2rss::Url.from_relative(value, url)
@@ -138,17 +148,17 @@ module Html2rss
138
148
  end
139
149
 
140
150
  def rendered_html(value)
141
- text = string(value)
151
+ text = present_string(value)
142
152
  text unless text.nil?
143
153
  end
144
154
 
145
- def string(value)
155
+ def present_string(value)
146
156
  text = value.to_s.strip
147
157
  text unless text.empty?
148
158
  end
149
159
 
150
160
  def root_path_query_id(article_url)
151
- query = string(article_url.query)
161
+ query = present_string(article_url.query)
152
162
  return unless query
153
163
 
154
164
  path = article_url.path.to_s
@@ -158,10 +168,7 @@ module Html2rss
158
168
  end
159
169
 
160
170
  def posts_query
161
- {
162
- '_fields' => POSTS_FIELDS.join(','),
163
- 'per_page' => '100'
164
- }.merge(page_scope.query)
171
+ POSTS_QUERY_DEFAULTS.merge(page_scope.query).transform_values(&:to_s)
165
172
  end
166
173
 
167
174
  def posts_endpoint_url
@@ -14,12 +14,15 @@ module Html2rss
14
14
  # Scrapers run in parallel threads, so implementations must avoid shared
15
15
  # mutable state and degrade by returning no articles when a follow-up would
16
16
  # be unsafe or unsupported.
17
- #
18
17
  module Scraper
18
+ # Root markers indicating likely app-shell/client-rendered surfaces.
19
19
  APP_SHELL_ROOT_SELECTORS = '#app, #root, #__next, [data-reactroot], [ng-app], [id*="app-shell"]'
20
+ # Maximum anchors tolerated before app-shell detection is considered unlikely.
20
21
  APP_SHELL_MAX_ANCHORS = 2
22
+ # Maximum visible text length tolerated for app-shell classification.
21
23
  APP_SHELL_MAX_VISIBLE_TEXT_LENGTH = 220
22
24
 
25
+ # Ordered scraper classes considered during auto-source extraction.
23
26
  SCRAPERS = [
24
27
  WordpressApi,
25
28
  Schema,
@@ -32,6 +35,7 @@ module Html2rss
32
35
  ##
33
36
  # Error raised when no suitable scraper is found.
34
37
  class NoScraperFound < Html2rss::Error
38
+ # User-facing messages grouped by no-scraper surface category.
35
39
  CATEGORY_MESSAGES = {
36
40
  blocked_surface: 'No scrapers found: blocked surface likely (anti-bot or interstitial). ' \
37
41
  'Retry with --strategy browserless, try a more specific public listing URL, ' \
@@ -44,6 +48,8 @@ module Html2rss
44
48
  'or use explicit selectors in a feed config.'
45
49
  }.freeze
46
50
 
51
+ # @param message [String, nil] custom error message override
52
+ # @param category [Symbol] no-scraper classification
47
53
  def initialize(message = nil, category: :unsupported_surface)
48
54
  validate_category!(category)
49
55
  @category = category
@@ -66,6 +72,12 @@ module Html2rss
66
72
  # Returns an array of scraper classes that claim to find articles in the parsed body.
67
73
  # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
68
74
  # @param opts [Hash] The options hash.
75
+ # @option opts [Hash] :wordpress_api scraper toggle and configuration
76
+ # @option opts [Hash] :schema scraper toggle and configuration
77
+ # @option opts [Hash] :microdata scraper toggle and configuration
78
+ # @option opts [Hash] :json_state scraper toggle and configuration
79
+ # @option opts [Hash] :semantic_html scraper toggle and configuration
80
+ # @option opts [Hash] :html scraper toggle and configuration
69
81
  # @return [Array<Class>] An array of scraper classes that can handle the parsed body.
70
82
  def self.from(parsed_body, opts = Html2rss::AutoSource::DEFAULT_CONFIG[:scraper])
71
83
  scrapers = SCRAPERS.select { |scraper| opts.dig(scraper.options_key, :enabled) }
@@ -81,6 +93,12 @@ module Html2rss
81
93
  # @param url [String, Html2rss::Url] The page url.
82
94
  # @param request_session [Html2rss::RequestSession, nil] Shared follow-up session.
83
95
  # @param opts [Hash] The options hash.
96
+ # @option opts [Hash] :wordpress_api scraper toggle and configuration
97
+ # @option opts [Hash] :schema scraper toggle and configuration
98
+ # @option opts [Hash] :microdata scraper toggle and configuration
99
+ # @option opts [Hash] :json_state scraper toggle and configuration
100
+ # @option opts [Hash] :semantic_html scraper toggle and configuration
101
+ # @option opts [Hash] :html scraper toggle and configuration
84
102
  # @return [Array<Object>] An array of scraper instances that can handle the parsed body.
85
103
  #
86
104
  # `instances_for` is the main entrypoint for extraction. It lets a scraper
@@ -17,6 +17,7 @@ module Html2rss
17
17
  # @see Html2rss::AutoSource::Scraper::SemanticHtml
18
18
  # @see Html2rss::AutoSource::Scraper::Html
19
19
  class AutoSource
20
+ # Default auto-source configuration shipped for scraper and cleanup behavior.
20
21
  DEFAULT_CONFIG = {
21
22
  scraper: {
22
23
  wordpress_api: {
@@ -67,6 +68,7 @@ module Html2rss
67
68
  end.freeze
68
69
  private_constant :SCRAPER_CONFIG
69
70
 
71
+ # Runtime schema used to validate auto-source config values.
70
72
  Config = Dry::Schema.Params do
71
73
  optional(:scraper).hash(&SCRAPER_CONFIG)
72
74
 
@@ -80,6 +82,8 @@ module Html2rss
80
82
  # @param response [Html2rss::RequestService::Response] initial page response
81
83
  # @param opts [Hash] validated auto-source options
82
84
  # @param request_session [Html2rss::RequestSession, nil] shared request session for follow-up fetches
85
+ # @option opts [Hash] :scraper scraper configuration map
86
+ # @option opts [Hash] :cleanup cleanup configuration map
83
87
  # @return [void]
84
88
  def initialize(response, opts = DEFAULT_CONFIG, request_session: nil)
85
89
  @parsed_body = response.parsed_body
@@ -7,6 +7,7 @@ module Html2rss
7
7
  # This module centralizes signature matching so request-time guards and
8
8
  # auto-source surface classification stay consistent.
9
9
  module BlockedSurface
10
+ # Known interstitial fingerprints used to detect blocked or anti-bot surfaces.
10
11
  INTERSTITIAL_SIGNATURES = [
11
12
  {
12
13
  key: :cloudflare_interstitial,
@@ -49,7 +49,7 @@ module Html2rss
49
49
  ##
50
50
  # Extracts categories from data attributes of a single element.
51
51
  #
52
- # @param element [Nokogiri::XML::Element] The element to process
52
+ # @param element [Nokogiri::XML::Element] metadata element that may contain category links
53
53
  # @return [Set<String>] Set of category strings
54
54
  def self.extract_element_data_categories(element)
55
55
  Set.new.tap do |categories|
@@ -65,7 +65,7 @@ module Html2rss
65
65
  ##
66
66
  # Extracts text-based categories from elements, splitting content into discrete values.
67
67
  #
68
- # @param element [Nokogiri::XML::Element] The element to process
68
+ # @param element [Nokogiri::XML::Element] metadata element whose text may contain delimiters
69
69
  # @return [Set<String>] Set of category strings
70
70
  def self.extract_text_categories(element)
71
71
  anchor_values = element.css('a').filter_map do |node|
data/lib/html2rss/cli.rb CHANGED
@@ -11,7 +11,23 @@ module Html2rss
11
11
  # The Html2rss command line interface.
12
12
  class CLI < Thor # rubocop:disable Metrics/ClassLength
13
13
  check_unknown_options!
14
+ # Ordered fallback chain attempted by auto strategy.
15
+ #
16
+ # @return [Array<Symbol>]
17
+ AUTO_FALLBACK_CHAIN = Html2rss::FeedPipeline::AutoFallback::CHAIN.freeze
18
+ # Supported CLI strategy option values.
19
+ #
20
+ # @return [Array<String>]
21
+ STRATEGY_OPTION_ENUM = (['auto'] + Html2rss::RequestService.strategy_names).uniq.freeze
22
+ # User-facing strategy help text that reflects the current fallback chain.
23
+ #
24
+ # @return [String]
25
+ STRATEGY_OPTION_DESC = [
26
+ 'Optional request strategy (defaults to auto; auto tries',
27
+ "#{AUTO_FALLBACK_CHAIN.join(' -> ')})"
28
+ ].join(' ').freeze
14
29
 
30
+ # @return [Boolean] whether Thor should terminate process on command failures
15
31
  def self.exit_on_failure?
16
32
  true
17
33
  end
@@ -24,14 +40,17 @@ module Html2rss
24
40
  default: {}
25
41
  method_option :strategy,
26
42
  type: :string,
27
- desc: 'The strategy to request the URL',
28
- enum: %w[faraday browserless]
43
+ desc: STRATEGY_OPTION_DESC,
44
+ enum: STRATEGY_OPTION_ENUM
29
45
  method_option :max_redirects,
30
46
  type: :numeric,
31
47
  desc: 'Maximum redirects to follow per request'
32
48
  method_option :max_requests,
33
49
  type: :numeric,
34
50
  desc: 'Maximum requests to allow for this feed build'
51
+ # @param yaml_file [String] path to YAML config
52
+ # @param feed_name [String, nil] optional named feed in multi-feed config
53
+ # @return [void]
35
54
  def feed(yaml_file, feed_name = nil)
36
55
  config = Html2rss.config_from_yaml_file(yaml_file, feed_name)
37
56
  config[:params] = options[:params] || {}
@@ -43,8 +62,8 @@ module Html2rss
43
62
  desc 'auto [URL]', 'Automatically sources an RSS feed from the URL'
44
63
  method_option :strategy,
45
64
  type: :string,
46
- desc: 'The strategy to request the URL',
47
- enum: %w[faraday browserless]
65
+ desc: STRATEGY_OPTION_DESC,
66
+ enum: STRATEGY_OPTION_ENUM
48
67
  method_option :format,
49
68
  type: :string,
50
69
  desc: 'Output format for the auto-sourced feed',
@@ -57,6 +76,8 @@ module Html2rss
57
76
  method_option :max_requests,
58
77
  type: :numeric,
59
78
  desc: 'Maximum requests to allow for this feed build'
79
+ # @param url [String] source page URL for auto discovery
80
+ # @return [void]
60
81
  def auto(url) # rubocop:disable Metrics/MethodLength
61
82
  format = options.fetch(:format, 'rss')
62
83
  source_method = format == 'jsonfeed' ? Html2rss.method(:auto_json_feed) : Html2rss.method(:auto_source)
@@ -156,7 +177,7 @@ module Html2rss
156
177
  end
157
178
 
158
179
  def current_strategy
159
- options[:strategy]&.to_sym || :faraday
180
+ options[:strategy]&.to_sym || :auto
160
181
  end
161
182
 
162
183
  def current_max_redirects
@@ -186,7 +207,10 @@ module Html2rss
186
207
  'or increase request.max_requests in the config.'
187
208
  rescue Html2rss::RequestService::BrowserlessConfigurationError,
188
209
  Html2rss::RequestService::BrowserlessConnectionFailed,
189
- Html2rss::RequestService::BlockedSurfaceDetected => error
210
+ Html2rss::RequestService::BotasaurusConfigurationError,
211
+ Html2rss::RequestService::BotasaurusConnectionFailed,
212
+ Html2rss::RequestService::BlockedSurfaceDetected,
213
+ Html2rss::NoFeedItemsExtracted => error
190
214
  raise Thor::Error, error.message
191
215
  end
192
216
  end
@@ -5,12 +5,13 @@ module Html2rss
5
5
  ##
6
6
  # Public class-level helpers for loading, validating, and exporting config.
7
7
  module ClassMethods
8
+ # Sentinel to differentiate omitted params from explicit `nil`.
8
9
  UNSET = Object.new.freeze
9
10
 
10
11
  ##
11
12
  # Returns the exported JSON Schema for html2rss configuration.
12
13
  #
13
- # @return [Hash<String, Object>] JSON Schema represented as a Ruby hash
14
+ # @return [Hash{String => Object}] JSON Schema represented as a Ruby hash
14
15
  def json_schema
15
16
  Schema.json_schema
16
17
  end
@@ -27,15 +28,15 @@ module Html2rss
27
28
  ##
28
29
  # Validates a configuration hash with the runtime validator.
29
30
  #
30
- # @param config [Hash<Symbol, Object>] the configuration hash
31
- # @param params [Hash<Symbol, Object>, Hash<String, Object>, nil] dynamic parameters for string formatting
31
+ # @param config [Hash{Symbol => Object}] the configuration hash
32
+ # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
32
33
  # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
33
34
  def validate(config, params: UNSET)
34
35
  prepared_config = prepare_for_validation(resolve_effective_config(config, params:))
35
36
 
36
37
  Validator.new.call(prepared_config)
37
38
  rescue DynamicParams::ParamsMissing => error
38
- prepared_config = prepare_for_validation(deep_dup(config))
39
+ prepared_config = prepare_for_validation(HashUtil.deep_symbolize_keys(config, context: 'config'))
39
40
  prepared_config[:dynamic_params_error] = error.message
40
41
 
41
42
  Validator.new.call(prepared_config)
@@ -55,7 +56,7 @@ module Html2rss
55
56
  # @param file [String] the YAML file to load
56
57
  # @param feed_name [String, nil] optional feed name for multi-feed files
57
58
  # @param multiple_feeds_key [Symbol] key under which multiple feeds are defined
58
- # @param params [Hash<Symbol, Object>, Hash<String, Object>, nil] dynamic parameters for string formatting
59
+ # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
59
60
  # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
60
61
  def validate_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS, params: UNSET)
61
62
  validate(load_yaml(file, feed_name, multiple_feeds_key:), params:)
@@ -69,7 +70,7 @@ module Html2rss
69
70
  # @param file [String] the YAML file to load.
70
71
  # @param feed_name [String, nil] the feed name when using multiple feeds.
71
72
  # @param multiple_feeds_key [Symbol] the key under which multiple feeds are defined.
72
- # @return [Hash<Symbol, Object>] the configuration hash.
73
+ # @return [Hash{Symbol => Object}] the configuration hash.
73
74
  # @raise [ArgumentError] if the file doesn't exist or feed is not found.
74
75
  # rubocop:disable Metrics/MethodLength
75
76
  def load_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS)
@@ -97,8 +98,8 @@ module Html2rss
97
98
  # Processes the provided configuration hash, applying dynamic parameters if given,
98
99
  # and returns a new configuration object.
99
100
  #
100
- # @param config [Hash<Symbol, Object>] the configuration hash.
101
- # @param params [Hash<Symbol, Object>, Hash<String, Object>, nil] dynamic parameters for string formatting.
101
+ # @param config [Hash{Symbol => Object}] the configuration hash.
102
+ # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting.
102
103
  # @return [Html2rss::Config] the configuration object.
103
104
  def from_hash(config, params: UNSET)
104
105
  new(resolve_effective_config(config, params:))
@@ -110,7 +111,7 @@ module Html2rss
110
111
  # @param url [String] source page URL
111
112
  # @param items_selector [String, nil] optional selector hint for item extraction
112
113
  # @param request_controls [Html2rss::RequestControls, nil] explicit request controls to write
113
- # @return [Hash<Symbol, Object>] feed config hash ready for {from_hash}
114
+ # @return [Hash{Symbol => Object}] feed config hash ready for {from_hash}
114
115
  def auto_source_config(url:, items_selector: nil, request_controls: nil)
115
116
  config = {
116
117
  channel: default_config[:channel].merge(url:),
@@ -127,10 +128,10 @@ module Html2rss
127
128
  ##
128
129
  # Provides a default configuration.
129
130
  #
130
- # @return [Hash<Symbol, Object>] a hash with default configuration values.
131
+ # @return [Hash{Symbol => Object}] a hash with default configuration values.
131
132
  def default_config
132
133
  {
133
- strategy: RequestService.default_strategy_name,
134
+ strategy: default_strategy_name,
134
135
  request: {
135
136
  max_redirects: RequestService::Policy::DEFAULTS[:max_redirects],
136
137
  max_requests: RequestService::Policy::DEFAULTS[:max_requests]
@@ -141,15 +142,22 @@ module Html2rss
141
142
  }
142
143
  end
143
144
 
145
+ # @return [Symbol] the default strategy for feed orchestration
146
+ def default_strategy_name
147
+ :auto
148
+ end
149
+
144
150
  private
145
151
 
146
152
  def resolve_effective_config(config, params:)
147
- effective_config = deep_dup(config)
153
+ effective_config = HashUtil.deep_symbolize_keys(config, context: 'config')
148
154
  resolved_params = parameter_defaults(effective_config)
149
- resolved_params.merge!(params) unless params.equal?(UNSET) || params.nil?
155
+ unless params.equal?(UNSET) || params.nil?
156
+ resolved_params.merge!(HashUtil.deep_symbolize_keys(params, context: 'params'))
157
+ end
150
158
 
151
- DynamicParams.call(effective_config[:headers], resolved_params)
152
- DynamicParams.call(effective_config[:channel], resolved_params)
159
+ effective_config[:headers] = DynamicParams.call(effective_config[:headers], resolved_params)
160
+ effective_config[:channel] = DynamicParams.call(effective_config[:channel], resolved_params)
153
161
 
154
162
  effective_config
155
163
  end
@@ -163,27 +171,8 @@ module Html2rss
163
171
  end
164
172
 
165
173
  def prepare_for_validation(config)
166
- Config::Preparer.new.call(deep_dup(config))
167
- end
168
-
169
- # rubocop:disable Metrics/MethodLength
170
- def deep_dup(object)
171
- case object
172
- when Hash
173
- object.transform_values do |value|
174
- deep_dup(value)
175
- end
176
- when Array
177
- object.map { |value| deep_dup(value) }
178
- else
179
- begin
180
- object.dup
181
- rescue TypeError
182
- object
183
- end
184
- end
174
+ Config::Preparer.new.call(HashUtil.deep_dup(config))
185
175
  end
186
- # rubocop:enable Metrics/MethodLength
187
176
  end
188
177
  end
189
178
  end
@@ -4,13 +4,14 @@ module Html2rss
4
4
  class Config
5
5
  # Processes and applies dynamic parameter formatting in configuration values.
6
6
  class DynamicParams
7
+ # Raised when string interpolation references an unavailable parameter.
7
8
  class ParamsMissing < Html2rss::Error; end
8
9
 
9
10
  class << self
10
11
  # Recursively traverses the given value and formats any strings containing
11
12
  # placeholders with values from the provided params.
12
13
  #
13
- # @param value [String, Hash, Enumerable, Object] The value to process.
14
+ # @param value [String, Hash, Enumerable, Object] value that may contain parameter placeholders
14
15
  # @param params [Hash] The parameters for substitution.
15
16
  # @param getter [Proc, nil] Optional proc to retrieve a key's value.
16
17
  # @param replace_missing_with [Object, nil] Value to substitute if a key is missing.
@@ -55,12 +56,13 @@ module Html2rss
55
56
  end
56
57
 
57
58
  def from_hash(hash, params, getter:, replace_missing_with:)
58
- hash.transform_keys!(&:to_sym)
59
- hash.transform_values! { |value| call(value, params, getter:, replace_missing_with:) }
59
+ HashUtil.deep_symbolize_keys(hash, context: 'dynamic params hash').to_h do |key, value|
60
+ [key, call(value, params, getter:, replace_missing_with:)]
61
+ end
60
62
  end
61
63
 
62
64
  def from_enumerable(enumerable, params, getter:, replace_missing_with:)
63
- enumerable.map! { |value| call(value, params, getter:, replace_missing_with:) }
65
+ enumerable.map { |value| call(value, params, getter:, replace_missing_with:) }
64
66
  end
65
67
  end
66
68
  end
@@ -7,6 +7,7 @@ module Html2rss
7
7
  # where each feed name is the key for its feed configuration.
8
8
  # All global configuration keys (outside :feeds) are merged into each feed's settings.
9
9
  class MultipleFeedsConfig
10
+ # Reserved YAML key under which multiple named feed configs are defined.
10
11
  CONFIG_KEY_FEEDS = :feeds
11
12
 
12
13
  class << self
@@ -37,11 +38,11 @@ module Html2rss
37
38
  local_value = config[key]
38
39
  case local_value
39
40
  when Hash
40
- global_value.is_a?(Hash) ? global_value.merge(local_value) : local_value
41
+ global_value.is_a?(Hash) ? HashUtil.deep_merge(global_value, local_value) : local_value
41
42
  when Array
42
43
  global_value.is_a?(Array) ? global_value + local_value : local_value
43
44
  else
44
- global_value
45
+ local_value.nil? ? global_value : local_value
45
46
  end
46
47
  end
47
48
  end