html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,240 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Config
5
+ ##
6
+ # Builds the exported configuration JSON Schema from the runtime validators.
7
+ module Schema
8
+ module_function
9
+
10
+ # Canonical filename for the exported config JSON schema artifact.
11
+ SCHEMA_FILENAME = 'html2rss-config.schema.json'
12
+
13
+ ##
14
+ # Returns the exported configuration JSON Schema.
15
+ #
16
+ # @return [Hash{String => Object}] JSON Schema represented as a Ruby hash
17
+ def json_schema
18
+ load_json_schema_extension!
19
+ Builder.call
20
+ end
21
+
22
+ ##
23
+ # Resolves the packaged schema path used by downstream tools.
24
+ #
25
+ # @return [String] absolute path to the packaged JSON schema file
26
+ def path
27
+ search_path = File.expand_path(__dir__)
28
+
29
+ loop do
30
+ candidate = File.join(search_path, 'schema', SCHEMA_FILENAME)
31
+ return candidate if File.exist?(candidate)
32
+
33
+ parent_path = File.dirname(search_path)
34
+ break if parent_path == search_path
35
+
36
+ search_path = parent_path
37
+ end
38
+
39
+ File.expand_path("../../../schema/#{SCHEMA_FILENAME}", __dir__)
40
+ end
41
+
42
+ # @return [void]
43
+ def load_json_schema_extension!
44
+ require 'dry/schema/extensions/json_schema'
45
+ Dry::Schema.load_extensions(:json_schema)
46
+ end
47
+
48
+ ##
49
+ # Orchestrates schema assembly from runtime validator contracts plus
50
+ # client-facing overlays.
51
+ class Builder
52
+ class << self
53
+ # @return [Hash{String => Object}] fully assembled JSON schema hash
54
+ def call
55
+ new.call
56
+ end
57
+ end
58
+
59
+ # @return [Hash{String => Object}] fully assembled JSON schema hash
60
+ def call
61
+ schema = validator_schema
62
+ apply_top_level(schema)
63
+ assign_properties(schema.fetch(:properties))
64
+ DeepStringifier.call(schema)
65
+ end
66
+
67
+ private
68
+
69
+ def validator_schema
70
+ Html2rss::Config::Validator.new.schema.json_schema(loose: true)
71
+ end
72
+
73
+ def apply_top_level(schema)
74
+ schema['$schema'] = 'https://json-schema.org/draft/2020-12/schema'
75
+ schema[:anyOf] = [
76
+ { 'required' => ['selectors'] },
77
+ { 'required' => ['auto_source'] }
78
+ ]
79
+ end
80
+
81
+ def assign_properties(properties)
82
+ properties.merge!(
83
+ strategy: Components.strategy,
84
+ headers: Components.headers,
85
+ stylesheets: Components.stylesheets,
86
+ auto_source: Components.auto_source,
87
+ selectors: Components.selectors
88
+ )
89
+ properties.delete(:dynamic_params_error)
90
+ end
91
+ end
92
+
93
+ ##
94
+ # Exposes schema fragments that populate the top-level configuration schema.
95
+ module Components
96
+ module_function
97
+
98
+ # @return [Hash{Symbol => Object}] schema fragment for strategy selection
99
+ def strategy
100
+ {
101
+ type: 'string',
102
+ not: { type: 'null' }
103
+ }
104
+ end
105
+
106
+ # @return [Hash{Symbol => Object}] schema fragment for headers
107
+ def headers
108
+ {
109
+ type: 'object',
110
+ description: 'HTTP headers applied to every request.',
111
+ additionalProperties: { type: 'string' }
112
+ }
113
+ end
114
+
115
+ # @return [Hash{Symbol => Object}] schema fragment for stylesheet definitions
116
+ def stylesheets
117
+ {
118
+ type: 'array',
119
+ description: 'Collection of stylesheets to attach to the RSS feed.',
120
+ items: Html2rss::Config::Validator::StylesheetConfig.json_schema(loose: true)
121
+ }
122
+ end
123
+
124
+ # @return [Hash{Symbol => Object}] schema fragment for auto_source configuration
125
+ def auto_source
126
+ schema = Html2rss::AutoSource::Config.json_schema(loose: true)
127
+ schema[:default] = DeepStringifier.call(Html2rss::AutoSource::DEFAULT_CONFIG)
128
+ schema
129
+ end
130
+
131
+ # @return [Hash{Symbol => Object}] schema fragment for selectors configuration
132
+ def selectors
133
+ Selectors.schema
134
+ end
135
+ end
136
+
137
+ ##
138
+ # Provides schema fragments that document selector configuration.
139
+ module Selectors
140
+ module_function
141
+
142
+ # Pattern used for dynamic selector keys excluding reserved selector names.
143
+ RESERVED_SELECTOR_PATTERN = '^(?!items$|enclosure$|guid$|categories$).+$'
144
+
145
+ # @return [Hash{Symbol => Object}] schema fragment for selectors root object
146
+ def schema
147
+ {
148
+ type: 'object',
149
+ description: 'Selectors used to extract article attributes.',
150
+ properties: selector_properties,
151
+ patternProperties: pattern_properties,
152
+ additionalProperties: true
153
+ }
154
+ end
155
+
156
+ # rubocop:disable Layout/LineLength
157
+ # @return [Hash{Symbol => Object}] schema map for reserved selector properties
158
+ def selector_properties
159
+ {
160
+ items: items_schema,
161
+ enclosure: enclosure_schema,
162
+ guid: reference_array('List of selector keys used to build the GUID. Each entry must reference a sibling selector key; runtime validation enforces those references.'),
163
+ categories: reference_array('List of selector keys whose values will be used as categories. Each entry must reference a sibling selector key; runtime validation enforces those references.')
164
+ }
165
+ end
166
+ # rubocop:enable Layout/LineLength
167
+
168
+ # @return [Hash{String => Object}] schema map for dynamic selector keys
169
+ def pattern_properties
170
+ { RESERVED_SELECTOR_PATTERN => dynamic_selector_schema }
171
+ end
172
+
173
+ # @return [Hash{Symbol => Object}] schema fragment for dynamic selector entries
174
+ def dynamic_selector_schema
175
+ Html2rss::Selectors::Config::Selector.new.schema.json_schema(loose: true).merge(
176
+ description: 'Dynamic selector definition keyed by attribute name.'
177
+ )
178
+ end
179
+
180
+ # @return [Hash{Symbol => Object}] schema fragment for `items` selector configuration
181
+ def items_schema
182
+ Html2rss::Selectors::Config::Items.new.schema.json_schema(loose: true).merge(
183
+ description: 'Defines the items selector and optional enhancement settings.'
184
+ )
185
+ end
186
+
187
+ # @return [Hash{Symbol => Object}] schema fragment for `enclosure` selector configuration
188
+ def enclosure_schema
189
+ Html2rss::Selectors::Config::Enclosure.new.schema.json_schema(loose: true).merge(
190
+ description: 'Describes enclosure extraction settings.'
191
+ )
192
+ end
193
+
194
+ # JSON Schema can enforce non-empty reference arrays, while runtime
195
+ # validation remains authoritative for checking that each entry points
196
+ # to an existing sibling selector key.
197
+ # @param description [String] human-readable description for the reference field
198
+ # @return [Hash{Symbol => Object}] JSON schema fragment for selector references
199
+ def reference_array(description)
200
+ {
201
+ type: 'array',
202
+ description:,
203
+ minItems: 1,
204
+ items: {
205
+ type: 'string',
206
+ description: 'Selector key defined elsewhere in this object.'
207
+ }
208
+ }
209
+ end
210
+ end
211
+
212
+ ##
213
+ # Converts nested hash keys to strings so the resulting schema serializes cleanly.
214
+ module DeepStringifier
215
+ module_function
216
+
217
+ # @param object [Hash, Array, Object] nested data to normalize
218
+ # @return [Hash, Array, Object] deep copy with stringified hash keys
219
+ def call(object)
220
+ case object
221
+ when Hash
222
+ stringify_hash(object)
223
+ when Array
224
+ object.map { |value| call(value) }
225
+ when Symbol
226
+ object.to_s
227
+ else
228
+ object
229
+ end
230
+ end
231
+
232
+ # @param object [Hash{Object => Object}] hash whose keys should become strings
233
+ # @return [Hash{String => Object}] hash with recursively normalized values
234
+ def stringify_hash(object)
235
+ object.to_h { |key, value| [key.to_s, call(value)] }
236
+ end
237
+ end
238
+ end
239
+ end
240
+ end
@@ -0,0 +1,146 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'dry-validation'
4
+
5
+ module Html2rss
6
+ class Config
7
+ # Validates the configuration hash using Dry::Validation.
8
+ # The configuration options adhere to the documented schema in README.md.
9
+ class Validator < Dry::Validation::Contract # rubocop:disable Metrics/ClassLength
10
+ # URI format used for channel URL validation.
11
+ URI_REGEXP = Url::URI_REGEXP
12
+ # Allowed stylesheet MIME types.
13
+ STYLESHEET_TYPES = RssBuilder::Stylesheet::TYPES
14
+ # Optional language/region format (`en` or `en-US`).
15
+ LANGUAGE_FORMAT_REGEX = /\A[a-z]{2}(-[A-Z]{2})?\z/
16
+ # Baseline strategy enum exported in static schema artifacts.
17
+ BASE_STRATEGY_OPTIONS = ([:auto] + Html2rss::RequestService.strategy_names.map(&:to_sym)).uniq.freeze
18
+
19
+ # Contract for the top-level `channel` section.
20
+ ChannelConfig = Dry::Schema.Params do
21
+ required(:url).filled(:string, format?: URI_REGEXP)
22
+ optional(:title).maybe(:string)
23
+ optional(:description).maybe(:string)
24
+ optional(:language).maybe(:string, format?: LANGUAGE_FORMAT_REGEX)
25
+ optional(:ttl).maybe(:integer, gt?: 0)
26
+ optional(:time_zone).maybe(:string)
27
+ end
28
+
29
+ # Contract for a stylesheet entry in `stylesheets`.
30
+ StylesheetConfig = Dry::Schema.Params do
31
+ required(:href).filled(:string)
32
+ required(:type).filled(:string, included_in?: STYLESHEET_TYPES)
33
+ optional(:media).maybe(:string)
34
+ end
35
+
36
+ # Contract for Browserless click-preload options.
37
+ BrowserlessPreloadClickSelectorConfig = Dry::Schema.Params do
38
+ required(:selector).filled(:string)
39
+ optional(:max_clicks).filled(:integer, gt?: 0)
40
+ optional(:wait_after_ms).filled(:integer, gteq?: 0)
41
+ end
42
+
43
+ # Contract for Browserless scroll-preload options.
44
+ BrowserlessPreloadScrollConfig = Dry::Schema.Params do
45
+ optional(:iterations).filled(:integer, gt?: 0)
46
+ optional(:wait_after_ms).filled(:integer, gteq?: 0)
47
+ end
48
+
49
+ # Contract for Browserless preload orchestration options.
50
+ BrowserlessPreloadConfig = Dry::Schema.Params do
51
+ optional(:wait_after_ms).filled(:integer, gteq?: 0)
52
+ optional(:click_selectors).array(BrowserlessPreloadClickSelectorConfig)
53
+ optional(:scroll_down).hash(BrowserlessPreloadScrollConfig)
54
+ end
55
+
56
+ # Contract for Browserless-specific request options.
57
+ BrowserlessRequestConfig = Dry::Schema.Params do
58
+ optional(:preload).hash(BrowserlessPreloadConfig)
59
+ end
60
+
61
+ # Contract for Botasaurus-specific request options.
62
+ BotasaurusRequestConfig = Dry::Schema.Params do
63
+ config.validate_keys = true
64
+
65
+ optional(:navigation_mode).filled(:string, included_in?: %w[auto get google_get google_get_bypass])
66
+ optional(:max_retries).filled(:integer, gteq?: 0, lteq?: 3)
67
+ optional(:wait_for_selector).maybe(:string)
68
+ optional(:wait_timeout_seconds).filled(:integer, gt?: 0)
69
+ optional(:block_images).filled(:bool)
70
+ optional(:block_images_and_css).filled(:bool)
71
+ optional(:wait_for_complete_page_load).filled(:bool)
72
+ optional(:headless).filled(:bool)
73
+ optional(:proxy).filled(:string)
74
+ optional(:user_agent).filled(:string)
75
+ optional(:window_size).value(:array, min_size?: 2, max_size?: 2).each(:integer, gt?: 0)
76
+ optional(:lang).filled(:string)
77
+ end
78
+
79
+ # Contract for the top-level `request` section.
80
+ RequestConfig = Dry::Schema.Params do
81
+ optional(:max_redirects).filled(:integer, gteq?: 0)
82
+ optional(:max_requests).filled(:integer, gt?: 0)
83
+ optional(:browserless).hash(BrowserlessRequestConfig)
84
+ optional(:botasaurus).hash(BotasaurusRequestConfig)
85
+ end
86
+
87
+ params do
88
+ optional(:strategy).filled(:symbol)
89
+ required(:channel).hash(ChannelConfig)
90
+ optional(:headers).hash
91
+ optional(:stylesheets).array(StylesheetConfig)
92
+ optional(:auto_source).hash(AutoSource::Config)
93
+ optional(:selectors).hash
94
+ optional(:dynamic_params_error).maybe(:string)
95
+ optional(:request).hash(RequestConfig)
96
+ end
97
+
98
+ rule(:headers) do
99
+ value&.each do |key, header_value|
100
+ unless header_value.is_a?(String)
101
+ key([:headers, key]).failure("must be a String, but got #{header_value.class}")
102
+ end
103
+ end
104
+ end
105
+
106
+ rule(:dynamic_params_error) do
107
+ base.failure(value) if value
108
+ end
109
+
110
+ rule(:strategy) do
111
+ next if value.nil?
112
+ next if value == :auto || Html2rss::RequestService.strategy_registered?(value)
113
+
114
+ key.failure("must be one of: #{BASE_STRATEGY_OPTIONS.join(', ')}")
115
+ end
116
+
117
+ # Ensure at least one of :selectors or :auto_source is present.
118
+ rule(:selectors, :auto_source) do
119
+ unless values.key?(:selectors) || values.key?(:auto_source)
120
+ base.failure("Configuration must include at least 'selectors' or 'auto_source'")
121
+ end
122
+ end
123
+
124
+ rule(:selectors) do
125
+ next unless value
126
+
127
+ errors = Html2rss::Selectors::Config.call(value).errors
128
+ errors.each { |error| key(:selectors).failure(error.text) } unless errors.empty?
129
+ end
130
+
131
+ # URL validation delegated to Url class
132
+ rule(:channel) do
133
+ next unless values[:channel]&.key?(:url)
134
+
135
+ url_string = values[:channel][:url]
136
+ next if url_string.nil? || url_string.empty?
137
+
138
+ begin
139
+ Html2rss::Url.for_channel(url_string)
140
+ rescue ArgumentError => error
141
+ key(%i[channel url]).failure(error.message)
142
+ end
143
+ end
144
+ end
145
+ end
146
+ end
@@ -1,82 +1,139 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'forwardable'
3
+ require 'json'
4
+ require 'yaml'
4
5
 
5
6
  module Html2rss
6
7
  ##
7
- # The Config class abstracts from the config data structure and
8
- # provides default values.
8
+ # The provided configuration is used to generate the RSS feed.
9
+ # This class provides methods to load and process configuration from a YAML file,
10
+ # supporting both single and multiple feed configurations.
11
+ #
12
+ # Configuration is validated during initialization.
9
13
  class Config
10
- extend Forwardable
14
+ # Raised when a configuration hash fails runtime validation.
15
+ class InvalidConfig < Html2rss::Error; end
16
+ extend ClassMethods
11
17
 
12
18
  ##
13
- # The Error class to be thrown when a feed config requires params, but none
14
- # were passed to Config.
15
- class ParamsMissing < Html2rss::Error; end
16
-
17
- ##
18
- # Thrown when the feed config does not contain a value at `:channel`.
19
- class ChannelMissing < Html2rss::Error; end
20
-
21
- def_delegator :@channel, :author
22
- def_delegator :@channel, :ttl
23
- def_delegator :@channel, :title
24
- def_delegator :@channel, :language
25
- def_delegator :@channel, :description
26
- def_delegator :@channel, :url
27
- def_delegator :@channel, :url, :link
28
- def_delegator :@channel, :time_zone
29
- def_delegator :@channel, :json?
30
- def_delegator :@channel, :strategy
31
-
32
- def_delegator :@selectors, :item_selector_names
33
- def_delegator :@selectors, :selector?
34
- def_delegator :@selectors, :category_selector_names
35
- def_delegator :@selectors, :guid_selector_names
36
- def_delegator :@selectors, :items_order
37
- def_delegator :@selectors, :selector_string
38
-
39
- ##
40
- # Initializes the Config object with feed configuration, global settings, and parameters.
19
+ # Initializes the configuration object.
41
20
  #
42
- # @param feed_config [Hash<Symbol, Object>] The configuration hash containing `:channel` and `:selectors`.
43
- # @param global [Hash<Symbol, Object>] Global settings hash.
44
- # @param params [Hash<Symbol, String>] Parameters hash.
45
- def initialize(feed_config, global = {}, params = {})
46
- channel_config = feed_config[:channel]
47
- raise ChannelMissing, 'Channel configuration is missing in feed_config' unless channel_config
48
-
49
- @channel = Channel.new(channel_config, params:)
50
- @selectors = Selectors.new(feed_config[:selectors])
51
- @global = global
21
+ # Processes deprecated attributes, applies default values, and validates the configuration.
22
+ #
23
+ # @param config [Hash{Symbol => Object}] the configuration hash.
24
+ # @raise [InvalidConfig] if the configuration fails validation.
25
+ def initialize(config)
26
+ @request_controls = RequestControls.from_config(config)
27
+ prepared_config = Preparer.new.call(config)
28
+ validated_config = validated_config_for(prepared_config)
29
+
30
+ @config = validated_config.freeze
31
+ @request_controls = request_controls.with_effective_values(
32
+ strategy: validated_config[:strategy],
33
+ max_redirects: validated_config.dig(:request, :max_redirects),
34
+ max_requests: validated_config.dig(:request, :max_requests)
35
+ )
52
36
  end
53
37
 
38
+ # @return [Symbol, nil] selected request strategy
39
+ def strategy = request_controls.strategy
40
+ # @return [Integer, nil] configured redirect budget
41
+ def max_redirects = request_controls.max_redirects
42
+ # @return [Integer, nil] configured request budget
43
+ def max_requests = request_controls.max_requests
44
+ # @return [Array<Hash>] stylesheet definitions
45
+ def stylesheets = config[:stylesheets]
46
+
54
47
  ##
55
- # Retrieves selector attributes merged with channel attributes.
56
- #
57
- # @param name [Symbol] Selector name.
58
- # @return [Hash<Symbol, Object>] Merged attributes hash.
59
- def selector_attributes_with_channel(name)
60
- @selectors.selector(name).to_h.merge(channel: @channel)
48
+ # @return [Boolean] whether max_requests was explicitly configured by the caller
49
+ def explicit_max_requests?
50
+ request_controls.explicit?(:max_requests)
61
51
  end
62
52
 
63
53
  ##
64
- # Retrieves headers merged from global settings and channel headers.
65
- #
66
- # @return [Hash] Merged headers hash.
67
- def headers
68
- @global.fetch(:headers, {}).merge(@channel.headers)
54
+ # @return [Html2rss::RequestControls] request controls with provenance
55
+ attr_reader :request_controls
56
+
57
+ # @return [Hash{String => String}] normalized HTTP headers
58
+ def headers = config[:headers]
59
+ # @return [Hash{Symbol => Object}] channel configuration
60
+ def channel = config[:channel]
61
+ # @return [String] source channel URL
62
+ def url = config.dig(:channel, :url)
63
+ # @return [String, nil] configured channel time zone
64
+ def time_zone = config.dig(:channel, :time_zone)
65
+
66
+ # @return [Hash{Symbol => Object}] request envelope configuration
67
+ def request = config[:request]
68
+
69
+ # @return [Hash{Symbol => Object}, nil] selectors configuration
70
+ def selectors = config[:selectors]
71
+ # @return [Hash{Symbol => Object}, nil] auto-source configuration
72
+ def auto_source = config[:auto_source]
73
+
74
+ private
75
+
76
+ attr_reader :config
77
+
78
+ # Normalizes raw config input before validation.
79
+ class Preparer
80
+ ##
81
+ # @param config [Hash{Symbol => Object}] raw config input
82
+ # @return [Hash{Symbol => Object}] config with defaults and deprecations applied
83
+ def call(config)
84
+ config = config.dup if config.frozen?
85
+
86
+ config = handle_deprecated_channel_attributes(config)
87
+ config = apply_default_config(config)
88
+ config = apply_default_selectors_config(config) if config[:selectors]
89
+ config = apply_default_auto_source_config(config) if config[:auto_source]
90
+
91
+ config
92
+ end
93
+
94
+ private
95
+
96
+ def handle_deprecated_channel_attributes(config)
97
+ { strategy: Config.default_strategy_name, headers: {} }.each_pair do |key, default_value|
98
+ if !config[key] && (value = config.dig(:channel, key))
99
+ Log.warn("The `channel.#{key}` key is deprecated. Please move the definition of `#{key}` to the top level.")
100
+ config[key] = value
101
+ end
102
+
103
+ config[key] ||= default_value
104
+ end
105
+
106
+ config
107
+ end
108
+
109
+ def apply_default_config(config)
110
+ HashUtil.deep_merge(Config.default_config, config)
111
+ end
112
+
113
+ def apply_default_selectors_config(config)
114
+ HashUtil.deep_merge({ selectors: Selectors::DEFAULT_CONFIG }, config)
115
+ end
116
+
117
+ def apply_default_auto_source_config(config)
118
+ HashUtil.deep_merge({ auto_source: Html2rss::AutoSource::DEFAULT_CONFIG }, config)
119
+ end
69
120
  end
70
121
 
71
- ##
72
- # Retrieves stylesheets from global settings.
73
- #
74
- # @return [Array<Stylesheet>] Array of Stylesheet structs.
75
- def stylesheets
76
- @global.fetch(:stylesheets, []).map { |attributes| Html2rss::RssBuilder::Stylesheet.new(**attributes) }
122
+ def validated_config_for(config)
123
+ validator = Validator.new.call(config)
124
+
125
+ raise InvalidConfig, "Invalid configuration: #{validator.errors.to_h}" unless validator.success?
126
+
127
+ normalized_headers(validator.to_h)
77
128
  end
78
129
 
79
- # Provides read-only access to the channel object.
80
- attr_reader :channel
130
+ def normalized_headers(validated_config)
131
+ validated_config[:headers] = RequestHeaders.normalize(
132
+ validated_config[:headers],
133
+ channel_language: validated_config.dig(:channel, :language),
134
+ url: validated_config.dig(:channel, :url)
135
+ )
136
+ validated_config
137
+ end
81
138
  end
82
139
  end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ # The Html2rss::Error base class.
5
+ class Error < StandardError; end
6
+
7
+ # Raised when auto fallback exhausts all concrete tiers and extractors find no feed items.
8
+ class NoFeedItemsExtracted < Error
9
+ ##
10
+ # @param attempts [Array<Hash{Symbol => Object}>] tier attempt diagnostics
11
+ def initialize(attempts:)
12
+ @attempts = attempts
13
+ super(build_message)
14
+ end
15
+
16
+ # @return [Array<Hash{Symbol => Object}>] tier attempt diagnostics
17
+ attr_reader :attempts
18
+
19
+ private
20
+
21
+ def build_message
22
+ summaries = attempts.map do |attempt|
23
+ details = attempt[:items_count].nil? ? "#{attempt[:error_class]} error" : "#{attempt[:items_count]} items"
24
+ "#{attempt[:strategy]} (#{details})"
25
+ end.join(', ')
26
+
27
+ "No feed items extracted after auto fallback across strategies: #{summaries}. " \
28
+ 'Try a more specific listing URL or provide explicit selectors.'
29
+ end
30
+ end
31
+ end