html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
data/lib/html2rss/cli.rb CHANGED
@@ -1,46 +1,217 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative '../html2rss'
3
+ require 'fileutils'
4
+ require 'json'
4
5
  require 'thor'
5
6
 
6
7
  ##
7
8
  # The Html2rss namespace / command line interface.
8
9
  module Html2rss
9
- Log = Logger.new($stderr)
10
-
11
10
  ##
12
11
  # The Html2rss command line interface.
13
- class CLI < Thor
12
+ class CLI < Thor # rubocop:disable Metrics/ClassLength
13
+ check_unknown_options!
14
+ # Ordered fallback chain attempted by auto strategy.
15
+ #
16
+ # @return [Array<Symbol>]
17
+ AUTO_FALLBACK_CHAIN = Html2rss::FeedPipeline::AutoFallback::CHAIN.freeze
18
+ # Supported CLI strategy option values.
19
+ #
20
+ # @return [Array<String>]
21
+ STRATEGY_OPTION_ENUM = (['auto'] + Html2rss::RequestService.strategy_names).uniq.freeze
22
+ # User-facing strategy help text that reflects the current fallback chain.
23
+ #
24
+ # @return [String]
25
+ STRATEGY_OPTION_DESC = [
26
+ 'Optional request strategy (defaults to auto; auto tries',
27
+ "#{AUTO_FALLBACK_CHAIN.join(' -> ')})"
28
+ ].join(' ').freeze
29
+
30
+ # @return [Boolean] whether Thor should terminate process on command failures
14
31
  def self.exit_on_failure?
15
32
  true
16
33
  end
17
34
 
18
- desc 'feed YAML_FILE [FEED_NAME] [param=value ...]', 'Print RSS built from the YAML_FILE file to stdout'
35
+ desc 'feed YAML_FILE [feed_name]', 'Print RSS built from the YAML_FILE file to stdout'
36
+ method_option :params,
37
+ type: :hash,
38
+ optional: true,
39
+ required: false,
40
+ default: {}
41
+ method_option :strategy,
42
+ type: :string,
43
+ desc: STRATEGY_OPTION_DESC,
44
+ enum: STRATEGY_OPTION_ENUM
45
+ method_option :max_redirects,
46
+ type: :numeric,
47
+ desc: 'Maximum redirects to follow per request'
48
+ method_option :max_requests,
49
+ type: :numeric,
50
+ desc: 'Maximum requests to allow for this feed build'
51
+ # @param yaml_file [String] path to YAML config
52
+ # @param feed_name [String, nil] optional named feed in multi-feed config
53
+ # @return [void]
54
+ def feed(yaml_file, feed_name = nil)
55
+ config = Html2rss.config_from_yaml_file(yaml_file, feed_name)
56
+ config[:params] = options[:params] || {}
57
+ apply_runtime_request_overrides!(config)
58
+
59
+ puts(execute_feed { Html2rss.feed(config) })
60
+ end
61
+
62
+ desc 'auto [URL]', 'Automatically sources an RSS feed from the URL'
63
+ method_option :strategy,
64
+ type: :string,
65
+ desc: STRATEGY_OPTION_DESC,
66
+ enum: STRATEGY_OPTION_ENUM
67
+ method_option :format,
68
+ type: :string,
69
+ desc: 'Output format for the auto-sourced feed',
70
+ enum: %w[rss jsonfeed],
71
+ default: 'rss'
72
+ method_option :items_selector, type: :string, desc: 'CSS selector for items (will be enhanced) (optional)'
73
+ method_option :max_redirects,
74
+ type: :numeric,
75
+ desc: 'Maximum redirects to follow per request'
76
+ method_option :max_requests,
77
+ type: :numeric,
78
+ desc: 'Maximum requests to allow for this feed build'
79
+ # @param url [String] source page URL for auto discovery
80
+ # @return [void]
81
+ def auto(url) # rubocop:disable Metrics/MethodLength
82
+ format = options.fetch(:format, 'rss')
83
+ source_method = format == 'jsonfeed' ? Html2rss.method(:auto_json_feed) : Html2rss.method(:auto_source)
84
+
85
+ result = execute_feed do
86
+ source_method.call(
87
+ url,
88
+ strategy: current_strategy,
89
+ items_selector: options[:items_selector],
90
+ max_redirects: options[:max_redirects],
91
+ max_requests: options[:max_requests]
92
+ )
93
+ end
94
+
95
+ puts(format == 'jsonfeed' ? JSON.pretty_generate(result) : result)
96
+ end
97
+
98
+ desc 'schema', 'Print the exported config JSON Schema'
99
+ method_option :pretty,
100
+ type: :boolean,
101
+ desc: 'Pretty-print the schema JSON',
102
+ default: true
103
+ method_option :write,
104
+ type: :string,
105
+ desc: 'Write the schema JSON to the given file path'
19
106
  ##
20
- # Prints the feed to STDOUT.
107
+ # Prints or writes the exported configuration JSON Schema.
21
108
  #
22
- # @param yaml_file [String] Path to the YAML configuration file.
23
- # @param options [Array<String>] Additional options including feed name and parameters.
24
- # @return [nil]
25
- def feed(yaml_file, *options)
26
- raise "File '#{yaml_file}' does not exist" unless File.exist?(yaml_file)
109
+ # @return [void]
110
+ def schema
111
+ schema_json = Html2rss::Config.json_schema_json(pretty: options.fetch(:pretty, true))
27
112
 
28
- feed_name = options.shift unless options.first&.include?('=')
29
- params = options.to_h { |opt| opt.split('=', 2) }
113
+ if options[:write]
114
+ FileUtils.mkdir_p(File.dirname(options[:write]))
115
+ File.write(options[:write], "#{schema_json}\n")
116
+ puts options[:write]
117
+ return
118
+ end
30
119
 
31
- puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
120
+ puts schema_json
32
121
  end
33
122
 
34
- desc 'auto URL', 'Automatically sources an RSS feed from the URL'
35
- method_option :strategy,
36
- type: :string,
37
- desc: 'The strategy to request the URL',
38
- enum: RequestService.strategy_names,
39
- default: RequestService.default_strategy_name
40
- def auto(url)
41
- strategy = options.fetch(:strategy) { RequestService.default_strategy_name }.to_sym
123
+ desc 'validate YAML_FILE [feed_name]', 'Validate a YAML config with the runtime validator'
124
+ method_option :params,
125
+ type: :hash,
126
+ optional: true,
127
+ required: false,
128
+ default: {}
129
+ ##
130
+ # Validates a YAML config and prints the result.
131
+ #
132
+ # @param yaml_file [String] the YAML file to validate
133
+ # @param feed_name [String, nil] optional feed name for multi-feed files
134
+ # @return [void]
135
+ def validate(yaml_file, feed_name = nil)
136
+ result = Html2rss::Config.validate_yaml(yaml_file, feed_name, params: options[:params] || {})
137
+
138
+ raise Thor::Error, "Invalid configuration: #{result.errors.to_h}" unless result.success?
139
+
140
+ puts 'Configuration is valid'
141
+ end
142
+
143
+ private
144
+
145
+ def apply_runtime_request_overrides!(config)
146
+ clear_blank_request_overrides!(config)
147
+ request_controls.apply_to(config)
148
+ end
149
+
150
+ def clear_blank_request_overrides!(config)
151
+ config.delete(:strategy) if config[:strategy].nil?
152
+
153
+ request_config = config[:request]
154
+ return unless request_config.is_a?(Hash)
155
+
156
+ %i[max_redirects max_requests].each do |key|
157
+ request_config.delete(key) if request_config[key].nil?
158
+ end
159
+ config.delete(:request) if request_config.empty?
160
+ end
161
+
162
+ def request_controls
163
+ Html2rss::RequestControls.new(
164
+ strategy: options[:strategy]&.to_sym,
165
+ max_redirects: options[:max_redirects],
166
+ max_requests: options[:max_requests],
167
+ explicit_keys: explicit_request_control_keys
168
+ )
169
+ end
170
+
171
+ def explicit_request_control_keys
172
+ keys = []
173
+ keys << :strategy if options[:strategy]
174
+ keys << :max_redirects unless options[:max_redirects].nil?
175
+ keys << :max_requests unless options[:max_requests].nil?
176
+ keys
177
+ end
178
+
179
+ def current_strategy
180
+ options[:strategy]&.to_sym || :auto
181
+ end
182
+
183
+ def current_max_redirects
184
+ options.fetch(:max_redirects, Html2rss::RequestService::Policy::DEFAULTS[:max_redirects])
185
+ end
186
+
187
+ def current_max_requests
188
+ options.fetch(:max_requests, Html2rss::RequestService::Policy::DEFAULTS[:max_requests])
189
+ end
190
+
191
+ def suggested_max_redirects
192
+ current_max_redirects + 1
193
+ end
194
+
195
+ def suggested_max_requests
196
+ current_max_requests + 1
197
+ end
42
198
 
43
- puts Html2rss.auto_source(url, strategy:)
199
+ def execute_feed # rubocop:disable Metrics/MethodLength
200
+ yield
201
+ rescue Faraday::FollowRedirects::RedirectLimitReached => error
202
+ raise Thor::Error,
203
+ "#{error.message}. retry with --max-redirects #{suggested_max_redirects} or use the final URL directly."
204
+ rescue Html2rss::RequestService::RequestBudgetExceeded => error
205
+ raise Thor::Error,
206
+ "#{error.message}. retry with --max-requests #{suggested_max_requests} " \
207
+ 'or increase request.max_requests in the config.'
208
+ rescue Html2rss::RequestService::BrowserlessConfigurationError,
209
+ Html2rss::RequestService::BrowserlessConnectionFailed,
210
+ Html2rss::RequestService::BotasaurusConfigurationError,
211
+ Html2rss::RequestService::BotasaurusConnectionFailed,
212
+ Html2rss::RequestService::BlockedSurfaceDetected,
213
+ Html2rss::NoFeedItemsExtracted => error
214
+ raise Thor::Error, error.message
44
215
  end
45
216
  end
46
217
  end
data/lib/html2rss/config/class_methods.rb ADDED
@@ -0,0 +1,178 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Config
5
+ ##
6
+ # Public class-level helpers for loading, validating, and exporting config.
7
+ module ClassMethods
8
+ # Sentinel to differentiate omitted params from explicit `nil`.
9
+ UNSET = Object.new.freeze
10
+
11
+ ##
12
+ # Returns the exported JSON Schema for html2rss configuration.
13
+ #
14
+ # @return [Hash{String => Object}] JSON Schema represented as a Ruby hash
15
+ def json_schema
16
+ Schema.json_schema
17
+ end
18
+
19
+ ##
20
+ # Returns the exported JSON Schema as JSON.
21
+ #
22
+ # @param pretty [Boolean] whether to pretty-print the JSON output
23
+ # @return [String] serialized JSON Schema
24
+ def json_schema_json(pretty: true)
25
+ pretty ? JSON.pretty_generate(json_schema) : JSON.generate(json_schema)
26
+ end
27
+
28
+ ##
29
+ # Validates a configuration hash with the runtime validator.
30
+ #
31
+ # @param config [Hash{Symbol => Object}] the configuration hash
32
+ # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
33
+ # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
34
+ def validate(config, params: UNSET)
35
+ prepared_config = prepare_for_validation(resolve_effective_config(config, params:))
36
+
37
+ Validator.new.call(prepared_config)
38
+ rescue DynamicParams::ParamsMissing => error
39
+ prepared_config = prepare_for_validation(HashUtil.deep_symbolize_keys(config, context: 'config'))
40
+ prepared_config[:dynamic_params_error] = error.message
41
+
42
+ Validator.new.call(prepared_config)
43
+ end
44
+
45
+ ##
46
+ # Returns the packaged JSON Schema file path.
47
+ #
48
+ # @return [String] absolute path to the packaged JSON Schema file
49
+ def schema_path
50
+ Schema.path
51
+ end
52
+
53
+ ##
54
+ # Loads and validates a YAML configuration file.
55
+ #
56
+ # @param file [String] the YAML file to load
57
+ # @param feed_name [String, nil] optional feed name for multi-feed files
58
+ # @param multiple_feeds_key [Symbol] key under which multiple feeds are defined
59
+ # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
60
+ # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
61
+ def validate_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS, params: UNSET)
62
+ validate(load_yaml(file, feed_name, multiple_feeds_key:), params:)
63
+ end
64
+
65
+ ##
66
+ # Loads the feed configuration from a YAML file.
67
+ #
68
+ # Supports multiple feeds defined under the specified key (default :feeds).
69
+ #
70
+ # @param file [String] the YAML file to load.
71
+ # @param feed_name [String, nil] the feed name when using multiple feeds.
72
+ # @param multiple_feeds_key [Symbol] the key under which multiple feeds are defined.
73
+ # @return [Hash{Symbol => Object}] the configuration hash.
74
+ # @raise [ArgumentError] if the file doesn't exist or feed is not found.
75
+ # rubocop:disable Metrics/MethodLength
76
+ def load_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS)
77
+ raise ArgumentError, "File '#{file}' does not exist" unless File.exist?(file)
78
+ raise ArgumentError, "`#{multiple_feeds_key}` is a reserved feed name" if feed_name == multiple_feeds_key
79
+
80
+ yaml = YAML.safe_load_file(file, symbolize_names: true)
81
+
82
+ return yaml unless yaml.key?(multiple_feeds_key)
83
+
84
+ unless feed_name
85
+ available_feeds = yaml.fetch(multiple_feeds_key).keys.join(', ')
86
+ raise ArgumentError,
87
+ "Feed name is required under `#{multiple_feeds_key}`. Available feeds: #{available_feeds}"
88
+ end
89
+
90
+ config = yaml.dig(multiple_feeds_key, feed_name.to_sym)
91
+ raise ArgumentError, "Feed '#{feed_name}' not found under `#{multiple_feeds_key}` key." unless config
92
+
93
+ MultipleFeedsConfig.to_single_feed(config, yaml, multiple_feeds_key:)
94
+ end
95
+ # rubocop:enable Metrics/MethodLength
96
+
97
+ ##
98
+ # Processes the provided configuration hash, applying dynamic parameters if given,
99
+ # and returns a new configuration object.
100
+ #
101
+ # @param config [Hash{Symbol => Object}] the configuration hash.
102
+ # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting.
103
+ # @return [Html2rss::Config] the configuration object.
104
+ def from_hash(config, params: UNSET)
105
+ new(resolve_effective_config(config, params:))
106
+ end
107
+
108
+ ##
109
+ # Builds a top-level auto-source feed config for the public shortcut APIs.
110
+ #
111
+ # @param url [String] source page URL
112
+ # @param items_selector [String, nil] optional selector hint for item extraction
113
+ # @param request_controls [Html2rss::RequestControls, nil] explicit request controls to write
114
+ # @return [Hash{Symbol => Object}] feed config hash ready for {from_hash}
115
+ def auto_source_config(url:, items_selector: nil, request_controls: nil)
116
+ config = {
117
+ channel: default_config[:channel].merge(url:),
118
+ auto_source: AutoSource::DEFAULT_CONFIG
119
+ }
120
+
121
+ request_controls ||= Html2rss::RequestControls.new
122
+ request_controls.apply_to(config)
123
+
124
+ config[:selectors] = { items: { selector: items_selector, enhance: true } } if items_selector
125
+ config
126
+ end
127
+
128
+ ##
129
+ # Provides a default configuration.
130
+ #
131
+ # @return [Hash{Symbol => Object}] a hash with default configuration values.
132
+ def default_config
133
+ {
134
+ strategy: default_strategy_name,
135
+ request: {
136
+ max_redirects: RequestService::Policy::DEFAULTS[:max_redirects],
137
+ max_requests: RequestService::Policy::DEFAULTS[:max_requests]
138
+ },
139
+ channel: { time_zone: 'UTC' },
140
+ headers: RequestHeaders.browser_defaults,
141
+ stylesheets: []
142
+ }
143
+ end
144
+
145
+ # @return [Symbol] the default strategy for feed orchestration
146
+ def default_strategy_name
147
+ :auto
148
+ end
149
+
150
+ private
151
+
152
+ def resolve_effective_config(config, params:)
153
+ effective_config = HashUtil.deep_symbolize_keys(config, context: 'config')
154
+ resolved_params = parameter_defaults(effective_config)
155
+ unless params.equal?(UNSET) || params.nil?
156
+ resolved_params.merge!(HashUtil.deep_symbolize_keys(params, context: 'params'))
157
+ end
158
+
159
+ effective_config[:headers] = DynamicParams.call(effective_config[:headers], resolved_params)
160
+ effective_config[:channel] = DynamicParams.call(effective_config[:channel], resolved_params)
161
+
162
+ effective_config
163
+ end
164
+
165
+ def parameter_defaults(config)
166
+ config.fetch(:parameters, {})
167
+ .filter_map do |name, definition|
168
+ [name, definition[:default]] if definition.is_a?(Hash) && definition.key?(:default)
169
+ end
170
+ .to_h
171
+ end
172
+
173
+ def prepare_for_validation(config)
174
+ Config::Preparer.new.call(HashUtil.deep_dup(config))
175
+ end
176
+ end
177
+ end
178
+ end
data/lib/html2rss/config/dynamic_params.rb ADDED
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Config
5
+ # Processes and applies dynamic parameter formatting in configuration values.
6
+ class DynamicParams
7
+ # Raised when string interpolation references an unavailable parameter.
8
+ class ParamsMissing < Html2rss::Error; end
9
+
10
+ class << self
11
+ # Recursively traverses the given value and formats any strings containing
12
+ # placeholders with values from the provided params.
13
+ #
14
+ # @param value [String, Hash, Enumerable, Object] value that may contain parameter placeholders
15
+ # @param params [Hash] The parameters for substitution.
16
+ # @param getter [Proc, nil] Optional proc to retrieve a key's value.
17
+ # @param replace_missing_with [Object, nil] Value to substitute if a key is missing.
18
+ # @return [Object] The processed value.
19
+ def call(value, params = {}, getter: nil, replace_missing_with: nil)
20
+ case value
21
+ when String
22
+ from_string(value, params, getter:, replace_missing_with:)
23
+ when Hash
24
+ from_hash(value, params, getter:, replace_missing_with:)
25
+ when Enumerable
26
+ from_enumerable(value, params, getter:, replace_missing_with:)
27
+ else
28
+ value
29
+ end
30
+ end
31
+
32
+ private
33
+
34
+ def format_params(params, getter:, replace_missing_with:)
35
+ Hash.new do |hash, key|
36
+ hash[key] = if getter
37
+ getter.call(key)
38
+ else
39
+ params.fetch(key.to_sym) { params[key.to_s] }
40
+ end
41
+ hash[key] = replace_missing_with if hash[key].nil? && !replace_missing_with.nil?
42
+ hash[key]
43
+ end
44
+ end
45
+
46
+ def from_string(string, params, getter:, replace_missing_with:)
47
+ # Return the original string if no format placeholders are found.
48
+ return string unless /%\{[^{}]*\}|%<[^<>]*>/.match?(string)
49
+
50
+ mapping = format_params(params, getter:, replace_missing_with:)
51
+ format(string, mapping)
52
+ rescue KeyError => error
53
+ raise ParamsMissing, "Missing parameter for formatting: #{error.message}" if replace_missing_with.nil?
54
+
55
+ string
56
+ end
57
+
58
+ def from_hash(hash, params, getter:, replace_missing_with:)
59
+ HashUtil.deep_symbolize_keys(hash, context: 'dynamic params hash').to_h do |key, value|
60
+ [key, call(value, params, getter:, replace_missing_with:)]
61
+ end
62
+ end
63
+
64
+ def from_enumerable(enumerable, params, getter:, replace_missing_with:)
65
+ enumerable.map { |value| call(value, params, getter:, replace_missing_with:) }
66
+ end
67
+ end
68
+ end
69
+ end
70
+ end
data/lib/html2rss/config/multiple_feeds_config.rb ADDED
@@ -0,0 +1,51 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Config
5
+ # Handles multiple feeds within a single configuration hash.
6
+ # Individual feed configurations should be placed under the :feeds key,
7
+ # where each feed name is the key for its feed configuration.
8
+ # All global configuration keys (outside :feeds) are merged into each feed's settings.
9
+ class MultipleFeedsConfig
10
+ # Reserved YAML key under which multiple named feed configs are defined.
11
+ CONFIG_KEY_FEEDS = :feeds
12
+
13
+ class << self
14
+ # Merges global configuration into each feed's configuration.
15
+ #
16
+ # @param config [Hash] The feed-specific configuration.
17
+ # @param yaml [Hash] The full YAML configuration.
18
+ # @param multiple_feeds_key [Symbol] The key under which multiple feeds are defined.
19
+ # @return [Hash] The merged configuration.
20
+ def to_single_feed(config, yaml, multiple_feeds_key: CONFIG_KEY_FEEDS)
21
+ global_keys = yaml.keys - [multiple_feeds_key]
22
+ global_keys.each do |key|
23
+ config[key] = merge_key(config, yaml, key)
24
+ end
25
+ config
26
+ end
27
+
28
+ private
29
+
30
+ # Merges a specific global key from the YAML configuration into the feed configuration.
31
+ #
32
+ # @param config [Hash] The feed-specific configuration.
33
+ # @param yaml [Hash] The full YAML configuration.
34
+ # @param key [Symbol] The global configuration key to merge.
35
+ # @return [Object] The merged value for the key.
36
+ def merge_key(config, yaml, key)
37
+ global_value = yaml.fetch(key, nil)
38
+ local_value = config[key]
39
+ case local_value
40
+ when Hash
41
+ global_value.is_a?(Hash) ? HashUtil.deep_merge(global_value, local_value) : local_value
42
+ when Array
43
+ global_value.is_a?(Array) ? global_value + local_value : local_value
44
+ else
45
+ local_value.nil? ? global_value : local_value
46
+ end
47
+ end
48
+ end
49
+ end
50
+ end
51
+ end
data/lib/html2rss/config/request_headers.rb ADDED
@@ -0,0 +1,136 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Config
5
+ ##
6
+ # Normalizes HTTP headers for outgoing requests.
7
+ # Ensures a browser-like baseline while respecting caller overrides.
8
+ class RequestHeaders
9
+ # Browser-like default `Accept` header value.
10
+ DEFAULT_ACCEPT = %w[
11
+ text/html
12
+ application/xhtml+xml
13
+ application/xml;q=0.9
14
+ image/avif
15
+ image/webp
16
+ image/apng
17
+ */*;q=0.8
18
+ ].join(',')
19
+
20
+ # Browser-like default `User-Agent` header value.
21
+ DEFAULT_USER_AGENT = [
22
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
23
+ 'AppleWebKit/537.36 (KHTML, like Gecko)',
24
+ 'Chrome/123.0.0.0',
25
+ 'Safari/537.36'
26
+ ].join(' ')
27
+
28
+ # Baseline browser-like header set used for outbound requests.
29
+ DEFAULT_HEADERS = {
30
+ 'Accept' => DEFAULT_ACCEPT,
31
+ 'Cache-Control' => 'max-age=0',
32
+ 'Connection' => 'keep-alive',
33
+ 'Sec-Fetch-Dest' => 'document',
34
+ 'Sec-Fetch-Mode' => 'navigate',
35
+ 'Sec-Fetch-Site' => 'none',
36
+ 'Sec-Fetch-User' => '?1',
37
+ 'Upgrade-Insecure-Requests' => '1',
38
+ 'User-Agent' => DEFAULT_USER_AGENT
39
+ }.freeze
40
+
41
+ class << self
42
+ ##
43
+ # @return [Hash{String => String}] the unmodified default header set
44
+ def browser_defaults
45
+ DEFAULT_HEADERS.dup
46
+ end
47
+
48
+ ##
49
+ # Normalizes the provided headers while applying Html2rss defaults.
50
+ #
51
+ # @param headers [Hash, nil] caller provided headers
52
+ # @param channel_language [String, nil] language defined on the channel
53
+ # @param url [String] request URL used to infer the Host header
54
+ # @return [Hash{String => String}] normalized HTTP headers
55
+ def normalize(headers, channel_language:, url:)
56
+ new(headers || {}, channel_language:, url:).to_h
57
+ end
58
+ end
59
+
60
+ # @param headers [Hash{String, Symbol => String}] caller-provided headers
61
+ # @param channel_language [String, nil] channel language hint for Accept-Language
62
+ # @param url [String, Html2rss::Url, nil] request URL used to infer Host
63
+ def initialize(headers, channel_language:, url:)
64
+ @headers = headers
65
+ @channel_language = channel_language
66
+ @url = url
67
+ end
68
+
69
+ ##
70
+ # @return [Hash{String => String}] normalized HTTP headers
71
+ def to_h
72
+ defaults = DEFAULT_HEADERS.dup
73
+ normalized = normalize_custom_headers(headers)
74
+
75
+ accept_override = normalized.delete('Accept')
76
+ defaults.merge!(normalized)
77
+
78
+ defaults['Accept'] = normalize_accept(accept_override)
79
+ defaults['Accept-Language'] = build_accept_language
80
+ defaults['Host'] ||= request_host
81
+
82
+ defaults.compact
83
+ end
84
+
85
+ private
86
+
87
+ attr_reader :headers, :channel_language, :url
88
+
89
+ def normalize_custom_headers(custom)
90
+ custom.transform_keys { canonicalize(_1) }
91
+ end
92
+
93
+ def canonicalize(key)
94
+ key.to_s.split('-').map!(&:capitalize).join('-')
95
+ end
96
+
97
+ def normalize_accept(override)
98
+ return DEFAULT_ACCEPT if override.nil? || override.empty?
99
+
100
+ values = accept_values(DEFAULT_ACCEPT)
101
+
102
+ accept_values(override).reverse_each do |value|
103
+ next if values.include?(value)
104
+
105
+ values.unshift(value)
106
+ end
107
+
108
+ values.join(',')
109
+ end
110
+
111
+ def accept_values(header)
112
+ header.split(',').map!(&:strip).reject(&:empty?)
113
+ end
114
+
115
+ def build_accept_language
116
+ language = channel_language.to_s.strip
117
+ return 'en-US,en;q=0.9' if language.empty?
118
+
119
+ normalized = language.tr('_', '-')
120
+ primary, region = normalized.split('-', 2)
121
+ primary = primary.downcase
122
+ region = region&.upcase
123
+
124
+ return primary if region.nil?
125
+
126
+ "#{primary}-#{region},#{primary};q=0.9"
127
+ end
128
+
129
+ def request_host
130
+ return nil if url.nil? || url.empty?
131
+
132
+ Html2rss::Url.from_absolute(url).host
133
+ end
134
+ end
135
+ end
136
+ end