html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
data/lib/html2rss/cli.rb
CHANGED
|
@@ -1,46 +1,217 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
require 'json'
|
|
4
5
|
require 'thor'
|
|
5
6
|
|
|
6
7
|
##
|
|
7
8
|
# The Html2rss namespace / command line interface.
|
|
8
9
|
module Html2rss
|
|
9
|
-
Log = Logger.new($stderr)
|
|
10
|
-
|
|
11
10
|
##
|
|
12
11
|
# The Html2rss command line interface.
|
|
13
|
-
class CLI < Thor
|
|
12
|
+
class CLI < Thor # rubocop:disable Metrics/ClassLength
|
|
13
|
+
check_unknown_options!
|
|
14
|
+
# Ordered fallback chain attempted by auto strategy.
|
|
15
|
+
#
|
|
16
|
+
# @return [Array<Symbol>]
|
|
17
|
+
AUTO_FALLBACK_CHAIN = Html2rss::FeedPipeline::AutoFallback::CHAIN.freeze
|
|
18
|
+
# Supported CLI strategy option values.
|
|
19
|
+
#
|
|
20
|
+
# @return [Array<String>]
|
|
21
|
+
STRATEGY_OPTION_ENUM = (['auto'] + Html2rss::RequestService.strategy_names).uniq.freeze
|
|
22
|
+
# User-facing strategy help text that reflects the current fallback chain.
|
|
23
|
+
#
|
|
24
|
+
# @return [String]
|
|
25
|
+
STRATEGY_OPTION_DESC = [
|
|
26
|
+
'Optional request strategy (defaults to auto; auto tries',
|
|
27
|
+
"#{AUTO_FALLBACK_CHAIN.join(' -> ')})"
|
|
28
|
+
].join(' ').freeze
|
|
29
|
+
|
|
30
|
+
# @return [Boolean] whether Thor should terminate process on command failures
|
|
14
31
|
# Thor hook queried when a command fails.
#
# @return [Boolean] always +true+
def self.exit_on_failure? = true
|
|
17
34
|
|
|
18
|
-
desc 'feed YAML_FILE [
|
|
35
|
+
desc 'feed YAML_FILE [feed_name]', 'Print RSS built from the YAML_FILE file to stdout'
|
|
36
|
+
method_option :params,
|
|
37
|
+
type: :hash,
|
|
38
|
+
optional: true,
|
|
39
|
+
required: false,
|
|
40
|
+
default: {}
|
|
41
|
+
method_option :strategy,
|
|
42
|
+
type: :string,
|
|
43
|
+
desc: STRATEGY_OPTION_DESC,
|
|
44
|
+
enum: STRATEGY_OPTION_ENUM
|
|
45
|
+
method_option :max_redirects,
|
|
46
|
+
type: :numeric,
|
|
47
|
+
desc: 'Maximum redirects to follow per request'
|
|
48
|
+
method_option :max_requests,
|
|
49
|
+
type: :numeric,
|
|
50
|
+
desc: 'Maximum requests to allow for this feed build'
|
|
51
|
+
# @param yaml_file [String] path to YAML config
|
|
52
|
+
# @param feed_name [String, nil] optional named feed in multi-feed config
|
|
53
|
+
# @return [void]
|
|
54
|
+
# Loads the feed config from the YAML file, attaches the CLI-provided params,
# applies the request overrides given on the command line and prints the
# built feed to stdout.
def feed(yaml_file, feed_name = nil)
  feed_config = Html2rss.config_from_yaml_file(yaml_file, feed_name)
  feed_config[:params] = options[:params] || {}
  apply_runtime_request_overrides!(feed_config)

  rendered = execute_feed { Html2rss.feed(feed_config) }
  puts(rendered)
end
|
|
61
|
+
|
|
62
|
+
desc 'auto [URL]', 'Automatically sources an RSS feed from the URL'
|
|
63
|
+
method_option :strategy,
|
|
64
|
+
type: :string,
|
|
65
|
+
desc: STRATEGY_OPTION_DESC,
|
|
66
|
+
enum: STRATEGY_OPTION_ENUM
|
|
67
|
+
method_option :format,
|
|
68
|
+
type: :string,
|
|
69
|
+
desc: 'Output format for the auto-sourced feed',
|
|
70
|
+
enum: %w[rss jsonfeed],
|
|
71
|
+
default: 'rss'
|
|
72
|
+
method_option :items_selector, type: :string, desc: 'CSS selector for items (will be enhanced) (optional)'
|
|
73
|
+
method_option :max_redirects,
|
|
74
|
+
type: :numeric,
|
|
75
|
+
desc: 'Maximum redirects to follow per request'
|
|
76
|
+
method_option :max_requests,
|
|
77
|
+
type: :numeric,
|
|
78
|
+
desc: 'Maximum requests to allow for this feed build'
|
|
79
|
+
# @param url [String] source page URL for auto discovery
|
|
80
|
+
# @return [void]
|
|
81
|
+
# Auto-sources a feed for +url+ and prints it to stdout.
# With `--format jsonfeed` the result is pretty-printed as JSON; otherwise
# the result is printed as returned.
def auto(url) # rubocop:disable Metrics/MethodLength
  output_format = options.fetch(:format, 'rss')
  json_feed = output_format == 'jsonfeed'
  builder = Html2rss.method(json_feed ? :auto_json_feed : :auto_source)

  generated = execute_feed do
    request_options = {
      strategy: current_strategy,
      items_selector: options[:items_selector],
      max_redirects: options[:max_redirects],
      max_requests: options[:max_requests]
    }
    builder.call(url, **request_options)
  end

  puts(json_feed ? JSON.pretty_generate(generated) : generated)
end
|
|
97
|
+
|
|
98
|
+
desc 'schema', 'Print the exported config JSON Schema'
|
|
99
|
+
method_option :pretty,
|
|
100
|
+
type: :boolean,
|
|
101
|
+
desc: 'Pretty-print the schema JSON',
|
|
102
|
+
default: true
|
|
103
|
+
method_option :write,
|
|
104
|
+
type: :string,
|
|
105
|
+
desc: 'Write the schema JSON to the given file path'
|
|
19
106
|
##
|
|
20
|
-
# Prints the
|
|
107
|
+
# Prints or writes the exported configuration JSON Schema.
|
|
21
108
|
#
|
|
22
|
-
# @
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def feed(yaml_file, *options)
|
|
26
|
-
raise "File '#{yaml_file}' does not exist" unless File.exist?(yaml_file)
|
|
109
|
+
# @return [void]
|
|
110
|
+
# Serializes the config JSON Schema. Without `--write` the JSON is printed to
# stdout; with `--write PATH` it is written to PATH (creating the parent
# directory, newline-terminated) and PATH itself is printed instead.
def schema
  rendered = Html2rss::Config.json_schema_json(pretty: options.fetch(:pretty, true))
  target = options[:write]
  return puts(rendered) unless target

  FileUtils.mkdir_p(File.dirname(target))
  File.write(target, "#{rendered}\n")
  puts target
end
|
|
33
122
|
|
|
34
|
-
desc '
|
|
35
|
-
method_option :
|
|
36
|
-
type: :
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
default:
|
|
40
|
-
|
|
41
|
-
|
|
123
|
+
desc 'validate YAML_FILE [feed_name]', 'Validate a YAML config with the runtime validator'
|
|
124
|
+
method_option :params,
|
|
125
|
+
type: :hash,
|
|
126
|
+
optional: true,
|
|
127
|
+
required: false,
|
|
128
|
+
default: {}
|
|
129
|
+
##
|
|
130
|
+
# Validates a YAML config and prints the result.
|
|
131
|
+
#
|
|
132
|
+
# @param yaml_file [String] the YAML file to validate
|
|
133
|
+
# @param feed_name [String, nil] optional feed name for multi-feed files
|
|
134
|
+
# @return [void]
|
|
135
|
+
# Runs the runtime validator against the YAML config and reports the outcome:
# prints a confirmation on success, raises Thor::Error listing the errors otherwise.
def validate(yaml_file, feed_name = nil)
  outcome = Html2rss::Config.validate_yaml(yaml_file, feed_name, params: options[:params] || {})

  if outcome.success?
    puts 'Configuration is valid'
  else
    raise Thor::Error, "Invalid configuration: #{outcome.errors.to_h}"
  end
end
|
|
142
|
+
|
|
143
|
+
private
|
|
144
|
+
|
|
145
|
+
# Drops blank request settings from +config+, then lets the CLI request
# controls write themselves into it.
#
# @param config [Hash] feed config, modified in place
def apply_runtime_request_overrides!(config)
  clear_blank_request_overrides!(config)

  controls = request_controls
  controls.apply_to(config)
end
|
|
149
|
+
|
|
150
|
+
# Removes request-related entries whose value is nil so they do not count as
# configured: the top-level :strategy, the :max_redirects / :max_requests
# entries of the :request hash, and the :request hash itself once it is empty.
#
# @param config [Hash] feed config, modified in place
def clear_blank_request_overrides!(config)
  config.delete(:strategy) if config[:strategy].nil?

  request_section = config[:request]
  return unless request_section.is_a?(Hash)

  request_section.delete_if { |name, value| %i[max_redirects max_requests].include?(name) && value.nil? }
  config.delete(:request) if request_section.empty?
end
|
|
161
|
+
|
|
162
|
+
# Builds the request controls object from the command-line options.
#
# @return [Html2rss::RequestControls]
def request_controls
  controls = {
    strategy: options[:strategy]&.to_sym,
    max_redirects: options[:max_redirects],
    max_requests: options[:max_requests],
    explicit_keys: explicit_request_control_keys
  }

  Html2rss::RequestControls.new(**controls)
end
|
|
170
|
+
|
|
171
|
+
# Lists which request controls were given on the command line.
# :strategy counts when its option is truthy; the numeric limits count
# whenever their option is not nil.
#
# @return [Array<Symbol>]
def explicit_request_control_keys
  given = options[:strategy] ? [:strategy] : []

  %i[max_redirects max_requests].each_with_object(given) do |name, collected|
    collected << name unless options[name].nil?
  end
end
|
|
178
|
+
|
|
179
|
+
# @return [Symbol] the strategy given on the command line, or :auto when none was given
def current_strategy
  chosen = options[:strategy]
  chosen.nil? ? :auto : chosen.to_sym
end
|
|
182
|
+
|
|
183
|
+
# @return [Numeric] the --max-redirects option, or the request policy default when it is absent
def current_max_redirects
  fallback = Html2rss::RequestService::Policy::DEFAULTS[:max_redirects]
  options.fetch(:max_redirects, fallback)
end
|
|
186
|
+
|
|
187
|
+
# @return [Numeric] the --max-requests option, or the request policy default when it is absent
def current_max_requests
  fallback = Html2rss::RequestService::Policy::DEFAULTS[:max_requests]
  options.fetch(:max_requests, fallback)
end
|
|
190
|
+
|
|
191
|
+
# @return [Numeric] the current redirect limit plus one
def suggested_max_redirects = current_max_redirects + 1
|
|
194
|
+
|
|
195
|
+
# @return [Numeric] the current request limit plus one
def suggested_max_requests = current_max_requests + 1
|
|
42
198
|
|
|
43
|
-
|
|
199
|
+
# Yields to the feed-building block and returns its result.
# Redirect-limit and request-budget failures are re-raised as Thor::Error with
# a retry hint appended; the listed strategy, blocked-surface and
# no-items errors are re-raised as Thor::Error carrying the original message.
def execute_feed # rubocop:disable Metrics/MethodLength
  yield
rescue Faraday::FollowRedirects::RedirectLimitReached => e
  hint = "retry with --max-redirects #{suggested_max_redirects} or use the final URL directly."
  raise Thor::Error, "#{e.message}. #{hint}"
rescue Html2rss::RequestService::RequestBudgetExceeded => e
  hint = "retry with --max-requests #{suggested_max_requests} or increase request.max_requests in the config."
  raise Thor::Error, "#{e.message}. #{hint}"
rescue Html2rss::RequestService::BrowserlessConfigurationError,
       Html2rss::RequestService::BrowserlessConnectionFailed,
       Html2rss::RequestService::BotasaurusConfigurationError,
       Html2rss::RequestService::BotasaurusConnectionFailed,
       Html2rss::RequestService::BlockedSurfaceDetected,
       Html2rss::NoFeedItemsExtracted => e
  raise Thor::Error, e.message
end
|
|
45
216
|
end
|
|
46
217
|
end
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class Config
    ##
    # Public class-level helpers for loading, validating, and exporting config.
    module ClassMethods
      # Sentinel to differentiate omitted params from explicit `nil`.
      UNSET = Object.new.freeze

      ##
      # Returns the exported JSON Schema for html2rss configuration.
      #
      # @return [Hash{String => Object}] JSON Schema represented as a Ruby hash
      def json_schema
        Schema.json_schema
      end

      ##
      # Returns the exported JSON Schema as JSON.
      #
      # @param pretty [Boolean] whether to pretty-print the JSON output
      # @return [String] serialized JSON Schema
      def json_schema_json(pretty: true)
        pretty ? JSON.pretty_generate(json_schema) : JSON.generate(json_schema)
      end

      ##
      # Validates a configuration hash with the runtime validator.
      #
      # When dynamic parameters are missing, the config is validated without
      # parameter substitution and the failure message is attached under
      # `:dynamic_params_error` so the validator can see it.
      #
      # @param config [Hash{Symbol => Object}] the configuration hash
      # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
      # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
      def validate(config, params: UNSET)
        prepared_config = prepare_for_validation(resolve_effective_config(config, params:))

        Validator.new.call(prepared_config)
      rescue DynamicParams::ParamsMissing => error
        prepared_config = prepare_for_validation(HashUtil.deep_symbolize_keys(config, context: 'config'))
        prepared_config[:dynamic_params_error] = error.message

        Validator.new.call(prepared_config)
      end

      ##
      # Returns the packaged JSON Schema file path.
      #
      # @return [String] absolute path to the packaged JSON Schema file
      def schema_path
        Schema.path
      end

      ##
      # Loads and validates a YAML configuration file.
      #
      # @param file [String] the YAML file to load
      # @param feed_name [String, nil] optional feed name for multi-feed files
      # @param multiple_feeds_key [Symbol] key under which multiple feeds are defined
      # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
      # @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
      def validate_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS, params: UNSET)
        validate(load_yaml(file, feed_name, multiple_feeds_key:), params:)
      end

      ##
      # Loads the feed configuration from a YAML file.
      #
      # Supports multiple feeds defined under the specified key (default :feeds).
      #
      # @param file [String] the YAML file to load.
      # @param feed_name [String, nil] the feed name when using multiple feeds.
      # @param multiple_feeds_key [Symbol] the key under which multiple feeds are defined.
      # @return [Hash{Symbol => Object}] the configuration hash.
      # @raise [ArgumentError] if the file doesn't exist, does not contain a YAML mapping,
      #   the feed name is reserved, or the feed is not found.
      # rubocop:disable Metrics/MethodLength
      def load_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS)
        raise ArgumentError, "File '#{file}' does not exist" unless File.exist?(file)

        # Compare by name: feed names are Strings (see @param) while the feeds
        # key is a Symbol, so a plain `==` would never detect the reserved name.
        if feed_name && feed_name.to_s == multiple_feeds_key.to_s
          raise ArgumentError, "`#{multiple_feeds_key}` is a reserved feed name"
        end

        yaml = YAML.safe_load_file(file, symbolize_names: true)
        # An empty file or a top-level scalar/sequence cannot be a feed config.
        raise ArgumentError, "File '#{file}' does not contain a YAML mapping" unless yaml.is_a?(Hash)

        return yaml unless yaml.key?(multiple_feeds_key)

        unless feed_name
          available_feeds = yaml.fetch(multiple_feeds_key).keys.join(', ')
          raise ArgumentError,
                "Feed name is required under `#{multiple_feeds_key}`. Available feeds: #{available_feeds}"
        end

        config = yaml.dig(multiple_feeds_key, feed_name.to_sym)
        raise ArgumentError, "Feed '#{feed_name}' not found under `#{multiple_feeds_key}` key." unless config

        MultipleFeedsConfig.to_single_feed(config, yaml, multiple_feeds_key:)
      end
      # rubocop:enable Metrics/MethodLength

      ##
      # Processes the provided configuration hash, applying dynamic parameters if given,
      # and returns a new configuration object.
      #
      # @param config [Hash{Symbol => Object}] the configuration hash.
      # @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting.
      # @return [Html2rss::Config] the configuration object.
      def from_hash(config, params: UNSET)
        new(resolve_effective_config(config, params:))
      end

      ##
      # Builds a top-level auto-source feed config for the public shortcut APIs.
      #
      # @param url [String] source page URL
      # @param items_selector [String, nil] optional selector hint for item extraction
      # @param request_controls [Html2rss::RequestControls, nil] explicit request controls to write
      # @return [Hash{Symbol => Object}] feed config hash ready for {from_hash}
      def auto_source_config(url:, items_selector: nil, request_controls: nil)
        config = {
          channel: default_config[:channel].merge(url:),
          auto_source: AutoSource::DEFAULT_CONFIG
        }

        request_controls ||= Html2rss::RequestControls.new
        request_controls.apply_to(config)

        config[:selectors] = { items: { selector: items_selector, enhance: true } } if items_selector
        config
      end

      ##
      # Provides a default configuration.
      #
      # @return [Hash{Symbol => Object}] a hash with default configuration values.
      def default_config
        {
          strategy: default_strategy_name,
          request: {
            max_redirects: RequestService::Policy::DEFAULTS[:max_redirects],
            max_requests: RequestService::Policy::DEFAULTS[:max_requests]
          },
          channel: { time_zone: 'UTC' },
          headers: RequestHeaders.browser_defaults,
          stylesheets: []
        }
      end

      # @return [Symbol] the default strategy for feed orchestration
      def default_strategy_name
        :auto
      end

      private

      # Symbolizes the config and substitutes dynamic parameters into its
      # :headers and :channel sections. Declared parameter defaults are used
      # first; explicitly passed params take precedence over them.
      def resolve_effective_config(config, params:)
        effective_config = HashUtil.deep_symbolize_keys(config, context: 'config')
        resolved_params = parameter_defaults(effective_config)
        unless params.equal?(UNSET) || params.nil?
          resolved_params.merge!(HashUtil.deep_symbolize_keys(params, context: 'params'))
        end

        effective_config[:headers] = DynamicParams.call(effective_config[:headers], resolved_params)
        effective_config[:channel] = DynamicParams.call(effective_config[:channel], resolved_params)

        effective_config
      end

      # Collects `name => default` for every entry of config[:parameters]
      # that is a Hash declaring a :default key.
      def parameter_defaults(config)
        config.fetch(:parameters, {})
              .filter_map do |name, definition|
                [name, definition[:default]] if definition.is_a?(Hash) && definition.key?(:default)
              end
              .to_h
      end

      # Runs the preparer on a deep copy so the caller's hash is left untouched.
      def prepare_for_validation(config)
        Config::Preparer.new.call(HashUtil.deep_dup(config))
      end
    end
  end
end
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class Config
    # Substitutes named format placeholders (`%{name}` / `%<name>`) found in
    # configuration values.
    class DynamicParams
      # Signals that a placeholder names a parameter that is not available.
      class ParamsMissing < Html2rss::Error; end

      class << self
        # Walks +value+ and formats every String that contains placeholders.
        # Strings, Hashes and other Enumerables are processed (recursively);
        # any other object is returned as is.
        #
        # @param value [String, Hash, Enumerable, Object] value that may contain parameter placeholders
        # @param params [Hash] parameters available for substitution
        # @param getter [Proc, nil] when given, called with the placeholder key instead of reading +params+
        # @param replace_missing_with [Object, nil] stand-in used when a looked-up value is nil
        # @return [Object] the processed value
        # @raise [ParamsMissing] if a placeholder cannot be resolved and no stand-in was given
        def call(value, params = {}, getter: nil, replace_missing_with: nil)
          case value
          in String then from_string(value, params, getter:, replace_missing_with:)
          in Hash then from_hash(value, params, getter:, replace_missing_with:)
          in Enumerable then from_enumerable(value, params, getter:, replace_missing_with:)
          else value
          end
        end

        private

        # Builds the lookup hash handed to `format`. Each key is resolved on
        # first access (via +getter+, or from +params+ by Symbol then String key)
        # and the result is stored in the hash.
        def format_params(params, getter:, replace_missing_with:)
          Hash.new do |resolved, name|
            looked_up = getter ? getter.call(name) : params.fetch(name.to_sym) { params[name.to_s] }
            looked_up = replace_missing_with if looked_up.nil? && !replace_missing_with.nil?

            resolved[name] = looked_up
          end
        end

        # Formats a single String; strings without placeholders are returned untouched.
        def from_string(string, params, getter:, replace_missing_with:)
          return string unless string.match?(/%\{[^{}]*\}|%<[^<>]*>/)

          begin
            format(string, format_params(params, getter:, replace_missing_with:))
          rescue KeyError => e
            raise ParamsMissing, "Missing parameter for formatting: #{e.message}" if replace_missing_with.nil?

            string
          end
        end

        # Symbolizes the keys and processes every value of the Hash.
        def from_hash(hash, params, getter:, replace_missing_with:)
          symbolized = HashUtil.deep_symbolize_keys(hash, context: 'dynamic params hash')

          symbolized.each_with_object({}) do |(key, entry), processed|
            processed[key] = call(entry, params, getter:, replace_missing_with:)
          end
        end

        # Processes every element and returns the results as an Array.
        def from_enumerable(enumerable, params, getter:, replace_missing_with:)
          enumerable.map do |entry|
            call(entry, params, getter:, replace_missing_with:)
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class Config
    # Supports several named feeds in one configuration hash.
    # Feed configurations live under the :feeds key, keyed by feed name;
    # every key outside :feeds is global and gets merged into the selected feed.
    class MultipleFeedsConfig
      # Reserved YAML key under which multiple named feed configs are defined.
      CONFIG_KEY_FEEDS = :feeds

      class << self
        # Merges every global key of +yaml+ into +config+.
        #
        # @param config [Hash] the feed-specific configuration; modified in place
        # @param yaml [Hash] the full YAML configuration
        # @param multiple_feeds_key [Symbol] the key under which multiple feeds are defined
        # @return [Hash] +config+, with the global settings merged in
        def to_single_feed(config, yaml, multiple_feeds_key: CONFIG_KEY_FEEDS)
          yaml.keys
              .reject { |key| key.eql?(multiple_feeds_key) }
              .each { |key| config[key] = merge_key(config, yaml, key) }

          config
        end

        private

        # Combines the global and the feed-local value stored under +key+.
        #
        # A local Hash is deep-merged over a global Hash, a local Array is
        # appended to a global Array; when the global value has a different
        # type the local value is used alone. Any other local value wins
        # unless it is nil, in which case the global value is used.
        #
        # @param config [Hash] the feed-specific configuration
        # @param yaml [Hash] the full YAML configuration
        # @param key [Symbol] the global configuration key to merge
        # @return [Object] the merged value for the key
        def merge_key(config, yaml, key)
          shared = yaml.fetch(key, nil)
          own = config[key]

          return own if own.is_a?(Hash) && !shared.is_a?(Hash)
          return HashUtil.deep_merge(shared, own) if own.is_a?(Hash)
          return own if own.is_a?(Array) && !shared.is_a?(Array)
          return shared + own if own.is_a?(Array)

          own.nil? ? shared : own
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
  class Config
    ##
    # Normalizes HTTP headers for outgoing requests.
    # Ensures a browser-like baseline while respecting caller overrides.
    class RequestHeaders
      # Browser-like default `Accept` header value.
      # Frozen: the string is handed out to callers and must not be mutated.
      DEFAULT_ACCEPT = %w[
        text/html
        application/xhtml+xml
        application/xml;q=0.9
        image/avif
        image/webp
        image/apng
        */*;q=0.8
      ].join(',').freeze

      # Browser-like default `User-Agent` header value.
      # Frozen for the same reason as DEFAULT_ACCEPT.
      DEFAULT_USER_AGENT = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'AppleWebKit/537.36 (KHTML, like Gecko)',
        'Chrome/123.0.0.0',
        'Safari/537.36'
      ].join(' ').freeze

      # Baseline browser-like header set used for outbound requests.
      DEFAULT_HEADERS = {
        'Accept' => DEFAULT_ACCEPT,
        'Cache-Control' => 'max-age=0',
        'Connection' => 'keep-alive',
        'Sec-Fetch-Dest' => 'document',
        'Sec-Fetch-Mode' => 'navigate',
        'Sec-Fetch-Site' => 'none',
        'Sec-Fetch-User' => '?1',
        'Upgrade-Insecure-Requests' => '1',
        'User-Agent' => DEFAULT_USER_AGENT
      }.freeze

      class << self
        ##
        # @return [Hash{String => String}] a fresh (unfrozen) copy of the default header set
        def browser_defaults
          DEFAULT_HEADERS.dup
        end

        ##
        # Normalizes the provided headers while applying Html2rss defaults.
        #
        # @param headers [Hash, nil] caller provided headers
        # @param channel_language [String, nil] language defined on the channel
        # @param url [String] request URL used to infer the Host header
        # @return [Hash{String => String}] normalized HTTP headers
        def normalize(headers, channel_language:, url:)
          new(headers || {}, channel_language:, url:).to_h
        end
      end

      # @param headers [Hash{String, Symbol => String}] caller-provided headers
      # @param channel_language [String, nil] channel language hint for Accept-Language
      # @param url [String, Html2rss::Url, nil] request URL used to infer Host
      def initialize(headers, channel_language:, url:)
        @headers = headers
        @channel_language = channel_language
        @url = url
      end

      ##
      # Builds the final header set: defaults, overlaid with the caller's
      # headers (keys canonicalized to `Title-Case`), with `Accept` merged
      # rather than replaced, and `Accept-Language` / `Host` filled in when the
      # caller did not provide them. Headers whose value is nil are omitted.
      #
      # @return [Hash{String => String}] normalized HTTP headers
      def to_h
        merged = DEFAULT_HEADERS.dup
        custom = normalize_custom_headers(headers)

        # Accept is combined with the defaults below, so keep it out of the plain merge.
        accept_override = custom.delete('Accept')
        merged.merge!(custom)

        merged['Accept'] = normalize_accept(accept_override)
        # A caller-supplied Accept-Language wins; derive one from the channel
        # language only when it is missing or blank.
        merged['Accept-Language'] = build_accept_language if merged['Accept-Language'].to_s.strip.empty?
        merged['Host'] ||= request_host

        merged.compact
      end

      private

      attr_reader :headers, :channel_language, :url

      # Re-keys the caller's headers with canonical header names.
      def normalize_custom_headers(custom)
        custom.transform_keys { canonicalize(_1) }
      end

      # Capitalizes each dash-separated part, e.g. :'user-agent' -> "User-Agent".
      def canonicalize(key)
        key.to_s.split('-').map!(&:capitalize).join('-')
      end

      # Puts the caller's Accept values (in their order) in front of the
      # default ones, skipping values the defaults already contain.
      def normalize_accept(override)
        return DEFAULT_ACCEPT if override.nil? || override.empty?

        values = accept_values(DEFAULT_ACCEPT)

        # reverse_each + unshift keeps the caller's original ordering at the front.
        accept_values(override).reverse_each do |value|
          next if values.include?(value)

          values.unshift(value)
        end

        values.join(',')
      end

      # Splits a comma-separated header value into stripped, non-empty parts.
      def accept_values(header)
        header.split(',').map!(&:strip).reject(&:empty?)
      end

      # Derives Accept-Language from the channel language:
      # blank -> "en-US,en;q=0.9", "de" -> "de", "de_de" -> "de-DE,de;q=0.9".
      def build_accept_language
        language = channel_language.to_s.strip
        return 'en-US,en;q=0.9' if language.empty?

        normalized = language.tr('_', '-')
        primary, region = normalized.split('-', 2)
        primary = primary.downcase
        region = region&.upcase

        return primary if region.nil?

        "#{primary}-#{region},#{primary};q=0.9"
      end

      # Host of the request URL, or nil when no URL was given.
      def request_host
        return nil if url.nil? || url.empty?

        Html2rss::Url.from_absolute(url).host
      end
    end
  end
end
|