html2rss 0.16.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -657
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +7 -4
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -9
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -78
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +120 -46
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -108
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
data/lib/html2rss/url.rb
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'addressable/uri'
|
|
4
|
+
require 'cgi'
|
|
5
|
+
|
|
6
|
+
module Html2rss
|
|
7
|
+
##
|
|
8
|
+
# A value object representing a resolved, absolute URL with built-in operations.
|
|
9
|
+
# Provides URL resolution, sanitization, and titleization capabilities.
|
|
10
|
+
#
|
|
11
|
+
# @example Creating a URL from a relative path
|
|
12
|
+
# url = Url.from_relative('/path/to/article', 'https://example.com')
|
|
13
|
+
# url.to_s # => "https://example.com/path/to/article"
|
|
14
|
+
#
|
|
15
|
+
# @example Sanitizing a raw URL string
|
|
16
|
+
# url = Url.sanitize('https://example.com/ ')
|
|
17
|
+
# url.to_s # => "https://example.com/"
|
|
18
|
+
#
|
|
19
|
+
# @example Getting titleized versions
|
|
20
|
+
# url = Url.from_relative('/foo-bar/baz.txt', 'https://example.com')
|
|
21
|
+
# url.titleized # => "Foo Bar Baz"
|
|
22
|
+
# url.channel_titleized # => "example.com: Foo-bar Baz.txt"
|
|
23
|
+
class Url
|
|
24
|
+
include Comparable
|
|
25
|
+
|
|
26
|
+
# Regular expression for basic URI format validation
|
|
27
|
+
URI_REGEXP = Addressable::URI::URIREGEX
|
|
28
|
+
SUPPORTED_SCHEMES = %w[http https].to_set.freeze
|
|
29
|
+
|
|
30
|
+
##
|
|
31
|
+
# Creates a URL from a relative path and base URL.
|
|
32
|
+
#
|
|
33
|
+
# @param relative_url [String, Html2rss::Url] the relative URL to resolve
|
|
34
|
+
# @param base_url [String, Html2rss::Url] the base URL to resolve against
|
|
35
|
+
# @return [Url] the resolved absolute URL
|
|
36
|
+
# @raise [Addressable::URI::InvalidURIError] if the URL cannot be parsed
|
|
37
|
+
def self.from_relative(relative_url, base_url)
  candidate = Addressable::URI.parse(relative_url.to_s.strip)

  if candidate.absolute?
    # Already absolute: wrap it as parsed; the base is never consulted.
    new(candidate)
  else
    base = Addressable::URI.parse(base_url.to_s)
    # A base without any path gets a root path before the join.
    base.path = '/' if base.path.empty?

    new(base.join(candidate).normalize)
  end
end
|
|
46
|
+
|
|
47
|
+
##
|
|
48
|
+
# Creates a URL by sanitizing a raw URL string.
|
|
49
|
+
# Removes spaces and extracts the first valid URL from the string.
|
|
50
|
+
#
|
|
51
|
+
# @param raw_url [String] the raw URL string to sanitize
|
|
52
|
+
# @return [Url, nil] the sanitized URL, or nil if no valid URL found
|
|
53
|
+
def self.sanitize(raw_url)
  # String#[] with a regexp yields the first match (or nil when none exists).
  first_match = raw_url.to_s[%r{(?:(?:https?|ftp|mailto)://|mailto:)[^\s<>"]+}]
  candidate = first_match.to_s.strip

  if candidate.empty?
    nil
  else
    new(Addressable::URI.parse(candidate).normalize)
  end
end
|
|
60
|
+
|
|
61
|
+
##
|
|
62
|
+
# Creates a URL from an already-absolute URL string.
|
|
63
|
+
#
|
|
64
|
+
# @param url_string [String, Html2rss::Url] the absolute URL to parse
|
|
65
|
+
# @return [Url] the parsed and normalized URL
|
|
66
|
+
# @raise [ArgumentError] if the URL is not absolute or cannot be parsed
|
|
67
|
+
def self.from_absolute(url_string)
  # Existing Url instances are passed through untouched.
  return url_string if url_string.is_a?(self)

  candidate = begin
    new(Addressable::URI.parse(url_string.to_s.strip).normalize)
  rescue Addressable::URI::InvalidURIError
    # Unparseable input is reported with the same message as a relative URL.
    raise ArgumentError, 'URL must be absolute'
  end

  candidate.absolute? ? candidate : raise(ArgumentError, 'URL must be absolute')
end
|
|
77
|
+
|
|
78
|
+
##
|
|
79
|
+
# Creates a URL for channel use with validation.
|
|
80
|
+
# Validates that the URL meets channel requirements (absolute, no @, supported schemes).
|
|
81
|
+
#
|
|
82
|
+
# @param url_string [String] the URL string to validate and parse
|
|
83
|
+
# @return [Url, nil] the validated and parsed URL, or nil when the input is nil or blank
|
|
84
|
+
# @raise [ArgumentError] if the URL doesn't meet channel requirements
|
|
85
|
+
# @example Creating a channel URL
|
|
86
|
+
# Url.for_channel('https://example.com')
|
|
87
|
+
# # => #<Html2rss::Url:... @uri=#<Addressable::URI:... URI:https://example.com>>
|
|
88
|
+
# @example Invalid channel URL
|
|
89
|
+
# Url.for_channel('/relative/path')
|
|
90
|
+
# # => raises ArgumentError: "URL must be absolute"
|
|
91
|
+
def self.for_channel(url_string)
  # Coerce before inspecting: nil, blank strings and URL-like value objects
  # (anything responding to #to_s) all go through one blank check instead of
  # raising NoMethodError on #empty? / #strip.
  stripped = url_string.to_s.strip
  return nil if stripped.empty?

  # Raises ArgumentError when the URL is relative or unparseable.
  url = from_absolute(stripped)
  # Raises ArgumentError when the URL violates channel requirements.
  validate_channel_url(url)
  url
end
|
|
101
|
+
|
|
102
|
+
##
|
|
103
|
+
# Validates that a URL meets channel requirements.
|
|
104
|
+
#
|
|
105
|
+
# @param url [Url] the URL to validate
|
|
106
|
+
# @raise [ArgumentError] if the URL doesn't meet channel requirements
|
|
107
|
+
def self.validate_channel_url(url)
  # Checks run in order; the first failing one decides the error message.
  problem =
    if !url.absolute?
      'URL must be absolute'
    elsif url.to_s.include?('@')
      'URL must not contain an @ character'
    elsif !SUPPORTED_SCHEMES.include?(url.scheme)
      "URL scheme '#{url.scheme}' is not supported"
    end

  raise ArgumentError, problem if problem
end
|
|
115
|
+
|
|
116
|
+
private_class_method :validate_channel_url
|
|
117
|
+
|
|
118
|
+
##
|
|
119
|
+
# @param uri [Addressable::URI] the underlying Addressable::URI object (internal use only)
|
|
120
|
+
def initialize(uri)
  # Freeze the wrapped URI first, then the wrapper itself.
  uri.freeze
  @uri = uri
  freeze
end
|
|
124
|
+
|
|
125
|
+
# Delegate common URI operations to the underlying URI
|
|
126
|
+
# Read-only accessors forwarded verbatim to the wrapped URI object.
def to_s
  @uri.to_s
end

def scheme
  @uri.scheme
end

def host
  @uri.host
end

def port
  @uri.port
end

def path
  @uri.path
end

def query
  @uri.query
end

def fragment
  @uri.fragment
end

def absolute?
  @uri.absolute?
end
|
|
134
|
+
|
|
135
|
+
##
|
|
136
|
+
# Returns the URL query string as a hash of string keys and values.
|
|
137
|
+
#
|
|
138
|
+
# @return [Hash{String => String}] normalized query parameters
|
|
139
|
+
def query_values
  parsed = @uri.query_values(Hash)
  # Fall back to an empty hash when the wrapped URI reports no query values.
  return {} unless parsed

  parsed
end
|
|
142
|
+
|
|
143
|
+
##
|
|
144
|
+
# Returns the URL path split into non-empty segments.
|
|
145
|
+
#
|
|
146
|
+
# @return [Array<String>] normalized path segments
|
|
147
|
+
def path_segments
  raw_path = @uri.path.to_s
  pieces = raw_path.split('/')
  # Leading and doubled slashes produce empty pieces; drop them.
  pieces.reject { |piece| piece.empty? }
end
|
|
150
|
+
|
|
151
|
+
##
|
|
152
|
+
# Returns a copy of the URL with the provided path.
|
|
153
|
+
#
|
|
154
|
+
# @param path [String] normalized absolute path
|
|
155
|
+
# @return [Url] a new URL with the updated path
|
|
156
|
+
def with_path(path)
  # Work on a duplicate so the frozen wrapped URI is left alone.
  updated = @uri.dup.tap { |copy| copy.path = path }
  self.class.from_absolute(updated.normalize.to_s)
end
|
|
161
|
+
|
|
162
|
+
##
|
|
163
|
+
# Returns a copy of the URL with the provided query values.
|
|
164
|
+
#
|
|
165
|
+
# @param values [Hash{String, Symbol => #to_s}] query parameters to assign
|
|
166
|
+
# @return [Url] a new URL with the updated query string
|
|
167
|
+
def with_query_values(values)
  # Keys and values are stringified before being assigned to the copy.
  stringified = values.transform_keys(&:to_s).transform_values(&:to_s)
  updated = @uri.dup.tap { |copy| copy.query_values = stringified }

  self.class.from_absolute(updated.normalize.to_s)
end
|
|
172
|
+
|
|
173
|
+
##
|
|
174
|
+
# Returns a titleized representation of the URL path.
|
|
175
|
+
# Converts the path to a human-readable title by cleaning and capitalizing words.
|
|
176
|
+
# Removes file extensions and special characters, then capitalizes each word.
|
|
177
|
+
#
|
|
178
|
+
# @return [String] the titleized path, or empty string if path is empty
|
|
179
|
+
# @example Basic titleization
|
|
180
|
+
# url = Url.from_absolute('https://example.com/foo-bar/baz.txt')
|
|
181
|
+
# url.titleized # => "Foo Bar Baz"
|
|
182
|
+
# @example With URL encoding
|
|
183
|
+
# url = Url.from_absolute('https://example.com/hello%20world/article.html')
|
|
184
|
+
# url.titleized # => "Hello World Article"
|
|
185
|
+
def titleized
  path = @uri.path
  return '' if path.empty?

  segments = CGI.unescapeURIComponent(path).split('/').reject(&:empty?)
  return '' if segments.empty?

  # Only the final segment can carry a file extension. Stripping it here,
  # rather than from the joined title, keeps a dot in an earlier segment
  # (e.g. "/v1.2/foo") from truncating everything after it.
  segments[-1] = File.basename(segments.last, '.*')

  segments
    .flat_map { |segment| segment.gsub(/[^a-zA-Z0-9.]/, ' ').split } # non-alphanumerics become word breaks
    .map(&:capitalize)
    .join(' ')
end
|
|
198
|
+
|
|
199
|
+
##
|
|
200
|
+
# Returns a titleized representation of the URL with prefixed host.
|
|
201
|
+
# Creates a channel title by combining host and path information.
|
|
202
|
+
# Useful for RSS channel titles that need to identify the source.
|
|
203
|
+
#
|
|
204
|
+
# @return [String] the titleized channel URL
|
|
205
|
+
# @example With path
|
|
206
|
+
# url = Url.from_absolute('https://example.com/foo-bar/baz')
|
|
207
|
+
# url.channel_titleized # => "example.com: Foo-bar Baz"
|
|
208
|
+
# @example Without path (root URL)
|
|
209
|
+
# url = Url.from_absolute('https://example.com')
|
|
210
|
+
# url.channel_titleized # => "example.com"
|
|
211
|
+
def channel_titleized
  decoded_path = CGI.unescapeURIComponent(@uri.path)
  words = decoded_path.split('/').reject(&:empty?).map(&:capitalize)
  host = @uri.host

  # Without any path segments the host alone is the title.
  return host if words.empty?

  "#{host}: #{words.join(' ')}"
end
|
|
217
|
+
|
|
218
|
+
##
|
|
219
|
+
# Compares this URL with another URL for ordering, based on their string representations.
|
|
220
|
+
# URLs are considered equal if their string representations are the same.
|
|
221
|
+
#
|
|
222
|
+
# @param other [Url] the other URL to compare with
|
|
223
|
+
# @return [Integer] -1, 0, or 1 for less than, equal, or greater than
|
|
224
|
+
def <=>(other)
  # Ordering is defined purely by the string forms of both sides.
  mine = to_s
  theirs = other.to_s
  mine <=> theirs
end
|
|
227
|
+
|
|
228
|
+
##
|
|
229
|
+
# Returns true if this URL is equal to another URL.
|
|
230
|
+
#
|
|
231
|
+
# @param other [Object] the other object to compare with
|
|
232
|
+
# @return [Boolean] true if the URLs are equal
|
|
233
|
+
def ==(other)
|
|
234
|
+
other.is_a?(Url) && to_s == other.to_s
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
##
|
|
238
|
+
# Supports hash-based comparisons by ensuring equality semantics match `hash`.
|
|
239
|
+
#
|
|
240
|
+
# @param other [Object] the other object to compare with
|
|
241
|
+
# @return [Boolean] true if the URLs are considered equal
|
|
242
|
+
def eql?(other)
  # Same rule as #==: only another Url with an identical string form matches.
  case other
  when Url then to_s == other.to_s
  else false
  end
end
|
|
245
|
+
|
|
246
|
+
##
|
|
247
|
+
# Returns the hash code for this URL.
|
|
248
|
+
#
|
|
249
|
+
# @return [Integer] the hash code
|
|
250
|
+
# Hash code derived from the string form, so it agrees with #eql?.
def hash = to_s.hash
|
|
253
|
+
|
|
254
|
+
##
|
|
255
|
+
# Returns a string representation of the URL for debugging.
|
|
256
|
+
#
|
|
257
|
+
# @return [String] the debug representation
|
|
258
|
+
def inspect
  # Class name, object id and the wrapped URI's own inspect output.
  format('#<%<klass>s:%<id>s @uri=%<uri>s>', klass: self.class, id: object_id, uri: @uri.inspect)
end
|
|
261
|
+
end
|
|
262
|
+
end
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
|
@@ -3,11 +3,10 @@
|
|
|
3
3
|
require 'zeitwerk'
|
|
4
4
|
|
|
5
5
|
loader = Zeitwerk::Loader.for_gem
|
|
6
|
+
loader.inflector.inflect('cli' => 'CLI')
|
|
6
7
|
loader.setup
|
|
7
8
|
|
|
8
|
-
require 'addressable'
|
|
9
9
|
require 'logger'
|
|
10
|
-
require 'yaml'
|
|
11
10
|
|
|
12
11
|
##
|
|
13
12
|
# The Html2rss namespace.
|
|
@@ -23,90 +22,150 @@ module Html2rss
|
|
|
23
22
|
end
|
|
24
23
|
|
|
25
24
|
##
|
|
26
|
-
#
|
|
27
|
-
class Error < StandardError; end
|
|
28
|
-
|
|
29
|
-
##
|
|
30
|
-
# Key for the feeds configuration in the YAML file.
|
|
31
|
-
CONFIG_KEY_FEEDS = :feeds
|
|
32
|
-
|
|
33
|
-
##
|
|
34
|
-
# Returns an RSS object generated from the provided YAML file configuration.
|
|
35
|
-
#
|
|
36
|
-
# Example:
|
|
25
|
+
# Loads a feed configuration from YAML.
|
|
37
26
|
#
|
|
38
|
-
#
|
|
39
|
-
#
|
|
40
|
-
#
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
# @param global_config [Hash] Global options (e.g., HTTP headers).
|
|
44
|
-
# @param params [Hash] Dynamic parameters for the feed configuration.
|
|
45
|
-
# @return [RSS::Rss] RSS object generated from the configuration.
|
|
46
|
-
def self.feed_from_yaml_config(file, name = nil, global_config: {}, params: {})
|
|
47
|
-
yaml = YAML.safe_load_file(file, symbolize_names: true)
|
|
48
|
-
feeds = yaml[CONFIG_KEY_FEEDS] || {}
|
|
49
|
-
|
|
50
|
-
feed_config = find_feed_config(yaml, feeds, name, global_config)
|
|
51
|
-
|
|
52
|
-
feed(Config.new(feed_config, global_config, params))
|
|
27
|
+
# @param file [String] path to the YAML file
|
|
28
|
+
# @param feed_name [String, nil] optional feed name inside a multi-feed config
|
|
29
|
+
# @return [Hash<Symbol, Object>] loaded configuration hash
|
|
30
|
+
# Thin public wrapper: loading is handed to Config.load_yaml unchanged.
def self.config_from_yaml_file(file, feed_name = nil) = Config.load_yaml(file, feed_name)
|
|
54
33
|
|
|
55
34
|
##
|
|
56
35
|
# Returns an RSS object generated from the provided configuration.
|
|
57
36
|
#
|
|
58
|
-
#
|
|
59
|
-
#
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
# title: { selector: 'a' },
|
|
65
|
-
# link: { selector: 'a', extractor: 'href' }
|
|
66
|
-
# }
|
|
67
|
-
# )
|
|
68
|
-
# # => #<RSS::Rss:0x00007fb2f48d14a0 ...>
|
|
69
|
-
#
|
|
70
|
-
# @param config [Hash<Symbol, Object>, Html2rss::Config] Feed configuration.
|
|
71
|
-
# @return [RSS::Rss] RSS object generated from the configuration.
|
|
72
|
-
def self.feed(config)
|
|
73
|
-
config = Config.new(config) unless config.is_a?(Config)
|
|
74
|
-
RssBuilder.build(config)
|
|
37
|
+
# @param raw_config [Hash<Symbol, Object>] feed configuration
|
|
38
|
+
# @return [RSS::Rss] generated RSS feed
|
|
39
|
+
def self.feed(raw_config)
  # The pipeline yields response:, config: and articles:; forward them as-is
  # to the RSS renderer.
  run_pipeline(raw_config) { |**stage| build_rss_feed(**stage) }
end
|
|
76
44
|
|
|
77
45
|
##
|
|
78
|
-
#
|
|
46
|
+
# Returns a JSONFeed 1.1 hash generated from the provided configuration.
|
|
79
47
|
#
|
|
80
|
-
# @param
|
|
81
|
-
# @
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
def self.find_feed_config(yaml, feeds, feed_name, global_config)
|
|
86
|
-
return yaml unless feed_name
|
|
87
|
-
|
|
88
|
-
feed_name = feed_name.to_sym
|
|
89
|
-
if feeds.key?(feed_name)
|
|
90
|
-
global_config.merge!(yaml.reject { |key| key == CONFIG_KEY_FEEDS })
|
|
91
|
-
feeds[feed_name]
|
|
92
|
-
else
|
|
93
|
-
yaml
|
|
48
|
+
# @param raw_config [Hash<Symbol, Object>] feed configuration
|
|
49
|
+
# @return [Hash] JSONFeed-compliant hash
|
|
50
|
+
def self.json_feed(raw_config)
  # The pipeline yields response:, config: and articles:; forward them as-is
  # to the JSONFeed renderer.
  run_pipeline(raw_config) { |**stage| build_json_feed(**stage) }
end
|
|
96
55
|
|
|
97
56
|
##
|
|
98
57
|
# Scrapes the provided URL and returns an RSS object.
|
|
99
|
-
# No need for a "feed config".
|
|
100
58
|
#
|
|
101
|
-
# @param url [String]
|
|
102
|
-
# @param strategy [Symbol]
|
|
103
|
-
# @
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
59
|
+
# @param url [String] source page URL
|
|
60
|
+
# @param strategy [Symbol] request strategy to use
|
|
61
|
+
# @param items_selector [String, nil] optional selector hint for item extraction
|
|
62
|
+
# @param max_redirects [Integer, nil] optional redirect limit override
|
|
63
|
+
# @param max_requests [Integer, nil] optional request budget override
|
|
64
|
+
# @return [RSS::Rss] generated RSS feed
|
|
65
|
+
def self.auto_source(url, strategy: :faraday, items_selector: nil, max_redirects: nil, max_requests: nil)
  # Build the auto-source config first, then render it as RSS.
  auto_config = build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
  feed(auto_config)
end
|
|
68
|
+
|
|
69
|
+
##
|
|
70
|
+
# Scrapes the provided URL and returns a JSONFeed 1.1 hash.
|
|
71
|
+
#
|
|
72
|
+
# @param url [String] source page URL
|
|
73
|
+
# @param strategy [Symbol] request strategy to use
|
|
74
|
+
# @param items_selector [String, nil] optional selector hint for item extraction
|
|
75
|
+
# @param max_redirects [Integer, nil] optional redirect limit override
|
|
76
|
+
# @param max_requests [Integer, nil] optional request budget override
|
|
77
|
+
# @return [Hash] JSONFeed-compliant hash
|
|
78
|
+
def self.auto_json_feed(url, strategy: :faraday, items_selector: nil, max_redirects: nil, max_requests: nil)
  # Build the auto-source config first, then render it as a JSONFeed hash.
  auto_config = build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
  json_feed(auto_config)
end
|
|
110
81
|
|
|
111
|
-
|
|
82
|
+
class << self
|
|
83
|
+
private
|
|
84
|
+
|
|
85
|
+
def run_pipeline(raw_config)
  # Step 1: turn the raw hash into a Config (params are read from the hash itself).
  config = Config.from_hash(raw_config, params: raw_config[:params])

  # Step 2: build one request session from the config and fetch the first page.
  request_session = RequestSession.from_runtime_input(RequestSession::RuntimeInput.from_config(config))
  response = request_session.fetch_initial_response

  # Step 3: gather articles from every configured source, then deduplicate.
  collected = collect_articles(response:, config:, request_session:)
  articles = Articles::Deduplicator.new(collected).call

  # Step 4: hand everything to the caller-supplied renderer block.
  yield response:, config:, articles:
end
|
|
102
|
+
|
|
103
|
+
def collect_articles(response:, config:, request_session:)
  # Selector results come first, auto-source results are appended after them.
  from_selectors = selector_articles(response:, config:, request_session:)
  from_auto_source = auto_source_articles(response:, config:, request_session:)

  from_selectors + from_auto_source
end
|
|
107
|
+
|
|
108
|
+
def selector_articles(response:, config:, request_session:)
  selectors = config.selectors
  return [] unless selectors

  # With a pagination limit configured, follow pages through RelNextPager;
  # otherwise only the initial response is scraped.
  max_pages = selectors.dig(:items, :pagination, :max_pages)
  pages =
    if max_pages
      pager = RequestSession::RelNextPager.new(session: request_session, initial_response: response, max_pages:)
      pager.to_a
    else
      [response]
    end

  pages.flat_map { |page| Selectors.new(page, selectors:, time_zone: config.time_zone).articles }
end
|
|
125
|
+
|
|
126
|
+
def auto_source_articles(response:, config:, request_session:)
  auto_source_config = config.auto_source
  # No auto_source section configured: contribute no articles.
  return [] unless auto_source_config

  AutoSource.new(response, auto_source_config, request_session:).articles
end
|
|
131
|
+
|
|
132
|
+
def build_rss_feed(response:, config:, articles:)
  # Channel data comes from the response, with config.channel as overrides.
  RssBuilder.new(
    channel: RssBuilder::Channel.new(response, overrides: config.channel),
    articles:,
    stylesheets: config.stylesheets
  ).call
end
|
|
137
|
+
|
|
138
|
+
def build_json_feed(response:, config:, articles:)
  # The JSON builder reuses the RSS channel object for feed-level metadata.
  JsonFeedBuilder.new(
    channel: RssBuilder::Channel.new(response, overrides: config.channel),
    articles:
  ).call
end
|
|
143
|
+
|
|
144
|
+
def explicit_request_control_keys(strategy:, max_redirects:, max_requests:)
  # Map each control to whether it differs from its shortcut default
  # (:faraday for strategy, nil for the two limits), then keep the names
  # flagged true, in declaration order.
  {
    strategy: strategy != :faraday,
    max_redirects: !max_redirects.nil?,
    max_requests: !max_requests.nil?
  }.select { |_name, explicit| explicit }.keys
end
|
|
151
|
+
|
|
152
|
+
def build_auto_source_config(url:, strategy:, items_selector:, max_redirects:, max_requests:)
  # Bundle the request-related options first, then pass them on with the rest.
  request_controls = shortcut_request_controls(strategy:, max_redirects:, max_requests:)

  Config.auto_source_config(url:, items_selector:, request_controls:)
end
|
|
159
|
+
|
|
160
|
+
def shortcut_request_controls(strategy:, max_redirects:, max_requests:)
  # explicit_keys lists the controls explicit_request_control_keys flags as explicit.
  explicit_keys = explicit_request_control_keys(strategy:, max_redirects:, max_requests:)

  RequestControls.new(strategy:, max_redirects:, max_requests:, explicit_keys:)
end
|
|
168
|
+
end
|
|
112
169
|
end
|
|
170
|
+
|
|
171
|
+
loader.eager_load
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'fileutils'
|
|
5
|
+
require_relative '../html2rss'
|
|
6
|
+
|
|
7
|
+
namespace :config do
  desc 'Generate config JSON schema'
  task :schema do
    schema_file = Html2rss::Config.schema_path

    # Make sure the target directory exists before the schema is serialized and written.
    FileUtils.mkdir_p(File.dirname(schema_file))
    serialized = "#{Html2rss::Config.json_schema_json}\n"
    File.write(schema_file, serialized)

    puts "Generated config schema at #{schema_file}"
  end
end
|