html2rss 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -1
- data/lib/html2rss/articles/deduplicator.rb +1 -0
- data/lib/html2rss/auto_source/cleanup.rb +11 -0
- data/lib/html2rss/auto_source/scraper/html.rb +5 -0
- data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
- data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
- data/lib/html2rss/auto_source/scraper.rb +19 -1
- data/lib/html2rss/auto_source.rb +4 -0
- data/lib/html2rss/blocked_surface.rb +1 -0
- data/lib/html2rss/category_extractor.rb +2 -2
- data/lib/html2rss/cli.rb +30 -6
- data/lib/html2rss/config/class_methods.rb +24 -35
- data/lib/html2rss/config/dynamic_params.rb +6 -4
- data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
- data/lib/html2rss/config/request_headers.rb +9 -3
- data/lib/html2rss/config/schema.rb +33 -1
- data/lib/html2rss/config/validator.rb +40 -2
- data/lib/html2rss/config.rb +19 -13
- data/lib/html2rss/error.rb +25 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
- data/lib/html2rss/html_extractor.rb +5 -0
- data/lib/html2rss/html_navigator.rb +8 -0
- data/lib/html2rss/json_feed_builder.rb +1 -0
- data/lib/html2rss/rendering/audio_renderer.rb +8 -3
- data/lib/html2rss/rendering/description_builder.rb +0 -1
- data/lib/html2rss/rendering/image_renderer.rb +17 -7
- data/lib/html2rss/rendering/media_renderer.rb +4 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
- data/lib/html2rss/rendering/video_renderer.rb +8 -3
- data/lib/html2rss/rendering.rb +11 -2
- data/lib/html2rss/request_controls.rb +16 -21
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/context.rb +14 -2
- data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
- data/lib/html2rss/request_service/policy.rb +4 -0
- data/lib/html2rss/request_service/response.rb +9 -1
- data/lib/html2rss/request_service.rb +19 -0
- data/lib/html2rss/request_session/runtime_input.rb +16 -2
- data/lib/html2rss/request_session/runtime_policy.rb +7 -0
- data/lib/html2rss/request_session.rb +13 -9
- data/lib/html2rss/rss_builder/article.rb +22 -1
- data/lib/html2rss/rss_builder/channel.rb +11 -2
- data/lib/html2rss/rss_builder/enclosure.rb +15 -1
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
- data/lib/html2rss/rss_builder.rb +4 -0
- data/lib/html2rss/selectors/config.rb +1 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
- data/lib/html2rss/selectors/extractors/href.rb +2 -0
- data/lib/html2rss/selectors/extractors/html.rb +1 -0
- data/lib/html2rss/selectors/extractors/static.rb +2 -1
- data/lib/html2rss/selectors/extractors/text.rb +1 -0
- data/lib/html2rss/selectors/extractors.rb +2 -1
- data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
- data/lib/html2rss/selectors/post_processors/base.rb +13 -7
- data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
- data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
- data/lib/html2rss/selectors/post_processors/template.rb +3 -0
- data/lib/html2rss/selectors/post_processors.rb +5 -0
- data/lib/html2rss/selectors.rb +7 -0
- data/lib/html2rss/url.rb +27 -23
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +15 -78
- data/schema/html2rss-config.schema.json +83 -1
- metadata +7 -2
|
@@ -7,9 +7,16 @@ module Html2rss
|
|
|
7
7
|
##
|
|
8
8
|
# Determines whether a WordPress page can safely be mapped to a posts query.
|
|
9
9
|
class PageScope
|
|
10
|
+
# Canonical path segment for category archives.
|
|
10
11
|
CATEGORY_SEGMENT = 'category'
|
|
12
|
+
# Canonical path segment for tag archives.
|
|
11
13
|
TAG_SEGMENT = 'tag'
|
|
14
|
+
# Canonical path segment for author archives.
|
|
12
15
|
AUTHOR_SEGMENT = 'author'
|
|
16
|
+
# Canonical path segment for paginated archives.
|
|
17
|
+
PAGE_SEGMENT = 'page'
|
|
18
|
+
# Canonical query key used for paginated archives.
|
|
19
|
+
PAGED_QUERY_KEY = 'paged'
|
|
13
20
|
|
|
14
21
|
##
|
|
15
22
|
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
@@ -20,7 +27,7 @@ module Html2rss
|
|
|
20
27
|
end
|
|
21
28
|
|
|
22
29
|
##
|
|
23
|
-
# @param query [Hash
|
|
30
|
+
# @param query [Hash{String => String}] scoped query params for the posts endpoint
|
|
24
31
|
# @param fetchable [Boolean] whether a posts follow-up is safe for this page
|
|
25
32
|
# @param reason [Symbol] classification of the resolved page scope
|
|
26
33
|
def initialize(query:, fetchable:, reason:)
|
|
@@ -31,7 +38,7 @@ module Html2rss
|
|
|
31
38
|
end
|
|
32
39
|
|
|
33
40
|
##
|
|
34
|
-
# @return [Hash
|
|
41
|
+
# @return [Hash{String => String}] query params to apply to the posts request
|
|
35
42
|
attr_reader :query
|
|
36
43
|
|
|
37
44
|
##
|
|
@@ -58,11 +65,13 @@ module Html2rss
|
|
|
58
65
|
##
|
|
59
66
|
# @return [PageScope] derived page scope
|
|
60
67
|
def call
|
|
61
|
-
category_scope ||
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
68
|
+
scope = category_scope ||
|
|
69
|
+
tag_scope ||
|
|
70
|
+
author_scope ||
|
|
71
|
+
date_scope ||
|
|
72
|
+
fallback_scope
|
|
73
|
+
|
|
74
|
+
apply_pagination(scope)
|
|
66
75
|
end
|
|
67
76
|
|
|
68
77
|
private
|
|
@@ -103,6 +112,17 @@ module Html2rss
|
|
|
103
112
|
PageScope.new(query: {}, fetchable: true, reason: :unscoped)
|
|
104
113
|
end
|
|
105
114
|
|
|
115
|
+
def apply_pagination(scope)
|
|
116
|
+
page = archive_page_number
|
|
117
|
+
return scope unless scope.fetchable? && page
|
|
118
|
+
|
|
119
|
+
PageScope.new(
|
|
120
|
+
query: scope.query.merge('page' => page.to_s),
|
|
121
|
+
fetchable: scope.fetchable?,
|
|
122
|
+
reason: scope.reason
|
|
123
|
+
)
|
|
124
|
+
end
|
|
125
|
+
|
|
106
126
|
def scoped_scope(query)
|
|
107
127
|
return unknown_archive_scope if query.values.any?(&:nil?)
|
|
108
128
|
|
|
@@ -166,8 +186,12 @@ module Html2rss
|
|
|
166
186
|
@path_segments ||= canonical_or_current_url.path_segments
|
|
167
187
|
end
|
|
168
188
|
|
|
189
|
+
def scoped_path_segments
|
|
190
|
+
@scoped_path_segments ||= paginated_path? ? path_segments[0...-2] : path_segments
|
|
191
|
+
end
|
|
192
|
+
|
|
169
193
|
def leading_path_segment
|
|
170
|
-
|
|
194
|
+
scoped_path_segments.first
|
|
171
195
|
end
|
|
172
196
|
|
|
173
197
|
def date_archive_path?
|
|
@@ -202,16 +226,33 @@ module Html2rss
|
|
|
202
226
|
end
|
|
203
227
|
|
|
204
228
|
def date_archive_segments
|
|
205
|
-
year_index =
|
|
229
|
+
year_index = scoped_path_segments.find_index { _1.match?(/\A\d{4}\z/) }
|
|
206
230
|
return unless year_index
|
|
207
231
|
|
|
208
|
-
segments =
|
|
232
|
+
segments = scoped_path_segments.drop(year_index)
|
|
209
233
|
return unless segments.length.between?(1, 3)
|
|
210
234
|
return unless archive_segment_shape?(segments)
|
|
211
235
|
|
|
212
236
|
segments
|
|
213
237
|
end
|
|
214
238
|
|
|
239
|
+
def paginated_path?
|
|
240
|
+
return false if path_segments.length < 2
|
|
241
|
+
return false unless path_segments[-2] == PAGE_SEGMENT
|
|
242
|
+
|
|
243
|
+
!parse_positive_integer(path_segments[-1]).nil?
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
def archive_page_number
|
|
247
|
+
parse_positive_integer(canonical_or_current_url.query_values[PAGED_QUERY_KEY]) || path_page_number
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
def path_page_number
|
|
251
|
+
return unless paginated_path?
|
|
252
|
+
|
|
253
|
+
parse_positive_integer(path_segments[-1])
|
|
254
|
+
end
|
|
255
|
+
|
|
215
256
|
def archive_segment_shape?(segments)
|
|
216
257
|
month = segments[1]
|
|
217
258
|
day = segments[2]
|
|
@@ -253,6 +294,15 @@ module Html2rss
|
|
|
253
294
|
|
|
254
295
|
number
|
|
255
296
|
end
|
|
297
|
+
|
|
298
|
+
def parse_positive_integer(value)
|
|
299
|
+
return nil unless value.to_s.match?(/\A\d+\z/)
|
|
300
|
+
|
|
301
|
+
number = value.to_i
|
|
302
|
+
return nil if number < 1
|
|
303
|
+
|
|
304
|
+
number
|
|
305
|
+
end
|
|
256
306
|
end
|
|
257
307
|
end
|
|
258
308
|
end
|
|
@@ -7,13 +7,14 @@ module Html2rss
|
|
|
7
7
|
##
|
|
8
8
|
# Resolves the WordPress posts endpoint for a given page and scope.
|
|
9
9
|
class PostsEndpoint
|
|
10
|
+
# REST API collection path for posts resources.
|
|
10
11
|
POSTS_PATH = 'wp/v2/posts'
|
|
11
12
|
|
|
12
13
|
##
|
|
13
14
|
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
14
15
|
# @param page_url [Html2rss::Url] canonical page URL
|
|
15
16
|
# @param page_scope [Html2rss::AutoSource::Scraper::WordpressApi::PageScope] derived page scope
|
|
16
|
-
# @param posts_query [Hash
|
|
17
|
+
# @param posts_query [Hash{String => String}] query params for the posts request
|
|
17
18
|
# @param logger [Logger] logger used for operational warnings
|
|
18
19
|
# @return [Html2rss::Url, nil] resolved posts endpoint or nil when unavailable
|
|
19
20
|
def self.resolve(parsed_body:, page_url:, page_scope:, posts_query:, logger: Html2rss::Log)
|
|
@@ -24,7 +25,7 @@ module Html2rss
|
|
|
24
25
|
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
25
26
|
# @param page_url [Html2rss::Url] canonical page URL
|
|
26
27
|
# @param page_scope [Html2rss::AutoSource::Scraper::WordpressApi::PageScope] derived page scope
|
|
27
|
-
# @param posts_query [Hash
|
|
28
|
+
# @param posts_query [Hash{String => String}] query params for the posts request
|
|
28
29
|
# @param logger [Logger] logger used for operational warnings
|
|
29
30
|
def initialize(parsed_body:, page_url:, page_scope:, posts_query:, logger:)
|
|
30
31
|
@parsed_body = parsed_body
|
|
@@ -10,9 +10,18 @@ module Html2rss
|
|
|
10
10
|
class WordpressApi # rubocop:disable Metrics/ClassLength
|
|
11
11
|
include Enumerable
|
|
12
12
|
|
|
13
|
+
# Selector for WordPress API discovery link tags.
|
|
13
14
|
API_LINK_SELECTOR = 'link[rel="https://api.w.org/"][href]'
|
|
15
|
+
# Selector for canonical link tags used for scope normalization.
|
|
14
16
|
CANONICAL_LINK_SELECTOR = 'link[rel="canonical"][href]'
|
|
17
|
+
# Fields requested from the WordPress posts endpoint.
|
|
15
18
|
POSTS_FIELDS = %w[id title excerpt content link date categories].freeze
|
|
19
|
+
# Baseline query sent to WordPress posts API follow-ups.
|
|
20
|
+
POSTS_QUERY_DEFAULTS = {
|
|
21
|
+
'_fields' => POSTS_FIELDS.join(','),
|
|
22
|
+
'per_page' => '100'
|
|
23
|
+
}.freeze
|
|
24
|
+
# @return [Symbol] scraper config key
|
|
16
25
|
def self.options_key = :wordpress_api
|
|
17
26
|
|
|
18
27
|
##
|
|
@@ -29,6 +38,7 @@ module Html2rss
|
|
|
29
38
|
# @param url [String, Html2rss::Url] canonical page URL
|
|
30
39
|
# @param request_session [Html2rss::RequestSession, nil] shared request session for follow-up fetches
|
|
31
40
|
# @param _opts [Hash] unused scraper-specific options
|
|
41
|
+
# @option _opts [Object] :_reserved reserved for future scraper-specific options
|
|
32
42
|
# @return [void]
|
|
33
43
|
def initialize(parsed_body, url:, request_session: nil, **_opts)
|
|
34
44
|
@parsed_body = parsed_body
|
|
@@ -40,7 +50,7 @@ module Html2rss
|
|
|
40
50
|
##
|
|
41
51
|
# Yields article hashes from the WordPress posts API.
|
|
42
52
|
#
|
|
43
|
-
# @yieldparam article [Hash
|
|
53
|
+
# @yieldparam article [Hash{Symbol => Object}] normalized article hash
|
|
44
54
|
# @return [Enumerator, void] enumerator when no block is given
|
|
45
55
|
def each
|
|
46
56
|
return enum_for(:each) unless block_given?
|
|
@@ -94,7 +104,7 @@ module Html2rss
|
|
|
94
104
|
end
|
|
95
105
|
|
|
96
106
|
def article_id(_post, article_url)
|
|
97
|
-
root_path_query_id(article_url) ||
|
|
107
|
+
root_path_query_id(article_url) || present_string(article_url.path) || article_url.to_s
|
|
98
108
|
end
|
|
99
109
|
|
|
100
110
|
def article_title(post)
|
|
@@ -106,11 +116,11 @@ module Html2rss
|
|
|
106
116
|
end
|
|
107
117
|
|
|
108
118
|
def article_published_at(post)
|
|
109
|
-
|
|
119
|
+
present_string(post[:date])
|
|
110
120
|
end
|
|
111
121
|
|
|
112
122
|
def article_categories(post)
|
|
113
|
-
Array(post[:categories]).filter_map { |value|
|
|
123
|
+
Array(post[:categories]).filter_map { |value| present_string(value) }
|
|
114
124
|
end
|
|
115
125
|
|
|
116
126
|
def article_attributes(post, article_url)
|
|
@@ -125,7 +135,7 @@ module Html2rss
|
|
|
125
135
|
end
|
|
126
136
|
|
|
127
137
|
def absolute_link(link)
|
|
128
|
-
value =
|
|
138
|
+
value = present_string(link)
|
|
129
139
|
return unless value
|
|
130
140
|
|
|
131
141
|
Html2rss::Url.from_relative(value, url)
|
|
@@ -138,17 +148,17 @@ module Html2rss
|
|
|
138
148
|
end
|
|
139
149
|
|
|
140
150
|
def rendered_html(value)
|
|
141
|
-
text =
|
|
151
|
+
text = present_string(value)
|
|
142
152
|
text unless text.nil?
|
|
143
153
|
end
|
|
144
154
|
|
|
145
|
-
def
|
|
155
|
+
def present_string(value)
|
|
146
156
|
text = value.to_s.strip
|
|
147
157
|
text unless text.empty?
|
|
148
158
|
end
|
|
149
159
|
|
|
150
160
|
def root_path_query_id(article_url)
|
|
151
|
-
query =
|
|
161
|
+
query = present_string(article_url.query)
|
|
152
162
|
return unless query
|
|
153
163
|
|
|
154
164
|
path = article_url.path.to_s
|
|
@@ -158,10 +168,7 @@ module Html2rss
|
|
|
158
168
|
end
|
|
159
169
|
|
|
160
170
|
def posts_query
|
|
161
|
-
|
|
162
|
-
'_fields' => POSTS_FIELDS.join(','),
|
|
163
|
-
'per_page' => '100'
|
|
164
|
-
}.merge(page_scope.query)
|
|
171
|
+
POSTS_QUERY_DEFAULTS.merge(page_scope.query).transform_values(&:to_s)
|
|
165
172
|
end
|
|
166
173
|
|
|
167
174
|
def posts_endpoint_url
|
|
@@ -14,12 +14,15 @@ module Html2rss
|
|
|
14
14
|
# Scrapers run in parallel threads, so implementations must avoid shared
|
|
15
15
|
# mutable state and degrade by returning no articles when a follow-up would
|
|
16
16
|
# be unsafe or unsupported.
|
|
17
|
-
#
|
|
18
17
|
module Scraper
|
|
18
|
+
# Root markers indicating likely app-shell/client-rendered surfaces.
|
|
19
19
|
APP_SHELL_ROOT_SELECTORS = '#app, #root, #__next, [data-reactroot], [ng-app], [id*="app-shell"]'
|
|
20
|
+
# Maximum anchors tolerated before app-shell detection is considered unlikely.
|
|
20
21
|
APP_SHELL_MAX_ANCHORS = 2
|
|
22
|
+
# Maximum visible text length tolerated for app-shell classification.
|
|
21
23
|
APP_SHELL_MAX_VISIBLE_TEXT_LENGTH = 220
|
|
22
24
|
|
|
25
|
+
# Ordered scraper classes considered during auto-source extraction.
|
|
23
26
|
SCRAPERS = [
|
|
24
27
|
WordpressApi,
|
|
25
28
|
Schema,
|
|
@@ -32,6 +35,7 @@ module Html2rss
|
|
|
32
35
|
##
|
|
33
36
|
# Error raised when no suitable scraper is found.
|
|
34
37
|
class NoScraperFound < Html2rss::Error
|
|
38
|
+
# User-facing messages grouped by no-scraper surface category.
|
|
35
39
|
CATEGORY_MESSAGES = {
|
|
36
40
|
blocked_surface: 'No scrapers found: blocked surface likely (anti-bot or interstitial). ' \
|
|
37
41
|
'Retry with --strategy browserless, try a more specific public listing URL, ' \
|
|
@@ -44,6 +48,8 @@ module Html2rss
|
|
|
44
48
|
'or use explicit selectors in a feed config.'
|
|
45
49
|
}.freeze
|
|
46
50
|
|
|
51
|
+
# @param message [String, nil] custom error message override
|
|
52
|
+
# @param category [Symbol] no-scraper classification
|
|
47
53
|
def initialize(message = nil, category: :unsupported_surface)
|
|
48
54
|
validate_category!(category)
|
|
49
55
|
@category = category
|
|
@@ -66,6 +72,12 @@ module Html2rss
|
|
|
66
72
|
# Returns an array of scraper classes that claim to find articles in the parsed body.
|
|
67
73
|
# @param parsed_body [Nokogiri::HTML::Document] The parsed HTML body.
|
|
68
74
|
# @param opts [Hash] The options hash.
|
|
75
|
+
# @option opts [Hash] :wordpress_api scraper toggle and configuration
|
|
76
|
+
# @option opts [Hash] :schema scraper toggle and configuration
|
|
77
|
+
# @option opts [Hash] :microdata scraper toggle and configuration
|
|
78
|
+
# @option opts [Hash] :json_state scraper toggle and configuration
|
|
79
|
+
# @option opts [Hash] :semantic_html scraper toggle and configuration
|
|
80
|
+
# @option opts [Hash] :html scraper toggle and configuration
|
|
69
81
|
# @return [Array<Class>] An array of scraper classes that can handle the parsed body.
|
|
70
82
|
def self.from(parsed_body, opts = Html2rss::AutoSource::DEFAULT_CONFIG[:scraper])
|
|
71
83
|
scrapers = SCRAPERS.select { |scraper| opts.dig(scraper.options_key, :enabled) }
|
|
@@ -81,6 +93,12 @@ module Html2rss
|
|
|
81
93
|
# @param url [String, Html2rss::Url] The page url.
|
|
82
94
|
# @param request_session [Html2rss::RequestSession, nil] Shared follow-up session.
|
|
83
95
|
# @param opts [Hash] The options hash.
|
|
96
|
+
# @option opts [Hash] :wordpress_api scraper toggle and configuration
|
|
97
|
+
# @option opts [Hash] :schema scraper toggle and configuration
|
|
98
|
+
# @option opts [Hash] :microdata scraper toggle and configuration
|
|
99
|
+
# @option opts [Hash] :json_state scraper toggle and configuration
|
|
100
|
+
# @option opts [Hash] :semantic_html scraper toggle and configuration
|
|
101
|
+
# @option opts [Hash] :html scraper toggle and configuration
|
|
84
102
|
# @return [Array<Object>] An array of scraper instances that can handle the parsed body.
|
|
85
103
|
#
|
|
86
104
|
# `instances_for` is the main entrypoint for extraction. It lets a scraper
|
data/lib/html2rss/auto_source.rb
CHANGED
|
@@ -17,6 +17,7 @@ module Html2rss
|
|
|
17
17
|
# @see Html2rss::AutoSource::Scraper::SemanticHtml
|
|
18
18
|
# @see Html2rss::AutoSource::Scraper::Html
|
|
19
19
|
class AutoSource
|
|
20
|
+
# Default auto-source configuration shipped for scraper and cleanup behavior.
|
|
20
21
|
DEFAULT_CONFIG = {
|
|
21
22
|
scraper: {
|
|
22
23
|
wordpress_api: {
|
|
@@ -67,6 +68,7 @@ module Html2rss
|
|
|
67
68
|
end.freeze
|
|
68
69
|
private_constant :SCRAPER_CONFIG
|
|
69
70
|
|
|
71
|
+
# Runtime schema used to validate auto-source config values.
|
|
70
72
|
Config = Dry::Schema.Params do
|
|
71
73
|
optional(:scraper).hash(&SCRAPER_CONFIG)
|
|
72
74
|
|
|
@@ -80,6 +82,8 @@ module Html2rss
|
|
|
80
82
|
# @param response [Html2rss::RequestService::Response] initial page response
|
|
81
83
|
# @param opts [Hash] validated auto-source options
|
|
82
84
|
# @param request_session [Html2rss::RequestSession, nil] shared request session for follow-up fetches
|
|
85
|
+
# @option opts [Hash] :scraper scraper configuration map
|
|
86
|
+
# @option opts [Hash] :cleanup cleanup configuration map
|
|
83
87
|
# @return [void]
|
|
84
88
|
def initialize(response, opts = DEFAULT_CONFIG, request_session: nil)
|
|
85
89
|
@parsed_body = response.parsed_body
|
|
@@ -7,6 +7,7 @@ module Html2rss
|
|
|
7
7
|
# This module centralizes signature matching so request-time guards and
|
|
8
8
|
# auto-source surface classification stay consistent.
|
|
9
9
|
module BlockedSurface
|
|
10
|
+
# Known interstitial fingerprints used to detect blocked or anti-bot surfaces.
|
|
10
11
|
INTERSTITIAL_SIGNATURES = [
|
|
11
12
|
{
|
|
12
13
|
key: :cloudflare_interstitial,
|
|
@@ -49,7 +49,7 @@ module Html2rss
|
|
|
49
49
|
##
|
|
50
50
|
# Extracts categories from data attributes of a single element.
|
|
51
51
|
#
|
|
52
|
-
# @param element [Nokogiri::XML::Element]
|
|
52
|
+
# @param element [Nokogiri::XML::Element] metadata element that may contain category links
|
|
53
53
|
# @return [Set<String>] Set of category strings
|
|
54
54
|
def self.extract_element_data_categories(element)
|
|
55
55
|
Set.new.tap do |categories|
|
|
@@ -65,7 +65,7 @@ module Html2rss
|
|
|
65
65
|
##
|
|
66
66
|
# Extracts text-based categories from elements, splitting content into discrete values.
|
|
67
67
|
#
|
|
68
|
-
# @param element [Nokogiri::XML::Element]
|
|
68
|
+
# @param element [Nokogiri::XML::Element] metadata element whose text may contain delimiters
|
|
69
69
|
# @return [Set<String>] Set of category strings
|
|
70
70
|
def self.extract_text_categories(element)
|
|
71
71
|
anchor_values = element.css('a').filter_map do |node|
|
data/lib/html2rss/cli.rb
CHANGED
|
@@ -11,7 +11,23 @@ module Html2rss
|
|
|
11
11
|
# The Html2rss command line interface.
|
|
12
12
|
class CLI < Thor # rubocop:disable Metrics/ClassLength
|
|
13
13
|
check_unknown_options!
|
|
14
|
+
# Ordered fallback chain attempted by auto strategy.
|
|
15
|
+
#
|
|
16
|
+
# @return [Array<Symbol>]
|
|
17
|
+
AUTO_FALLBACK_CHAIN = Html2rss::FeedPipeline::AutoFallback::CHAIN.freeze
|
|
18
|
+
# Supported CLI strategy option values.
|
|
19
|
+
#
|
|
20
|
+
# @return [Array<String>]
|
|
21
|
+
STRATEGY_OPTION_ENUM = (['auto'] + Html2rss::RequestService.strategy_names).uniq.freeze
|
|
22
|
+
# User-facing strategy help text that reflects the current fallback chain.
|
|
23
|
+
#
|
|
24
|
+
# @return [String]
|
|
25
|
+
STRATEGY_OPTION_DESC = [
|
|
26
|
+
'Optional request strategy (defaults to auto; auto tries',
|
|
27
|
+
"#{AUTO_FALLBACK_CHAIN.join(' -> ')})"
|
|
28
|
+
].join(' ').freeze
|
|
14
29
|
|
|
30
|
+
# @return [Boolean] whether Thor should terminate process on command failures
|
|
15
31
|
def self.exit_on_failure?
|
|
16
32
|
true
|
|
17
33
|
end
|
|
@@ -24,14 +40,17 @@ module Html2rss
|
|
|
24
40
|
default: {}
|
|
25
41
|
method_option :strategy,
|
|
26
42
|
type: :string,
|
|
27
|
-
desc:
|
|
28
|
-
enum:
|
|
43
|
+
desc: STRATEGY_OPTION_DESC,
|
|
44
|
+
enum: STRATEGY_OPTION_ENUM
|
|
29
45
|
method_option :max_redirects,
|
|
30
46
|
type: :numeric,
|
|
31
47
|
desc: 'Maximum redirects to follow per request'
|
|
32
48
|
method_option :max_requests,
|
|
33
49
|
type: :numeric,
|
|
34
50
|
desc: 'Maximum requests to allow for this feed build'
|
|
51
|
+
# @param yaml_file [String] path to YAML config
|
|
52
|
+
# @param feed_name [String, nil] optional named feed in multi-feed config
|
|
53
|
+
# @return [void]
|
|
35
54
|
def feed(yaml_file, feed_name = nil)
|
|
36
55
|
config = Html2rss.config_from_yaml_file(yaml_file, feed_name)
|
|
37
56
|
config[:params] = options[:params] || {}
|
|
@@ -43,8 +62,8 @@ module Html2rss
|
|
|
43
62
|
desc 'auto [URL]', 'Automatically sources an RSS feed from the URL'
|
|
44
63
|
method_option :strategy,
|
|
45
64
|
type: :string,
|
|
46
|
-
desc:
|
|
47
|
-
enum:
|
|
65
|
+
desc: STRATEGY_OPTION_DESC,
|
|
66
|
+
enum: STRATEGY_OPTION_ENUM
|
|
48
67
|
method_option :format,
|
|
49
68
|
type: :string,
|
|
50
69
|
desc: 'Output format for the auto-sourced feed',
|
|
@@ -57,6 +76,8 @@ module Html2rss
|
|
|
57
76
|
method_option :max_requests,
|
|
58
77
|
type: :numeric,
|
|
59
78
|
desc: 'Maximum requests to allow for this feed build'
|
|
79
|
+
# @param url [String] source page URL for auto discovery
|
|
80
|
+
# @return [void]
|
|
60
81
|
def auto(url) # rubocop:disable Metrics/MethodLength
|
|
61
82
|
format = options.fetch(:format, 'rss')
|
|
62
83
|
source_method = format == 'jsonfeed' ? Html2rss.method(:auto_json_feed) : Html2rss.method(:auto_source)
|
|
@@ -156,7 +177,7 @@ module Html2rss
|
|
|
156
177
|
end
|
|
157
178
|
|
|
158
179
|
def current_strategy
|
|
159
|
-
options[:strategy]&.to_sym || :
|
|
180
|
+
options[:strategy]&.to_sym || :auto
|
|
160
181
|
end
|
|
161
182
|
|
|
162
183
|
def current_max_redirects
|
|
@@ -186,7 +207,10 @@ module Html2rss
|
|
|
186
207
|
'or increase request.max_requests in the config.'
|
|
187
208
|
rescue Html2rss::RequestService::BrowserlessConfigurationError,
|
|
188
209
|
Html2rss::RequestService::BrowserlessConnectionFailed,
|
|
189
|
-
Html2rss::RequestService::
|
|
210
|
+
Html2rss::RequestService::BotasaurusConfigurationError,
|
|
211
|
+
Html2rss::RequestService::BotasaurusConnectionFailed,
|
|
212
|
+
Html2rss::RequestService::BlockedSurfaceDetected,
|
|
213
|
+
Html2rss::NoFeedItemsExtracted => error
|
|
190
214
|
raise Thor::Error, error.message
|
|
191
215
|
end
|
|
192
216
|
end
|
|
@@ -5,12 +5,13 @@ module Html2rss
|
|
|
5
5
|
##
|
|
6
6
|
# Public class-level helpers for loading, validating, and exporting config.
|
|
7
7
|
module ClassMethods
|
|
8
|
+
# Sentinel to differentiate omitted params from explicit `nil`.
|
|
8
9
|
UNSET = Object.new.freeze
|
|
9
10
|
|
|
10
11
|
##
|
|
11
12
|
# Returns the exported JSON Schema for html2rss configuration.
|
|
12
13
|
#
|
|
13
|
-
# @return [Hash
|
|
14
|
+
# @return [Hash{String => Object}] JSON Schema represented as a Ruby hash
|
|
14
15
|
def json_schema
|
|
15
16
|
Schema.json_schema
|
|
16
17
|
end
|
|
@@ -27,15 +28,15 @@ module Html2rss
|
|
|
27
28
|
##
|
|
28
29
|
# Validates a configuration hash with the runtime validator.
|
|
29
30
|
#
|
|
30
|
-
# @param config [Hash
|
|
31
|
-
# @param params [Hash
|
|
31
|
+
# @param config [Hash{Symbol => Object}] the configuration hash
|
|
32
|
+
# @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
|
|
32
33
|
# @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
|
|
33
34
|
def validate(config, params: UNSET)
|
|
34
35
|
prepared_config = prepare_for_validation(resolve_effective_config(config, params:))
|
|
35
36
|
|
|
36
37
|
Validator.new.call(prepared_config)
|
|
37
38
|
rescue DynamicParams::ParamsMissing => error
|
|
38
|
-
prepared_config = prepare_for_validation(
|
|
39
|
+
prepared_config = prepare_for_validation(HashUtil.deep_symbolize_keys(config, context: 'config'))
|
|
39
40
|
prepared_config[:dynamic_params_error] = error.message
|
|
40
41
|
|
|
41
42
|
Validator.new.call(prepared_config)
|
|
@@ -55,7 +56,7 @@ module Html2rss
|
|
|
55
56
|
# @param file [String] the YAML file to load
|
|
56
57
|
# @param feed_name [String, nil] optional feed name for multi-feed files
|
|
57
58
|
# @param multiple_feeds_key [Symbol] key under which multiple feeds are defined
|
|
58
|
-
# @param params [Hash
|
|
59
|
+
# @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting
|
|
59
60
|
# @return [Dry::Validation::Result] validation result after defaults and deprecations are applied
|
|
60
61
|
def validate_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS, params: UNSET)
|
|
61
62
|
validate(load_yaml(file, feed_name, multiple_feeds_key:), params:)
|
|
@@ -69,7 +70,7 @@ module Html2rss
|
|
|
69
70
|
# @param file [String] the YAML file to load.
|
|
70
71
|
# @param feed_name [String, nil] the feed name when using multiple feeds.
|
|
71
72
|
# @param multiple_feeds_key [Symbol] the key under which multiple feeds are defined.
|
|
72
|
-
# @return [Hash
|
|
73
|
+
# @return [Hash{Symbol => Object}] the configuration hash.
|
|
73
74
|
# @raise [ArgumentError] if the file doesn't exist or feed is not found.
|
|
74
75
|
# rubocop:disable Metrics/MethodLength
|
|
75
76
|
def load_yaml(file, feed_name = nil, multiple_feeds_key: MultipleFeedsConfig::CONFIG_KEY_FEEDS)
|
|
@@ -97,8 +98,8 @@ module Html2rss
|
|
|
97
98
|
# Processes the provided configuration hash, applying dynamic parameters if given,
|
|
98
99
|
# and returns a new configuration object.
|
|
99
100
|
#
|
|
100
|
-
# @param config [Hash
|
|
101
|
-
# @param params [Hash
|
|
101
|
+
# @param config [Hash{Symbol => Object}] the configuration hash.
|
|
102
|
+
# @param params [Hash{Symbol => Object}, Hash{String => Object}, nil] dynamic parameters for string formatting.
|
|
102
103
|
# @return [Html2rss::Config] the configuration object.
|
|
103
104
|
def from_hash(config, params: UNSET)
|
|
104
105
|
new(resolve_effective_config(config, params:))
|
|
@@ -110,7 +111,7 @@ module Html2rss
|
|
|
110
111
|
# @param url [String] source page URL
|
|
111
112
|
# @param items_selector [String, nil] optional selector hint for item extraction
|
|
112
113
|
# @param request_controls [Html2rss::RequestControls, nil] explicit request controls to write
|
|
113
|
-
# @return [Hash
|
|
114
|
+
# @return [Hash{Symbol => Object}] feed config hash ready for {from_hash}
|
|
114
115
|
def auto_source_config(url:, items_selector: nil, request_controls: nil)
|
|
115
116
|
config = {
|
|
116
117
|
channel: default_config[:channel].merge(url:),
|
|
@@ -127,10 +128,10 @@ module Html2rss
|
|
|
127
128
|
##
|
|
128
129
|
# Provides a default configuration.
|
|
129
130
|
#
|
|
130
|
-
# @return [Hash
|
|
131
|
+
# @return [Hash{Symbol => Object}] a hash with default configuration values.
|
|
131
132
|
def default_config
|
|
132
133
|
{
|
|
133
|
-
strategy:
|
|
134
|
+
strategy: default_strategy_name,
|
|
134
135
|
request: {
|
|
135
136
|
max_redirects: RequestService::Policy::DEFAULTS[:max_redirects],
|
|
136
137
|
max_requests: RequestService::Policy::DEFAULTS[:max_requests]
|
|
@@ -141,15 +142,22 @@ module Html2rss
|
|
|
141
142
|
}
|
|
142
143
|
end
|
|
143
144
|
|
|
145
|
+
# @return [Symbol] the default strategy for feed orchestration
|
|
146
|
+
def default_strategy_name
|
|
147
|
+
:auto
|
|
148
|
+
end
|
|
149
|
+
|
|
144
150
|
private
|
|
145
151
|
|
|
146
152
|
def resolve_effective_config(config, params:)
|
|
147
|
-
effective_config =
|
|
153
|
+
effective_config = HashUtil.deep_symbolize_keys(config, context: 'config')
|
|
148
154
|
resolved_params = parameter_defaults(effective_config)
|
|
149
|
-
|
|
155
|
+
unless params.equal?(UNSET) || params.nil?
|
|
156
|
+
resolved_params.merge!(HashUtil.deep_symbolize_keys(params, context: 'params'))
|
|
157
|
+
end
|
|
150
158
|
|
|
151
|
-
DynamicParams.call(effective_config[:headers], resolved_params)
|
|
152
|
-
DynamicParams.call(effective_config[:channel], resolved_params)
|
|
159
|
+
effective_config[:headers] = DynamicParams.call(effective_config[:headers], resolved_params)
|
|
160
|
+
effective_config[:channel] = DynamicParams.call(effective_config[:channel], resolved_params)
|
|
153
161
|
|
|
154
162
|
effective_config
|
|
155
163
|
end
|
|
@@ -163,27 +171,8 @@ module Html2rss
|
|
|
163
171
|
end
|
|
164
172
|
|
|
165
173
|
def prepare_for_validation(config)
|
|
166
|
-
Config::Preparer.new.call(deep_dup(config))
|
|
167
|
-
end
|
|
168
|
-
|
|
169
|
-
# rubocop:disable Metrics/MethodLength
|
|
170
|
-
def deep_dup(object)
|
|
171
|
-
case object
|
|
172
|
-
when Hash
|
|
173
|
-
object.transform_values do |value|
|
|
174
|
-
deep_dup(value)
|
|
175
|
-
end
|
|
176
|
-
when Array
|
|
177
|
-
object.map { |value| deep_dup(value) }
|
|
178
|
-
else
|
|
179
|
-
begin
|
|
180
|
-
object.dup
|
|
181
|
-
rescue TypeError
|
|
182
|
-
object
|
|
183
|
-
end
|
|
184
|
-
end
|
|
174
|
+
Config::Preparer.new.call(HashUtil.deep_dup(config))
|
|
185
175
|
end
|
|
186
|
-
# rubocop:enable Metrics/MethodLength
|
|
187
176
|
end
|
|
188
177
|
end
|
|
189
178
|
end
|
|
@@ -4,13 +4,14 @@ module Html2rss
|
|
|
4
4
|
class Config
|
|
5
5
|
# Processes and applies dynamic parameter formatting in configuration values.
|
|
6
6
|
class DynamicParams
|
|
7
|
+
# Raised when string interpolation references an unavailable parameter.
|
|
7
8
|
class ParamsMissing < Html2rss::Error; end
|
|
8
9
|
|
|
9
10
|
class << self
|
|
10
11
|
# Recursively traverses the given value and formats any strings containing
|
|
11
12
|
# placeholders with values from the provided params.
|
|
12
13
|
#
|
|
13
|
-
# @param value [String, Hash, Enumerable, Object]
|
|
14
|
+
# @param value [String, Hash, Enumerable, Object] value that may contain parameter placeholders
|
|
14
15
|
# @param params [Hash] The parameters for substitution.
|
|
15
16
|
# @param getter [Proc, nil] Optional proc to retrieve a key's value.
|
|
16
17
|
# @param replace_missing_with [Object, nil] Value to substitute if a key is missing.
|
|
@@ -55,12 +56,13 @@ module Html2rss
|
|
|
55
56
|
end
|
|
56
57
|
|
|
57
58
|
def from_hash(hash, params, getter:, replace_missing_with:)
|
|
58
|
-
|
|
59
|
-
|
|
59
|
+
HashUtil.deep_symbolize_keys(hash, context: 'dynamic params hash').to_h do |key, value|
|
|
60
|
+
[key, call(value, params, getter:, replace_missing_with:)]
|
|
61
|
+
end
|
|
60
62
|
end
|
|
61
63
|
|
|
62
64
|
def from_enumerable(enumerable, params, getter:, replace_missing_with:)
|
|
63
|
-
enumerable.map
|
|
65
|
+
enumerable.map { |value| call(value, params, getter:, replace_missing_with:) }
|
|
64
66
|
end
|
|
65
67
|
end
|
|
66
68
|
end
|
|
@@ -7,6 +7,7 @@ module Html2rss
|
|
|
7
7
|
# where each feed name is the key for its feed configuration.
|
|
8
8
|
# All global configuration keys (outside :feeds) are merged into each feed's settings.
|
|
9
9
|
class MultipleFeedsConfig
|
|
10
|
+
# Reserved YAML key under which multiple named feed configs are defined.
|
|
10
11
|
CONFIG_KEY_FEEDS = :feeds
|
|
11
12
|
|
|
12
13
|
class << self
|
|
@@ -37,11 +38,11 @@ module Html2rss
|
|
|
37
38
|
local_value = config[key]
|
|
38
39
|
case local_value
|
|
39
40
|
when Hash
|
|
40
|
-
global_value.is_a?(Hash) ?
|
|
41
|
+
global_value.is_a?(Hash) ? HashUtil.deep_merge(global_value, local_value) : local_value
|
|
41
42
|
when Array
|
|
42
43
|
global_value.is_a?(Array) ? global_value + local_value : local_value
|
|
43
44
|
else
|
|
44
|
-
global_value
|
|
45
|
+
local_value.nil? ? global_value : local_value
|
|
45
46
|
end
|
|
46
47
|
end
|
|
47
48
|
end
|