html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class AutoSource
|
|
5
|
+
module Scraper
|
|
6
|
+
class WordpressApi
|
|
7
|
+
##
|
|
8
|
+
# Determines whether a WordPress page can safely be mapped to a posts query.
|
|
9
|
+
class PageScope
|
|
10
|
+
# Canonical path segment for category archives.
|
|
11
|
+
CATEGORY_SEGMENT = 'category'
|
|
12
|
+
# Canonical path segment for tag archives.
|
|
13
|
+
TAG_SEGMENT = 'tag'
|
|
14
|
+
# Canonical path segment for author archives.
|
|
15
|
+
AUTHOR_SEGMENT = 'author'
|
|
16
|
+
# Canonical path segment for paginated archives.
|
|
17
|
+
PAGE_SEGMENT = 'page'
|
|
18
|
+
# Canonical query key used for paginated archives.
|
|
19
|
+
PAGED_QUERY_KEY = 'paged'
|
|
20
|
+
|
|
21
|
+
##
|
|
22
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
23
|
+
# @param url [Html2rss::Url] canonical page URL
|
|
24
|
+
# @return [PageScope] derived page scope
|
|
25
|
+
def self.from(parsed_body:, url:)
|
|
26
|
+
Resolver.new(parsed_body:, url:).call
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
##
|
|
30
|
+
# @param query [Hash{String => String}] scoped query params for the posts endpoint
|
|
31
|
+
# @param fetchable [Boolean] whether a posts follow-up is safe for this page
|
|
32
|
+
# @param reason [Symbol] classification of the resolved page scope
|
|
33
|
+
def initialize(query:, fetchable:, reason:)
|
|
34
|
+
@query = query.freeze
|
|
35
|
+
@fetchable = fetchable
|
|
36
|
+
@reason = reason
|
|
37
|
+
freeze
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
##
|
|
41
|
+
# @return [Hash{String => String}] query params to apply to the posts request
|
|
42
|
+
attr_reader :query
|
|
43
|
+
|
|
44
|
+
##
|
|
45
|
+
# @return [Boolean] whether the page may safely use the posts API follow-up
|
|
46
|
+
def fetchable?
|
|
47
|
+
@fetchable
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
##
|
|
51
|
+
# @return [Symbol] classification of the resolved page scope
|
|
52
|
+
attr_reader :reason
|
|
53
|
+
|
|
54
|
+
##
|
|
55
|
+
# Resolves the page scope from page markup and canonical URL signals.
|
|
56
|
+
class Resolver # rubocop:disable Metrics/ClassLength
|
|
57
|
+
##
|
|
58
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
59
|
+
# @param url [Html2rss::Url] canonical page URL
|
|
60
|
+
def initialize(parsed_body:, url:)
|
|
61
|
+
@parsed_body = parsed_body
|
|
62
|
+
@url = Html2rss::Url.from_absolute(url)
|
|
63
|
+
end
|
|
64
|
+
|
|
65
|
+
##
|
|
66
|
+
# @return [PageScope] derived page scope
|
|
67
|
+
def call
|
|
68
|
+
scope = category_scope ||
|
|
69
|
+
tag_scope ||
|
|
70
|
+
author_scope ||
|
|
71
|
+
date_scope ||
|
|
72
|
+
fallback_scope
|
|
73
|
+
|
|
74
|
+
apply_pagination(scope)
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
private
|
|
78
|
+
|
|
79
|
+
attr_reader :parsed_body, :url
|
|
80
|
+
|
|
81
|
+
def category_scope
|
|
82
|
+
return unless category_archive?
|
|
83
|
+
|
|
84
|
+
scoped_scope('categories' => archive_id('category'))
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
def tag_scope
|
|
88
|
+
return unless tag_archive?
|
|
89
|
+
|
|
90
|
+
scoped_scope('tags' => archive_id('tag'))
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
def author_scope
|
|
94
|
+
return unless author_archive?
|
|
95
|
+
|
|
96
|
+
scoped_scope('author' => archive_id('author'))
|
|
97
|
+
end
|
|
98
|
+
|
|
99
|
+
def date_scope
|
|
100
|
+
return unless date_archive?
|
|
101
|
+
|
|
102
|
+
range = date_archive_range
|
|
103
|
+
return unknown_archive_scope unless range
|
|
104
|
+
|
|
105
|
+
PageScope.new(query: range, fetchable: true, reason: :archive)
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
def fallback_scope
|
|
109
|
+
return unknown_archive_scope if archive_like?
|
|
110
|
+
return non_archive_scope if singular_like?
|
|
111
|
+
|
|
112
|
+
PageScope.new(query: {}, fetchable: true, reason: :unscoped)
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
def apply_pagination(scope)
|
|
116
|
+
page = archive_page_number
|
|
117
|
+
return scope unless scope.fetchable? && page
|
|
118
|
+
|
|
119
|
+
PageScope.new(
|
|
120
|
+
query: scope.query.merge('page' => page.to_s),
|
|
121
|
+
fetchable: scope.fetchable?,
|
|
122
|
+
reason: scope.reason
|
|
123
|
+
)
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
def scoped_scope(query)
|
|
127
|
+
return unknown_archive_scope if query.values.any?(&:nil?)
|
|
128
|
+
|
|
129
|
+
PageScope.new(query:, fetchable: true, reason: :archive)
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
def unknown_archive_scope
|
|
133
|
+
PageScope.new(query: {}, fetchable: false, reason: :unsupported_archive)
|
|
134
|
+
end
|
|
135
|
+
|
|
136
|
+
def non_archive_scope
|
|
137
|
+
PageScope.new(query: {}, fetchable: false, reason: :non_archive)
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
def category_archive?
|
|
141
|
+
body_classes.include?('category') || leading_path_segment == CATEGORY_SEGMENT
|
|
142
|
+
end
|
|
143
|
+
|
|
144
|
+
def tag_archive?
|
|
145
|
+
body_classes.include?('tag') || leading_path_segment == TAG_SEGMENT
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
def author_archive?
|
|
149
|
+
body_classes.include?('author') || leading_path_segment == AUTHOR_SEGMENT
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
def date_archive?
|
|
153
|
+
body_classes.include?('date') || date_archive_path?
|
|
154
|
+
end
|
|
155
|
+
|
|
156
|
+
def archive_like?
|
|
157
|
+
category_archive? || tag_archive? || author_archive? || date_archive? || body_classes.include?('archive')
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
def singular_like?
|
|
161
|
+
body_classes.intersect?(%w[page single singular attachment]) ||
|
|
162
|
+
body_classes.any? { _1.match?(/\A(?:page-id|postid)-\d+\z/) }
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
def body_classes
|
|
166
|
+
@body_classes ||= parsed_body.at_css('body')&.[]('class').to_s.split
|
|
167
|
+
end
|
|
168
|
+
|
|
169
|
+
# Extracts the numeric id from a body class of the form "<prefix>-<digits>"
# (for example "category-12" when prefix is "category").
#
# @param prefix [String] literal class prefix that precedes the numeric id
# @return [String, nil] digits of the first matching body class, or nil when none match
def archive_id(prefix)
  # Compile once instead of once per class; \A/\z anchor the whole class name.
  pattern = /\A#{Regexp.escape(prefix)}-(\d+)\z/

  body_classes.each do |klass|
    id = klass[pattern, 1]
    return id if id
  end

  nil
end
|
|
174
|
+
|
|
175
|
+
def canonical_or_current_url
|
|
176
|
+
href = parsed_body.at_css(WordpressApi::CANONICAL_LINK_SELECTOR)&.[]('href').to_s.strip
|
|
177
|
+
return url if href.empty?
|
|
178
|
+
|
|
179
|
+
canonical_url = Html2rss::Url.from_relative(href, url)
|
|
180
|
+
same_origin_url?(canonical_url, url) ? canonical_url : url
|
|
181
|
+
rescue ArgumentError
|
|
182
|
+
url
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
def path_segments
|
|
186
|
+
@path_segments ||= canonical_or_current_url.path_segments
|
|
187
|
+
end
|
|
188
|
+
|
|
189
|
+
def scoped_path_segments
|
|
190
|
+
@scoped_path_segments ||= paginated_path? ? path_segments[0...-2] : path_segments
|
|
191
|
+
end
|
|
192
|
+
|
|
193
|
+
def leading_path_segment
|
|
194
|
+
scoped_path_segments.first
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
def date_archive_path?
|
|
198
|
+
!date_archive_segments.nil?
|
|
199
|
+
end
|
|
200
|
+
|
|
201
|
+
def date_archive_range
|
|
202
|
+
components = date_archive_components
|
|
203
|
+
return unless components
|
|
204
|
+
|
|
205
|
+
start_date = Date.new(*components.fetch(:start_date_parts))
|
|
206
|
+
{
|
|
207
|
+
'after' => iso8601_start(start_date),
|
|
208
|
+
'before' => iso8601_start(next_archive_boundary(start_date, components.fetch(:precision)))
|
|
209
|
+
}
|
|
210
|
+
rescue Date::Error
|
|
211
|
+
nil
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
# Breaks the date-archive path segments into the parts needed to build a
# date range.
#
# A month or day segment that is present but outside its allowed range
# (e.g. "/2024/13/" or "/2024/05/99/") makes the path unusable. It is not
# silently dropped, because that would widen the resulting query to the
# enclosing year or month.
#
# @return [Hash{Symbol => Object}, nil] +:start_date_parts+ as [year, month, day]
#   and +:precision+ (:year, :month or :day); nil when there are no usable
#   date segments
def date_archive_components
  segments = date_archive_segments
  return unless segments

  year = segments.fetch(0).to_i
  month = parse_archive_segment(segments[1], 1, 12)
  day = parse_archive_segment(segments[2], 1, 31)
  # A present-but-rejected segment must not degrade to a coarser precision.
  return if (segments[1] && month.nil?) || (segments[2] && day.nil?)

  {
    start_date_parts: [year, month || 1, day || 1],
    precision: archive_precision(month:, day:)
  }
end
|
|
227
|
+
|
|
228
|
+
def date_archive_segments
|
|
229
|
+
year_index = scoped_path_segments.find_index { _1.match?(/\A\d{4}\z/) }
|
|
230
|
+
return unless year_index
|
|
231
|
+
|
|
232
|
+
segments = scoped_path_segments.drop(year_index)
|
|
233
|
+
return unless segments.length.between?(1, 3)
|
|
234
|
+
return unless archive_segment_shape?(segments)
|
|
235
|
+
|
|
236
|
+
segments
|
|
237
|
+
end
|
|
238
|
+
|
|
239
|
+
def paginated_path?
|
|
240
|
+
return false if path_segments.length < 2
|
|
241
|
+
return false unless path_segments[-2] == PAGE_SEGMENT
|
|
242
|
+
|
|
243
|
+
!parse_positive_integer(path_segments[-1]).nil?
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
def archive_page_number
|
|
247
|
+
parse_positive_integer(canonical_or_current_url.query_values[PAGED_QUERY_KEY]) || path_page_number
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
def path_page_number
|
|
251
|
+
return unless paginated_path?
|
|
252
|
+
|
|
253
|
+
parse_positive_integer(path_segments[-1])
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
# Checks that the optional month and day segments (positions 1 and 2) are
# purely numeric and that a day never appears without a month.
#
# @param segments [Array<String>] path segments starting at the year
# @return [Boolean] whether the segments look like a year[/month[/day]] archive
def archive_segment_shape?(segments)
  month, day = segments.values_at(1, 2)
  return false if day && month.nil?

  [month, day].all? { |part| part.nil? || part.match?(/\A\d+\z/) }
end
|
|
265
|
+
|
|
266
|
+
def same_origin_url?(left, right)
|
|
267
|
+
[left.scheme, left.host, left.port] == [right.scheme, right.host, right.port]
|
|
268
|
+
end
|
|
269
|
+
|
|
270
|
+
# Picks the finest date granularity for which a value is present.
#
# @param month [Object, nil] month component, nil when absent
# @param day [Object, nil] day component, nil when absent
# @return [Symbol] :day, :month or :year
def archive_precision(month:, day:)
  if day
    :day
  elsif month
    :month
  else
    :year
  end
end
|
|
276
|
+
|
|
277
|
+
# Returns the date one precision step after +start_date+.
#
# @param start_date [Date] first day of the archive range
# @param precision [Symbol] :year, :month or :day
# @return [Date] start_date advanced by one year, month or day
# @raise [KeyError] when precision is not one of the supported symbols
def next_archive_boundary(start_date, precision)
  step = { year: :next_year, month: :next_month, day: :next_day }.fetch(precision)
  start_date.public_send(step)
end
|
|
284
|
+
|
|
285
|
+
def iso8601_start(date)
|
|
286
|
+
date.strftime('%Y-%m-%dT00:00:00Z')
|
|
287
|
+
end
|
|
288
|
+
|
|
289
|
+
# Converts an all-digit path segment to an Integer, keeping it only when it
# falls inside the inclusive bounds.
#
# @param value [String, nil] raw path segment
# @param minimum [Integer] smallest accepted value
# @param maximum [Integer] largest accepted value
# @return [Integer, nil] parsed number, or nil when absent, non-numeric or out of range
def parse_archive_segment(value, minimum, maximum)
  return nil if value.nil? || !value.match?(/\A\d+\z/)

  parsed = value.to_i
  parsed.between?(minimum, maximum) ? parsed : nil
end
|
|
297
|
+
|
|
298
|
+
# Parses a value whose string form consists only of digits and keeps it
# when it is at least 1.
#
# @param value [#to_s, #to_i, nil] raw query or path value
# @return [Integer, nil] the number, or nil when blank, non-numeric or zero
def parse_positive_integer(value)
  return nil unless /\A\d+\z/.match?(value.to_s)

  parsed = value.to_i
  parsed.positive? ? parsed : nil
end
|
|
306
|
+
end
|
|
307
|
+
end
|
|
308
|
+
end
|
|
309
|
+
end
|
|
310
|
+
end
|
|
311
|
+
end
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class AutoSource
|
|
5
|
+
module Scraper
|
|
6
|
+
class WordpressApi
|
|
7
|
+
##
|
|
8
|
+
# Resolves the WordPress posts endpoint for a given page and scope.
|
|
9
|
+
class PostsEndpoint
|
|
10
|
+
# REST API collection path for posts resources.
|
|
11
|
+
POSTS_PATH = 'wp/v2/posts'
|
|
12
|
+
|
|
13
|
+
##
|
|
14
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
15
|
+
# @param page_url [Html2rss::Url] canonical page URL
|
|
16
|
+
# @param page_scope [Html2rss::AutoSource::Scraper::WordpressApi::PageScope] derived page scope
|
|
17
|
+
# @param posts_query [Hash{String => String}] query params for the posts request
|
|
18
|
+
# @param logger [Logger] logger used for operational warnings
|
|
19
|
+
# @return [Html2rss::Url, nil] resolved posts endpoint or nil when unavailable
|
|
20
|
+
def self.resolve(parsed_body:, page_url:, page_scope:, posts_query:, logger: Html2rss::Log)
|
|
21
|
+
new(parsed_body:, page_url:, page_scope:, posts_query:, logger:).call
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
##
|
|
25
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
26
|
+
# @param page_url [Html2rss::Url] canonical page URL
|
|
27
|
+
# @param page_scope [Html2rss::AutoSource::Scraper::WordpressApi::PageScope] derived page scope
|
|
28
|
+
# @param posts_query [Hash{String => String}] query params for the posts request
|
|
29
|
+
# @param logger [Logger] logger used for operational warnings
|
|
30
|
+
def initialize(parsed_body:, page_url:, page_scope:, posts_query:, logger:)
|
|
31
|
+
@parsed_body = parsed_body
|
|
32
|
+
@page_url = Html2rss::Url.from_absolute(page_url)
|
|
33
|
+
@page_scope = page_scope
|
|
34
|
+
@posts_query = posts_query
|
|
35
|
+
@logger = logger
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
##
|
|
39
|
+
# @return [Html2rss::Url, nil] resolved posts endpoint or nil when unavailable
|
|
40
|
+
def call
|
|
41
|
+
api_root = api_root_url
|
|
42
|
+
return unless api_root
|
|
43
|
+
return unless fetchable_page_scope?
|
|
44
|
+
|
|
45
|
+
query_style_api_root?(api_root) ? query_root_posts_url(api_root) : posts_collection_url(api_root)
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
private
|
|
49
|
+
|
|
50
|
+
attr_reader :parsed_body, :page_url, :page_scope, :posts_query, :logger
|
|
51
|
+
|
|
52
|
+
def api_root_url
|
|
53
|
+
href = parsed_body.at_css(WordpressApi::API_LINK_SELECTOR)&.[]('href').to_s.strip
|
|
54
|
+
return log_missing_api_root if href.empty?
|
|
55
|
+
|
|
56
|
+
Html2rss::Url.from_relative(href, page_url)
|
|
57
|
+
rescue Addressable::URI::InvalidURIError, ArgumentError => error
|
|
58
|
+
logger.warn("#{WordpressApi}: invalid WordPress API endpoint #{href.inspect} (#{error.message})")
|
|
59
|
+
nil
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def fetchable_page_scope?
|
|
63
|
+
return true if page_scope.fetchable?
|
|
64
|
+
|
|
65
|
+
if page_scope.reason == :non_archive
|
|
66
|
+
logger.debug(
|
|
67
|
+
"#{WordpressApi}: page advertised WordPress API support " \
|
|
68
|
+
'without a safe WordPress archive scope'
|
|
69
|
+
)
|
|
70
|
+
return false
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
logger.warn("#{WordpressApi}: unable to derive safe WordPress archive scope for #{page_url}")
|
|
74
|
+
false
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def log_missing_api_root
|
|
78
|
+
logger.debug("#{WordpressApi}: page advertised WordPress API support without a usable API root")
|
|
79
|
+
nil
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def query_style_api_root?(api_root)
|
|
83
|
+
api_root.query_values.key?('rest_route')
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def query_root_posts_url(api_root)
|
|
87
|
+
query = api_root.query_values
|
|
88
|
+
route = normalized_rest_route(query.fetch('rest_route', '/'))
|
|
89
|
+
api_root.with_query_values(
|
|
90
|
+
query.merge(
|
|
91
|
+
'rest_route' => append_posts_route(route),
|
|
92
|
+
**posts_query
|
|
93
|
+
)
|
|
94
|
+
)
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def posts_collection_url(api_root)
|
|
98
|
+
Html2rss::Url.from_relative(POSTS_PATH, normalized_api_root(api_root))
|
|
99
|
+
.with_query_values(api_root.query_values.merge(posts_query))
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
def normalized_api_root(api_root)
|
|
103
|
+
api_root.with_path(normalized_api_path(api_root.path))
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
# Collapses empty path segments and returns the path with a single leading
# and a single trailing slash.
#
# @param path [#to_s] raw path of the API root URL
# @return [String] "/" when the path has no segments, otherwise "/seg/.../"
def normalized_api_path(path)
  segments = path.to_s.split('/').reject(&:empty?)
  return '/' if segments.empty?

  "/#{segments.join('/')}/"
end
|
|
112
|
+
|
|
113
|
+
# Normalizes a rest_route value: an empty route becomes "/", a leading
# slash is added when missing, and the result is passed through
# trim_trailing_slashes.
#
# @param route [#to_s] raw rest_route query value
# @return [String] normalized route
def normalized_rest_route(route)
  text = route.to_s
  rooted =
    if text.empty?
      '/'
    elsif text.start_with?('/')
      text
    else
      "/#{text}"
    end

  trim_trailing_slashes(rooted)
end
|
|
119
|
+
|
|
120
|
+
# Removes trailing "/" characters from +value+ without ever trimming the
# string down to nothing, so a lone "/" stays "/".
#
# Length, lookup and slicing all work on bytes. Mixing character length with
# byte lookups picks the wrong positions as soon as the string contains
# multibyte characters. In UTF-8 the byte 47 only ever encodes "/", so
# byte-wise trimming cannot split a character.
#
# @param value [String] route to trim
# @return [String] value without trailing slashes
def trim_trailing_slashes(value)
  end_index = value.bytesize
  end_index -= 1 while end_index > 1 && value.getbyte(end_index - 1) == 47 # 47 == "/".ord
  value.byteslice(0, end_index)
end
|
|
125
|
+
|
|
126
|
+
# Appends the posts collection route to a normalized rest_route value.
#
# @param route [String] normalized REST route ("/" for the API root)
# @return [String] route pointing at the posts collection
def append_posts_route(route)
  route == '/' ? '/wp/v2/posts' : "#{route}/wp/v2/posts"
end
|
|
131
|
+
end
|
|
132
|
+
end
|
|
133
|
+
end
|
|
134
|
+
end
|
|
135
|
+
end
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'date'
|
|
4
|
+
require 'nokogiri'
|
|
5
|
+
|
|
6
|
+
module Html2rss
|
|
7
|
+
class AutoSource
|
|
8
|
+
module Scraper
|
|
9
|
+
# Scrapes WordPress sites through their REST API instead of parsing article HTML.
|
|
10
|
+
class WordpressApi # rubocop:disable Metrics/ClassLength
|
|
11
|
+
include Enumerable
|
|
12
|
+
|
|
13
|
+
# Selector for WordPress API discovery link tags.
|
|
14
|
+
API_LINK_SELECTOR = 'link[rel="https://api.w.org/"][href]'
|
|
15
|
+
# Selector for canonical link tags used for scope normalization.
|
|
16
|
+
CANONICAL_LINK_SELECTOR = 'link[rel="canonical"][href]'
|
|
17
|
+
# Fields requested from the WordPress posts endpoint.
|
|
18
|
+
POSTS_FIELDS = %w[id title excerpt content link date categories].freeze
|
|
19
|
+
# Baseline query sent to WordPress posts API follow-ups.
|
|
20
|
+
POSTS_QUERY_DEFAULTS = {
|
|
21
|
+
'_fields' => POSTS_FIELDS.join(','),
|
|
22
|
+
'per_page' => '100'
|
|
23
|
+
}.freeze
|
|
24
|
+
# @return [Symbol] scraper config key
|
|
25
|
+
def self.options_key = :wordpress_api
|
|
26
|
+
|
|
27
|
+
##
|
|
28
|
+
# @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
|
|
29
|
+
# @return [Boolean] whether the page advertises a WordPress REST API endpoint
|
|
30
|
+
def self.articles?(parsed_body)
|
|
31
|
+
return false unless parsed_body
|
|
32
|
+
|
|
33
|
+
!parsed_body.at_css(API_LINK_SELECTOR).nil?
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
##
|
|
37
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
38
|
+
# @param url [String, Html2rss::Url] canonical page URL
|
|
39
|
+
# @param request_session [Html2rss::RequestSession, nil] shared request session for follow-up fetches
|
|
40
|
+
# @param _opts [Hash] unused scraper-specific options
|
|
41
|
+
# @option _opts [Object] :_reserved reserved for future scraper-specific options
|
|
42
|
+
# @return [void]
|
|
43
|
+
def initialize(parsed_body, url:, request_session: nil, **_opts)
|
|
44
|
+
@parsed_body = parsed_body
|
|
45
|
+
@url = Html2rss::Url.from_absolute(url)
|
|
46
|
+
@request_session = request_session
|
|
47
|
+
@page_scope = PageScope.from(parsed_body:, url: @url)
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
##
|
|
51
|
+
# Yields article hashes from the WordPress posts API.
|
|
52
|
+
#
|
|
53
|
+
# @yieldparam article [Hash{Symbol => Object}] normalized article hash
|
|
54
|
+
# @return [Enumerator, void] enumerator when no block is given
|
|
55
|
+
def each
|
|
56
|
+
return enum_for(:each) unless block_given?
|
|
57
|
+
return unless (posts = fetch_posts)
|
|
58
|
+
|
|
59
|
+
posts.filter_map { article_from(_1) }.each { yield(_1) }
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
private
|
|
63
|
+
|
|
64
|
+
attr_reader :parsed_body, :url, :request_session, :page_scope
|
|
65
|
+
|
|
66
|
+
def fetch_posts
|
|
67
|
+
response = posts_response
|
|
68
|
+
return unless response
|
|
69
|
+
|
|
70
|
+
Array(response.parsed_body)
|
|
71
|
+
rescue RequestService::UnsupportedResponseContentType => error
|
|
72
|
+
Log.warn("#{self.class}: unsupported WordPress API posts content type (#{error.message})")
|
|
73
|
+
nil
|
|
74
|
+
rescue JSON::ParserError => error
|
|
75
|
+
Log.warn("#{self.class}: failed to parse WordPress API posts JSON (#{error.message})")
|
|
76
|
+
nil
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
def posts_response
|
|
80
|
+
return unless request_session
|
|
81
|
+
return unless (resolved_posts_url = posts_endpoint_url)
|
|
82
|
+
|
|
83
|
+
request_session.follow_up(
|
|
84
|
+
url: resolved_posts_url,
|
|
85
|
+
relation: :auto_source,
|
|
86
|
+
origin_url: url
|
|
87
|
+
)
|
|
88
|
+
rescue Html2rss::Error => error
|
|
89
|
+
Log.warn("#{self.class}: failed to fetch WordPress API posts (#{error.class}: #{error.message})")
|
|
90
|
+
nil
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
# Builds an article hash from one posts-API entry.
#
# @param post [Object] decoded entry from the posts response
# @return [Hash{Symbol => Object}, nil] article attributes with nil values
#   removed; nil when the entry is not a Hash or has no resolvable URL
def article_from(post)
  return nil unless post.is_a?(Hash)

  resolved_url = article_url(post)
  resolved_url ? article_attributes(post, resolved_url).compact : nil
end
|
|
101
|
+
|
|
102
|
+
def article_url(post)
|
|
103
|
+
absolute_link(post[:link])
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
def article_id(_post, article_url)
|
|
107
|
+
root_path_query_id(article_url) || present_string(article_url.path) || article_url.to_s
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
def article_title(post)
|
|
111
|
+
rendered_text(post.dig(:title, :rendered))
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
def article_description(post)
|
|
115
|
+
rendered_html(post.dig(:content, :rendered)) || rendered_html(post.dig(:excerpt, :rendered))
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
def article_published_at(post)
|
|
119
|
+
present_string(post[:date])
|
|
120
|
+
end
|
|
121
|
+
|
|
122
|
+
# Collects the post's category values as non-blank strings.
#
# @param post [Hash{Symbol => Object}] decoded posts-API entry
# @return [Array<String>] stripped string form of each non-blank category value
def article_categories(post)
  Array(post[:categories]).map { present_string(_1) }.compact
end
|
|
125
|
+
|
|
126
|
+
def article_attributes(post, article_url)
|
|
127
|
+
{
|
|
128
|
+
id: article_id(post, article_url),
|
|
129
|
+
title: article_title(post),
|
|
130
|
+
description: article_description(post),
|
|
131
|
+
url: article_url,
|
|
132
|
+
published_at: article_published_at(post),
|
|
133
|
+
categories: article_categories(post)
|
|
134
|
+
}
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
def absolute_link(link)
|
|
138
|
+
value = present_string(link)
|
|
139
|
+
return unless value
|
|
140
|
+
|
|
141
|
+
Html2rss::Url.from_relative(value, url)
|
|
142
|
+
rescue ArgumentError
|
|
143
|
+
nil
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
def rendered_text(value)
|
|
147
|
+
rendered_html(value)&.then { Nokogiri::HTML.fragment(_1).text.strip }
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
# Returns the rendered markup as a stripped string, or nil when it is blank.
#
# @param value [Object] rendered field taken from a posts-API entry
# @return [String, nil] stripped string, nil when empty after stripping
def rendered_html(value)
  # present_string already yields nil for blank input; no extra nil guard needed.
  present_string(value)
end
|
|
154
|
+
|
|
155
|
+
# Converts a value to a stripped string and treats blank results as absent.
#
# @param value [#to_s, nil] arbitrary value
# @return [String, nil] stripped string, or nil when nothing remains after stripping
def present_string(value)
  stripped = value.to_s.strip
  stripped.empty? ? nil : stripped
end
|
|
159
|
+
|
|
160
|
+
# Builds an id of the form "/?<query>" for URLs that have a query string
# and an empty or root ("/") path.
#
# @param article_url [#query, #path] resolved article URL
# @return [String, nil] "/?<query>", or nil when the query is blank or the
#   path is anything other than "" or "/"
def root_path_query_id(article_url)
  query_string = present_string(article_url.query)
  return if query_string.nil?
  return unless ['', '/'].include?(article_url.path.to_s)

  "/?#{query_string}"
end
|
|
169
|
+
|
|
170
|
+
def posts_query
|
|
171
|
+
POSTS_QUERY_DEFAULTS.merge(page_scope.query).transform_values(&:to_s)
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
def posts_endpoint_url
|
|
175
|
+
PostsEndpoint.resolve(
|
|
176
|
+
parsed_body:,
|
|
177
|
+
page_url: url,
|
|
178
|
+
page_scope:,
|
|
179
|
+
posts_query:,
|
|
180
|
+
logger: Log
|
|
181
|
+
)
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
end
|
|
185
|
+
end
|
|
186
|
+
end
|