html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,311 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class AutoSource
    module Scraper
      class WordpressApi
        ##
        # Determines whether a WordPress page can safely be mapped to a posts query.
        #
        # A scope bundles the query params for the posts endpoint, a flag saying
        # whether a posts follow-up is safe at all, and a reason symbol. The
        # resolver in this file produces the reasons +:archive+, +:unscoped+,
        # +:unsupported_archive+ and +:non_archive+. Instances freeze themselves
        # on construction.
        class PageScope
          # Canonical path segment for category archives.
          CATEGORY_SEGMENT = 'category'
          # Canonical path segment for tag archives.
          TAG_SEGMENT = 'tag'
          # Canonical path segment for author archives.
          AUTHOR_SEGMENT = 'author'
          # Canonical path segment for paginated archives.
          PAGE_SEGMENT = 'page'
          # Canonical query key used for paginated archives.
          PAGED_QUERY_KEY = 'paged'

          ##
          # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
          # @param url [Html2rss::Url] canonical page URL
          # @return [PageScope] derived page scope
          def self.from(parsed_body:, url:)
            Resolver.new(parsed_body:, url:).call
          end

          ##
          # @param query [Hash{String => String}] scoped query params for the posts endpoint
          # @param fetchable [Boolean] whether a posts follow-up is safe for this page
          # @param reason [Symbol] classification of the resolved page scope
          def initialize(query:, fetchable:, reason:)
            # Freeze a private copy: freezing the argument itself would make the
            # caller's hash immutable as a side effect of building a scope.
            @query = query.dup.freeze
            @fetchable = fetchable
            @reason = reason
            freeze
          end

          ##
          # @return [Hash{String => String}] query params to apply to the posts request
          attr_reader :query

          ##
          # @return [Boolean] whether the page may safely use the posts API follow-up
          def fetchable?
            @fetchable
          end

          ##
          # @return [Symbol] classification of the resolved page scope
          attr_reader :reason

          ##
          # Resolves the page scope from page markup and canonical URL signals.
          #
          # Signals used: the class tokens on the +<body>+ element and the path
          # segments / +paged+ query value of the canonical (or current) URL.
          class Resolver # rubocop:disable Metrics/ClassLength
            ##
            # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
            # @param url [Html2rss::Url] canonical page URL
            def initialize(parsed_body:, url:)
              @parsed_body = parsed_body
              @url = Html2rss::Url.from_absolute(url)
            end

            ##
            # Tries the archive kinds in order (category, tag, author, date),
            # falls back to a generic classification, then adds pagination.
            #
            # @return [PageScope] derived page scope
            def call
              scope = category_scope ||
                      tag_scope ||
                      author_scope ||
                      date_scope ||
                      fallback_scope

              apply_pagination(scope)
            end

            private

            attr_reader :parsed_body, :url

            # @return [PageScope, nil] category scope, or nil when the page is not a category archive
            def category_scope
              return unless category_archive?

              scoped_scope('categories' => archive_id('category'))
            end

            # @return [PageScope, nil] tag scope, or nil when the page is not a tag archive
            def tag_scope
              return unless tag_archive?

              scoped_scope('tags' => archive_id('tag'))
            end

            # @return [PageScope, nil] author scope, or nil when the page is not an author archive
            def author_scope
              return unless author_archive?

              scoped_scope('author' => archive_id('author'))
            end

            # @return [PageScope, nil] date-range scope, an unsupported scope when the
            #   range cannot be derived, or nil when the page is not a date archive
            def date_scope
              return unless date_archive?

              range = date_archive_range
              return unknown_archive_scope unless range

              PageScope.new(query: range, fetchable: true, reason: :archive)
            end

            # @return [PageScope] classification used when no specific archive kind matched
            def fallback_scope
              return unknown_archive_scope if archive_like?
              return non_archive_scope if singular_like?

              PageScope.new(query: {}, fetchable: true, reason: :unscoped)
            end

            # Adds the +page+ param to fetchable scopes when a page number is known.
            #
            # @param scope [PageScope]
            # @return [PageScope]
            def apply_pagination(scope)
              page = archive_page_number
              return scope unless scope.fetchable? && page

              PageScope.new(
                query: scope.query.merge('page' => page.to_s),
                fetchable: scope.fetchable?,
                reason: scope.reason
              )
            end

            # Builds an archive scope, refusing it when any param value is missing
            # (i.e. the archive id could not be read from the body classes).
            #
            # @param query [Hash{String => String, nil}]
            # @return [PageScope]
            def scoped_scope(query)
              return unknown_archive_scope if query.values.any?(&:nil?)

              PageScope.new(query:, fetchable: true, reason: :archive)
            end

            def unknown_archive_scope
              PageScope.new(query: {}, fetchable: false, reason: :unsupported_archive)
            end

            def non_archive_scope
              PageScope.new(query: {}, fetchable: false, reason: :non_archive)
            end

            def category_archive?
              body_classes.include?('category') || leading_path_segment == CATEGORY_SEGMENT
            end

            def tag_archive?
              body_classes.include?('tag') || leading_path_segment == TAG_SEGMENT
            end

            def author_archive?
              body_classes.include?('author') || leading_path_segment == AUTHOR_SEGMENT
            end

            def date_archive?
              body_classes.include?('date') || date_archive_path?
            end

            def archive_like?
              category_archive? || tag_archive? || author_archive? || date_archive? || body_classes.include?('archive')
            end

            def singular_like?
              body_classes.intersect?(%w[page single singular attachment]) ||
                body_classes.any? { _1.match?(/\A(?:page-id|postid)-\d+\z/) }
            end

            # @return [Array<String>] whitespace-separated class tokens of the body element
            def body_classes
              @body_classes ||= parsed_body.at_css('body')&.[]('class').to_s.split
            end

            # Reads the numeric id from the first body class shaped "<prefix>-<digits>".
            #
            # @param prefix [String] body class prefix, e.g. "category"
            # @return [String, nil] digits of the first matching class, or nil
            def archive_id(prefix)
              # Compile once instead of once per class token; \A/\z anchor the whole token.
              pattern = /\A#{Regexp.escape(prefix)}-(\d+)\z/
              body_classes.filter_map { |klass| klass[pattern, 1] }.first
            end

            # Memoized because both path-based and query-based checks consult it.
            #
            # @return [Html2rss::Url] same-origin canonical URL, otherwise the current URL
            def canonical_or_current_url
              @canonical_or_current_url ||= resolve_canonical_url
            end

            # Prefers the canonical link when it is present and same-origin;
            # falls back to the current URL otherwise or on ArgumentError.
            def resolve_canonical_url
              href = parsed_body.at_css(WordpressApi::CANONICAL_LINK_SELECTOR)&.[]('href').to_s.strip
              return url if href.empty?

              canonical_url = Html2rss::Url.from_relative(href, url)
              same_origin_url?(canonical_url, url) ? canonical_url : url
            rescue ArgumentError
              url
            end

            def path_segments
              @path_segments ||= canonical_or_current_url.path_segments
            end

            # Path segments without a trailing "page/<n>" pair.
            def scoped_path_segments
              @scoped_path_segments ||= paginated_path? ? path_segments[0...-2] : path_segments
            end

            def leading_path_segment
              scoped_path_segments.first
            end

            def date_archive_path?
              !date_archive_segments.nil?
            end

            # @return [Hash{String => String}, nil] "after"/"before" bounds, or nil
            #   when the path does not describe a valid calendar date
            def date_archive_range
              components = date_archive_components
              return unless components

              start_date = Date.new(*components.fetch(:start_date_parts))
              {
                'after' => iso8601_start(start_date),
                'before' => iso8601_start(next_archive_boundary(start_date, components.fetch(:precision)))
              }
            rescue Date::Error
              nil
            end

            # @return [Hash{Symbol => Object}, nil] start date parts and precision, or nil
            #   when there is no date path or a month/day segment is out of range
            def date_archive_components
              segments = date_archive_segments
              return unless segments

              year = segments.fetch(0).to_i
              month = parse_archive_segment(segments[1], 1, 12)
              day = parse_archive_segment(segments[2], 1, 31)
              # A segment that is present but out of range must not be treated as
              # absent: that would silently widen or shift the range
              # (e.g. "/2024/13/05" would resolve to 5 January 2024).
              return if (segments[1] && month.nil?) || (segments[2] && day.nil?)

              {
                start_date_parts: [year, month || 1, day || 1],
                precision: archive_precision(month:, day:)
              }
            end

            # Takes everything from the first four-digit segment onwards and accepts
            # it only when it is 1-3 all-numeric segments (year[/month[/day]]).
            #
            # @return [Array<String>, nil]
            def date_archive_segments
              year_index = scoped_path_segments.find_index { _1.match?(/\A\d{4}\z/) }
              return unless year_index

              segments = scoped_path_segments.drop(year_index)
              return unless segments.length.between?(1, 3)
              return unless archive_segment_shape?(segments)

              segments
            end

            # True when the path ends in "page/<positive integer>".
            def paginated_path?
              return false if path_segments.length < 2
              return false unless path_segments[-2] == PAGE_SEGMENT

              !parse_positive_integer(path_segments[-1]).nil?
            end

            # The "paged" query value wins over a "page/<n>" path suffix.
            #
            # @return [Integer, nil]
            def archive_page_number
              parse_positive_integer(canonical_or_current_url.query_values[PAGED_QUERY_KEY]) || path_page_number
            end

            def path_page_number
              return unless paginated_path?

              parse_positive_integer(path_segments[-1])
            end

            def archive_segment_shape?(segments)
              month = segments[1]
              day = segments[2]
              return false if day && month.nil?
              return false unless month.nil? || month.match?(/\A\d+\z/)
              return false unless day.nil? || day.match?(/\A\d+\z/)

              true
            end

            def same_origin_url?(left, right)
              [left.scheme, left.host, left.port] == [right.scheme, right.host, right.port]
            end

            def archive_precision(month:, day:)
              return :day if day
              return :month if month

              :year
            end

            # @return [Date] first day after the archive period starting at start_date
            def next_archive_boundary(start_date, precision)
              {
                year: start_date.next_year,
                month: start_date.next_month,
                day: start_date.next_day
              }.fetch(precision)
            end

            def iso8601_start(date)
              date.strftime('%Y-%m-%dT00:00:00Z')
            end

            # @return [Integer, nil] the number when value is all digits within
            #   minimum..maximum, otherwise nil
            def parse_archive_segment(value, minimum, maximum)
              return nil unless value&.match?(/\A\d+\z/)

              number = value.to_i
              return nil if number < minimum || number > maximum

              number
            end

            # @return [Integer, nil] the number when value is all digits and >= 1
            def parse_positive_integer(value)
              return nil unless value.to_s.match?(/\A\d+\z/)

              number = value.to_i
              return nil if number < 1

              number
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,135 @@
1
+ # frozen_string_literal: true
2
+
3
module Html2rss
  class AutoSource
    module Scraper
      class WordpressApi
        ##
        # Resolves the WordPress posts endpoint for a given page and scope.
        #
        # The API root is read from the page's API discovery link. Two root
        # styles are handled: roots that carry a +rest_route+ query param and
        # path-style roots, under which +wp/v2/posts+ is resolved.
        class PostsEndpoint
          # REST API collection path for posts resources.
          POSTS_PATH = 'wp/v2/posts'

          ##
          # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
          # @param page_url [Html2rss::Url] canonical page URL
          # @param page_scope [Html2rss::AutoSource::Scraper::WordpressApi::PageScope] derived page scope
          # @param posts_query [Hash{String => String}] query params for the posts request
          # @param logger [Logger] logger used for operational warnings
          # @return [Html2rss::Url, nil] resolved posts endpoint or nil when unavailable
          def self.resolve(parsed_body:, page_url:, page_scope:, posts_query:, logger: Html2rss::Log)
            new(parsed_body:, page_url:, page_scope:, posts_query:, logger:).call
          end

          ##
          # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
          # @param page_url [Html2rss::Url] canonical page URL
          # @param page_scope [Html2rss::AutoSource::Scraper::WordpressApi::PageScope] derived page scope
          # @param posts_query [Hash{String => String}] query params for the posts request
          # @param logger [Logger] logger used for operational warnings
          def initialize(parsed_body:, page_url:, page_scope:, posts_query:, logger:)
            @parsed_body = parsed_body
            @page_url = Html2rss::Url.from_absolute(page_url)
            @page_scope = page_scope
            @posts_query = posts_query
            @logger = logger
          end

          ##
          # The API root is resolved (and its absence logged) before the page
          # scope is checked.
          #
          # @return [Html2rss::Url, nil] resolved posts endpoint or nil when unavailable
          def call
            api_root = api_root_url
            return unless api_root
            return unless fetchable_page_scope?

            query_style_api_root?(api_root) ? query_root_posts_url(api_root) : posts_collection_url(api_root)
          end

          private

          attr_reader :parsed_body, :page_url, :page_scope, :posts_query, :logger

          # @return [Html2rss::Url, nil] absolute API root, or nil (with a log entry)
          #   when the link is missing, blank or not a valid URL
          def api_root_url
            href = parsed_body.at_css(WordpressApi::API_LINK_SELECTOR)&.[]('href').to_s.strip
            return log_missing_api_root if href.empty?

            Html2rss::Url.from_relative(href, page_url)
          rescue Addressable::URI::InvalidURIError, ArgumentError => error
            logger.warn("#{WordpressApi}: invalid WordPress API endpoint #{href.inspect} (#{error.message})")
            nil
          end

          # Non-archive pages are logged at debug level; every other unfetchable
          # scope is logged as a warning.
          #
          # @return [Boolean] whether the page scope allows a posts follow-up
          def fetchable_page_scope?
            return true if page_scope.fetchable?

            if page_scope.reason == :non_archive
              logger.debug(
                "#{WordpressApi}: page advertised WordPress API support " \
                'without a safe WordPress archive scope'
              )
              return false
            end

            logger.warn("#{WordpressApi}: unable to derive safe WordPress archive scope for #{page_url}")
            false
          end

          # @return [nil] always nil, so it can be used as the early-return value
          def log_missing_api_root
            logger.debug("#{WordpressApi}: page advertised WordPress API support without a usable API root")
            nil
          end

          def query_style_api_root?(api_root)
            api_root.query_values.key?('rest_route')
          end

          # Builds the posts URL for a root that routes through the +rest_route+
          # query param: the route gets the posts path appended and the posts
          # query is merged into the existing query values.
          def query_root_posts_url(api_root)
            query = api_root.query_values
            route = normalized_rest_route(query.fetch('rest_route', '/'))
            api_root.with_query_values(
              query.merge(
                'rest_route' => append_posts_route(route),
                **posts_query
              )
            )
          end

          # Builds the posts URL for a path-style root by resolving POSTS_PATH
          # against the root (normalized to end in a slash).
          def posts_collection_url(api_root)
            Html2rss::Url.from_relative(POSTS_PATH, normalized_api_root(api_root))
                         .with_query_values(api_root.query_values.merge(posts_query))
          end

          def normalized_api_root(api_root)
            api_root.with_path(normalized_api_path(api_root.path))
          end

          # Collapses empty path segments and guarantees a leading and a trailing slash.
          #
          # @param path [String, nil]
          # @return [String] e.g. "/wp-json/", or "/" when the path has no segments
          def normalized_api_path(path)
            segments = path.to_s.split('/').reject(&:empty?)
            return '/' if segments.empty?

            "/#{segments.join('/')}/"
          end

          # @param route [String, nil]
          # @return [String] route with a leading slash and without trailing slashes;
          #   "/" for blank input
          def normalized_rest_route(route)
            value = route.to_s
            value = '/' if value.empty?
            value = "/#{value}" unless value.start_with?('/')
            trim_trailing_slashes(value)
          end

          # Strips trailing slashes with a plain loop (no regex), always keeping
          # at least the first character so "/" stays "/".
          #
          # @param value [String]
          # @return [String]
          def trim_trailing_slashes(value)
            end_index = value.length
            # Index by character, consistently with String#length and the final
            # slice; byte indexing would look at the wrong position as soon as
            # the value contains a multibyte character.
            end_index -= 1 while end_index > 1 && value[end_index - 1] == '/'
            value[0, end_index]
          end

          def append_posts_route(route)
            return '/wp/v2/posts' if route == '/'

            "#{route}/wp/v2/posts"
          end
        end
      end
    end
  end
end
@@ -0,0 +1,186 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'date'
4
+ require 'nokogiri'
5
+
6
module Html2rss
  class AutoSource
    module Scraper
      # Scrapes WordPress sites through their REST API instead of parsing article HTML.
      #
      # The page only has to advertise the API through its discovery link; the
      # posts themselves come from a follow-up request made through the shared
      # request session. Without a request session no articles are yielded.
      class WordpressApi # rubocop:disable Metrics/ClassLength
        include Enumerable

        # Selector for WordPress API discovery link tags.
        API_LINK_SELECTOR = 'link[rel="https://api.w.org/"][href]'
        # Selector for canonical link tags used for scope normalization.
        CANONICAL_LINK_SELECTOR = 'link[rel="canonical"][href]'
        # Fields requested from the WordPress posts endpoint.
        POSTS_FIELDS = %w[id title excerpt content link date categories].freeze
        # Baseline query sent to WordPress posts API follow-ups.
        POSTS_QUERY_DEFAULTS = {
          '_fields' => POSTS_FIELDS.join(','),
          'per_page' => '100'
        }.freeze
        # @return [Symbol] scraper config key
        def self.options_key = :wordpress_api

        ##
        # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
        # @return [Boolean] whether the page advertises a WordPress REST API endpoint
        def self.articles?(parsed_body)
          return false unless parsed_body

          !parsed_body.at_css(API_LINK_SELECTOR).nil?
        end

        ##
        # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
        # @param url [String, Html2rss::Url] canonical page URL
        # @param request_session [Html2rss::RequestSession, nil] shared request session for follow-up fetches
        # @param _opts [Hash] unused scraper-specific options
        # @option _opts [Object] :_reserved reserved for future scraper-specific options
        # @return [void]
        def initialize(parsed_body, url:, request_session: nil, **_opts)
          @parsed_body = parsed_body
          @url = Html2rss::Url.from_absolute(url)
          @request_session = request_session
          @page_scope = PageScope.from(parsed_body:, url: @url)
        end

        ##
        # Yields article hashes from the WordPress posts API.
        #
        # Posts that are not hashes or have no usable link are skipped.
        #
        # @yieldparam article [Hash{Symbol => Object}] normalized article hash
        # @return [Enumerator, void] enumerator when no block is given
        def each
          return enum_for(:each) unless block_given?
          return unless (posts = fetch_posts)

          posts.filter_map { article_from(_1) }.each { yield(_1) }
        end

        private

        attr_reader :parsed_body, :url, :request_session, :page_scope

        # @return [Array, nil] parsed posts payload coerced to an array, or nil when
        #   there is no response or the body has an unsupported type / invalid JSON
        def fetch_posts
          response = posts_response
          return unless response

          Array(response.parsed_body)
        rescue RequestService::UnsupportedResponseContentType => error
          Log.warn("#{self.class}: unsupported WordPress API posts content type (#{error.message})")
          nil
        rescue JSON::ParserError => error
          Log.warn("#{self.class}: failed to parse WordPress API posts JSON (#{error.message})")
          nil
        end

        # Performs the follow-up request; Html2rss errors are logged and swallowed
        # so the scraper simply yields nothing.
        #
        # @return [Object, nil] follow-up response, or nil when no request was made or it failed
        def posts_response
          return unless request_session
          return unless (resolved_posts_url = posts_endpoint_url)

          request_session.follow_up(
            url: resolved_posts_url,
            relation: :auto_source,
            origin_url: url
          )
        rescue Html2rss::Error => error
          Log.warn("#{self.class}: failed to fetch WordPress API posts (#{error.class}: #{error.message})")
          nil
        end

        # @param post [Object] one element of the posts payload
        # @return [Hash{Symbol => Object}, nil] article hash without nil values, or nil
        #   when the post is not a hash or has no usable link
        def article_from(post)
          return unless post.is_a?(Hash)

          article_url = article_url(post)
          return unless article_url

          article_attributes(post, article_url).compact
        end

        def article_url(post)
          absolute_link(post[:link])
        end

        # Prefers "/?<query>" for root-path links with a query, then the path,
        # then the full URL string.
        def article_id(_post, article_url)
          root_path_query_id(article_url) || present_string(article_url.path) || article_url.to_s
        end

        def article_title(post)
          rendered_text(post.dig(:title, :rendered))
        end

        # Full content wins over the excerpt.
        def article_description(post)
          rendered_html(post.dig(:content, :rendered)) || rendered_html(post.dig(:excerpt, :rendered))
        end

        def article_published_at(post)
          present_string(post[:date])
        end

        # @return [Array<String>] non-blank category values, stringified
        def article_categories(post)
          Array(post[:categories]).filter_map { |value| present_string(value) }
        end

        def article_attributes(post, article_url)
          {
            id: article_id(post, article_url),
            title: article_title(post),
            description: article_description(post),
            url: article_url,
            published_at: article_published_at(post),
            categories: article_categories(post)
          }
        end

        # @return [Html2rss::Url, nil] link resolved against the page URL, or nil
        #   when the link is blank or resolving it raises ArgumentError
        def absolute_link(link)
          value = present_string(link)
          return unless value

          Html2rss::Url.from_relative(value, url)
        rescue ArgumentError
          nil
        end

        # Reduces rendered markup to its stripped text content.
        #
        # @return [String, nil] text, or nil when the input is blank or the markup
        #   contains no text (so the attribute is dropped by +compact+ in
        #   +article_from+ instead of surviving as an empty string)
        def rendered_text(value)
          html = rendered_html(value)
          return unless html

          present_string(Nokogiri::HTML.fragment(html).text)
        end

        # @return [String, nil] stripped markup, or nil when blank
        def rendered_html(value)
          present_string(value)
        end

        # @return [String, nil] stripped string form of value, or nil when empty
        def present_string(value)
          text = value.to_s.strip
          text unless text.empty?
        end

        # @return [String, nil] "/?<query>" when the URL has a query and an empty
        #   or root path, otherwise nil
        def root_path_query_id(article_url)
          query = present_string(article_url.query)
          return unless query

          path = article_url.path.to_s
          return unless path.empty? || path == '/'

          "/?#{query}"
        end

        # Scope params override the defaults; all values are stringified.
        def posts_query
          POSTS_QUERY_DEFAULTS.merge(page_scope.query).transform_values(&:to_s)
        end

        def posts_endpoint_url
          PostsEndpoint.resolve(
            parsed_body:,
            page_url: url,
            page_scope:,
            posts_query:,
            logger: Log
          )
        end
      end
    end
  end
end