html2rss 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +48 -656
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +115 -38
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,377 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module Html2rss
6
+ class AutoSource
7
+ module Scraper
8
+ #
9
+ # Scrapes JSON state blobs embedded in script tags such as Next.js, Nuxt,
10
+ # or custom window globals. The scraper searches `<script type="application/json">`
11
+ # tags and well-known JavaScript globals for arrays of article-like hashes
12
+ # and normalises them to a structure compatible with HtmlExtractor.
13
+ class JsonState
14
+ include Enumerable
15
+
16
# CSS selector for script tags that carry a raw JSON document.
JSON_SCRIPT_SELECTOR = 'script[type="application/json"]'

# Patterns matching `<global object>.<state name> = ` inside inline scripts.
GLOBAL_ASSIGNMENT_PATTERNS = [
  /(?:window|self|globalThis)\.__NEXT_DATA__\s*=\s*/m,
  /(?:window|self|globalThis)\.__NUXT__\s*=\s*/m,
  /(?:window|self|globalThis)\.STATE\s*=\s*/m,
  /(?:window|self|globalThis)\.__REDUX_STATE__\s*=\s*/m,
  /(?:window|self|globalThis)\.__PRELOADED_STATE__\s*=\s*/m,
  /(?:window|self|globalThis)\.__APOLLO_STATE__\s*=\s*/m,
  /(?:window|self|globalThis)\.__remixContext\s*=\s*/m,
  /(?:window|self|globalThis)\.__sveltekit_data\s*=\s*/m,
  /(?:window|self|globalThis)\.GATSBY_STATE\s*=\s*/m,
  /(?:window|self|globalThis)\.__ember_meta\s*=\s*/m,
  /(?:window|self|globalThis)\.angular\s*=\s*/m
].freeze

# Keys probed, in order, for each article attribute.
TITLE_KEYS = %w[title headline name text].freeze
URL_KEYS = %w[url link href permalink slug path canonicalUrl shortUrl].freeze
DESCRIPTION_KEYS = %w[description summary excerpt dek subheading].freeze
IMAGE_KEYS = %w[image imageUrl thumbnailUrl thumbnail src featuredImage coverImage heroImage].freeze
PUBLISHED_AT_KEYS = %w[
  published_at publishedAt datePublished date publicationDate pubDate updatedAt updated_at createdAt created_at
].freeze
CATEGORY_KEYS = %w[categories tags section sections topic topics channel].freeze
ID_KEYS = %w[id guid uuid slug key].freeze
39
+
40
# Scans DOM nodes for JSON payloads containing article data.
#
# Payloads are read from `<script type="application/json">` tags and from
# scripts assigning to a global matched by GLOBAL_ASSIGNMENT_PATTERNS.
module DocumentScanner
  module_function

  # Collects every parseable payload found in the document.
  #
  # @param parsed_body [#css] document whose `css(selector)` yields nodes responding to `text`
  # @return [Array<Object>] JSON-script payloads followed by global-assignment payloads
  def json_documents(parsed_body)
    script_documents(parsed_body) + assignment_documents(parsed_body)
  end

  # @param parsed_body [#css]
  # @return [Array<Object>] parsed contents of the JSON script tags; unparseable tags are dropped
  def script_documents(parsed_body)
    parsed_body.css(JSON_SCRIPT_SELECTOR).filter_map { |node| parse_json(node.text) }
  end

  # @param parsed_body [#css]
  # @return [Array<Object>] parsed payloads of scripts assigning to a known global
  def assignment_documents(parsed_body)
    parsed_body.css('script').filter_map { |node| parse_assignment(node.text) }
  end

  # @param text [String, nil] script source
  # @return [Object, nil] parsed payload, or nil when no known global is assigned
  def parse_assignment(text)
    payload = assignment_payload(text)
    parse_json(payload) if payload
  end

  # Returns the source text of the value assigned to the first matching global.
  #
  # @param text [#to_s] script source
  # @return [String, nil] payload text, or nil when the script is blank or no pattern matches
  def assignment_payload(text)
    trimmed = text.to_s.strip
    return if trimmed.empty?

    GLOBAL_ASSIGNMENT_PATTERNS.each do |pattern|
      match = pattern.match(trimmed)
      next unless match

      # Only the text after `<global> =` can hold the payload. Anything before
      # it (earlier statements, a wrapping function) may contain brackets that
      # would otherwise be mistaken for the start of the payload.
      return extract_assignment_payload(match.post_match)
    end

    nil
  end

  # @param text [String] source following the assignment operator
  # @return [String] the leading balanced JSON block, or the text itself when none is found
  def extract_assignment_payload(text)
    extract_json_block(text) || text
  end

  # Cuts the first balanced `{...}` / `[...]` block out of the text.
  #
  # @param text [String]
  # @return [String, nil] the block, or nil when there is no opener or it never closes
  def extract_json_block(text)
    start_index = text.index(/[\[{]/)
    return unless start_index

    stop_index = scan_for_json_end(text, start_index)
    text[start_index..stop_index] if stop_index
  end

  # Finds the index of the bracket closing the structure opened at start_index.
  # Brackets inside double-quoted strings are ignored, honouring backslash escapes.
  #
  # @param text [String]
  # @param start_index [Integer] index of the opening bracket
  # @return [Integer, nil] index of the matching closer, or nil when unbalanced
  # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
  def scan_for_json_end(text, start_index)
    stack = []
    in_string = false
    escape = false

    text.each_char.with_index do |char, index|
      next if index < start_index

      if in_string
        if escape
          # The escaped character is consumed verbatim.
          escape = false
        elsif char == '\\'
          escape = true
        elsif char == '"'
          in_string = false
        end
        next
      end

      case char
      when '"'
        in_string = true
      when '{'
        stack << '}'
      when '['
        stack << ']'
      when '}', ']'
        expected = stack.pop
        return index if expected == char && stack.empty?
      end
    end

    nil
  end
  # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity

  # Parses strict JSON first and falls back to the JavaScript-literal coercion.
  #
  # @param payload [String, nil]
  # @return [Object, nil] parsed value with symbolized keys, or nil
  def parse_json(payload)
    return unless payload

    JSON.parse(payload, symbolize_names: true)
  rescue JSON::ParserError => error
    parse_js_object(payload, error)
  end

  # @param payload [String] text that failed strict JSON parsing
  # @param _original_error [JSON::ParserError] unused; kept for the caller's signature
  # @return [Object, nil] parsed value, or nil when the coerced text is still invalid
  def parse_js_object(payload, _original_error)
    coerced = coerce_javascript_object(payload)
    return unless coerced

    # Some sites emit JavaScript object literals (unquoted keys, trailing commas).
    # Coerce those payloads into valid JSON so we keep the same parsing pipeline.
    JSON.parse(coerced, symbolize_names: true)
  rescue JSON::ParserError => error
    Html2rss::Log.debug("#{name}: failed to parse coerced JavaScript object (#{error.message})")
    nil
  end

  # @param payload [String]
  # @return [String] copy of the payload with keys quoted and trailing commas removed
  def coerce_javascript_object(payload)
    string = payload.dup

    # KISS approach: mutate common JS literal quirks instead of a full parser.
    strip_trailing_commas(quote_unquoted_keys(string))
  end

  # Wraps bare identifier keys (after `{`, `,`, `[` or at the start) in double quotes.
  def quote_unquoted_keys(jsonish)
    jsonish.gsub(/(\A\s*|[{,\[]\s*)([A-Za-z_]\w*)(\s*:)/) do
      "#{Regexp.last_match(1)}\"#{Regexp.last_match(2)}\"#{Regexp.last_match(3)}"
    end
  end

  # Drops a comma that directly precedes a closing `]` or `}`.
  def strip_trailing_commas(jsonish)
    jsonish.gsub(/,(\s*[\]}])/, '\1')
  end
end
161
+ private_constant :DocumentScanner
162
+
163
# Retrieves values from heterogeneous objects by probing multiple keys.
#
# Keys are tried in the given order, as String first and Symbol second. A key
# whose value is nil counts as absent so that later keys, and the nested
# `attributes` / `data` containers, still get a chance to supply the value.
module ValueFinder
  module_function

  # @param object [Hash, Array, Object] structure to search
  # @param keys [Array<#to_s>] candidate keys in priority order
  # @return [Object, nil] first value found; nil for anything that is not a Hash or Array
  def fetch(object, keys)
    case object
    when Hash then fetch_from_hash(object, keys)
    when Array then fetch_from_array(object, keys)
    end
  end

  # @param hash [Hash]
  # @param keys [Array<#to_s>]
  # @return [Object, nil] first non-nil value stored under a key, else the
  #   result of searching `attributes`, then `data`
  def fetch_from_hash(hash, keys)
    keys.each do |key|
      string_key = key.to_s

      [string_key, string_key.to_sym].each do |candidate|
        # Hash#fetch with an explicit default bypasses any default proc.
        value = hash.fetch(candidate, nil)
        return value unless value.nil?
      end
    end

    fetch_nested(hash[:attributes] || hash['attributes'], keys) ||
      fetch_nested(hash[:data] || hash['data'], keys)
  end

  # @param array [Array]
  # @param keys [Array<#to_s>]
  # @return [Object, nil] first truthy result among the entries
  def fetch_from_array(array, keys)
    array.each do |entry|
      result = fetch(entry, keys)
      return result if result
    end

    nil
  end

  # @param value [Object, nil] nested container, if any
  # @param keys [Array<#to_s>]
  # @return [Object, nil]
  def fetch_nested(value, keys)
    fetch(value, keys) if value
  end
end
200
+ private_constant :ValueFinder
201
+
202
# Identifies arrays that look like collections of article hashes.
module CandidateDetector
  module_function

  # Reports whether the document is, or contains at any depth, an array
  # holding at least one hash with both a title and a URL.
  #
  # @param document [Object] parsed JSON value
  # @return [Boolean]
  def candidate_array?(document)
    if document.is_a?(Array)
      array_of_articles?(document) || document.any? { |member| traversable_candidate?(member) }
    elsif document.is_a?(Hash)
      document.each_value.any? { |member| candidate_array?(member) }
    else
      false
    end
  end

  # Recurses only into containers; scalars can never hold articles.
  #
  # @param value [Object]
  # @return [Boolean]
  def traversable_candidate?(value)
    return false unless value.is_a?(Array) || value.is_a?(Hash)

    candidate_array?(value)
  end

  # @param array [Array]
  # @return [Boolean] true when any Hash element yields both a title and a URL
  def array_of_articles?(array)
    array.any? { |element| element.is_a?(Hash) && title_from(element) && url_from(element) }
  end

  # @return [Object, nil] value found under one of TITLE_KEYS
  def title_from(object)
    ValueFinder.fetch(object, TITLE_KEYS)
  end

  # @return [Object, nil] value found under one of URL_KEYS
  def url_from(object)
    ValueFinder.fetch(object, URL_KEYS)
  end
end
240
+ private_constant :CandidateDetector
241
+
242
# Shapes raw entries into the structure required downstream.
module ArticleNormalizer
  module_function

  # Builds the article hash for one raw entry.
  #
  # @param entry [Object] raw value; anything but a Hash is rejected
  # @param base_url [Object] base handed to Url.from_relative for link resolution
  # @return [Hash, nil] hash with the non-nil members of :title, :description, :url,
  #   :image, :published_at, :categories and :id; nil when the entry has no URL
  #   or neither a title nor a description
  # rubocop:disable Metrics/MethodLength
  def normalise(entry, base_url:)
    return unless entry.is_a?(Hash)

    title = string(ValueFinder.fetch(entry, TITLE_KEYS))
    description = string(ValueFinder.fetch(entry, DESCRIPTION_KEYS))
    article_url = resolve_link(entry, keys: URL_KEYS, base_url:,
                                      log_key: 'JsonState: invalid URL encountered')
    return unless article_url && (title || description)

    image = resolve_link(entry, keys: IMAGE_KEYS, base_url:,
                                log_key: 'JsonState: invalid image URL encountered')
    article = {
      title:,
      description:,
      url: article_url,
      image:,
      published_at: string(ValueFinder.fetch(entry, PUBLISHED_AT_KEYS)),
      categories: categories(entry),
      id: identifier(entry, article_url)
    }
    article.compact
  end
  # rubocop:enable Metrics/MethodLength

  # @param value [#to_s]
  # @return [String, nil] stripped string form, or nil when that is empty
  def string(value)
    text = value.to_s.strip
    text.empty? ? nil : text
  end

  # Looks up a link under the given keys and resolves it against base_url.
  # When the first lookup yields a Hash, the same keys are probed once more inside it.
  #
  # @return [Object, nil] result of Url.from_relative; nil when no link text is
  #   found or ArgumentError is raised
  def resolve_link(entry, keys:, base_url:, log_key:)
    candidate = ValueFinder.fetch(entry, keys)
    candidate = ValueFinder.fetch(candidate, keys) if candidate.is_a?(Hash)
    link = string(candidate)

    # NOTE(review): presumably Url.from_relative signals invalid input with
    # ArgumentError; its definition is not visible here — confirm.
    link && Url.from_relative(link, base_url)
  rescue ArgumentError
    # NOTE(review): Log is defined elsewhere; confirm its #debug accepts the
    # `url:` keyword (a stdlib Logger#debug would not).
    Log.debug(log_key, url: link)
    nil
  end

  # Collects unique, non-empty category names for the entry.
  #
  # @return [Array<String>, nil] names in first-seen order, or nil when there are none
  def categories(entry)
    raw = ValueFinder.fetch(entry, CATEGORY_KEYS)
    candidates =
      if raw.is_a?(Array) then raw
      elsif raw.is_a?(Hash) then raw.values
      elsif raw.is_a?(String) then [raw]
      else []
      end

    labels = candidates.filter_map do |candidate|
      # Hash members carry their label under name/title/label.
      candidate = ValueFinder.fetch(candidate, %w[name title label]) if candidate.is_a?(Hash)
      string(candidate)
    end.uniq

    labels unless labels.empty?
  end

  # @return [String] the entry's own identifier, falling back to the article URL
  def identifier(entry, article_url)
    raw_id = ValueFinder.fetch(entry, ID_KEYS)
    raw_id = ValueFinder.fetch(raw_id, ID_KEYS) if raw_id.is_a?(Hash)
    string(raw_id) || article_url.to_s
  end
end
317
+ private_constant :ArticleNormalizer
318
+
319
# @return [Symbol] the key identifying this scraper (`:json_state`)
def self.options_key
  :json_state
end
320
+
321
class << self
  # Reports whether any embedded JSON payload contains an article-like array.
  #
  # @param parsed_body [#css, nil] parsed document
  # @return [Boolean] false when there is no document
  def articles?(parsed_body)
    return false unless parsed_body

    documents = DocumentScanner.json_documents(parsed_body)
    documents.any? { |document| CandidateDetector.candidate_array?(document) }
  end

  # @param parsed_body [#css] parsed document
  # @return [Array<Object>] every JSON payload DocumentScanner finds in the document
  def json_documents(parsed_body)
    DocumentScanner.json_documents(parsed_body)
  end
end
332
+
333
# @param parsed_body [Object] parsed document to scan for JSON payloads
# @param url [Object] page URL, used as the base when resolving article links
# @param _opts [Hash] additional options; accepted and ignored
def initialize(parsed_body, url:, **_opts)
  @url = url
  @parsed_body = parsed_body
end
337
+
338
+ attr_reader :parsed_body
339
+
340
# Yields every normalised article found in the document's JSON payloads.
# Entries that normalise to nil are skipped.
#
# @yieldparam article [Hash]
# @return [Enumerator] when called without a block
def each
  return to_enum(:each) unless block_given?

  documents = DocumentScanner.json_documents(parsed_body)
  documents.each do |document|
    discover_articles(document) { |article| article && yield(article) }
  end
end
349
+
350
+ private
351
+
352
+ attr_reader :url
353
+
354
# Walks the document: arrays go to handle_array, hashes are searched value by
# value (containers only), and anything else is ignored.
#
# @param document [Object] parsed JSON value
# @yieldparam article [Hash, nil] normalised entry, forwarded from handle_array
def discover_articles(document, &block)
  if document.is_a?(Array)
    handle_array(document, &block)
  elsif document.is_a?(Hash)
    document.each_value do |child|
      discover_articles(child, &block) if traversable?(child)
    end
  end
end
360
+
361
# Yields the normalised form of every entry when the array looks like a list
# of articles; otherwise descends into the array's nested containers.
#
# @param array [Array]
# @yieldparam article [Hash, nil] result of ArticleNormalizer.normalise
def handle_array(array, &block)
  unless CandidateDetector.array_of_articles?(array)
    return array.each { |member| discover_articles(member, &block) if traversable?(member) }
  end

  array.each do |entry|
    yield(ArticleNormalizer.normalise(entry, base_url: url))
  end
end
370
+
371
# @param value [Object]
# @return [Boolean] true for containers (Array or Hash) that can be searched further
def traversable?(value)
  case value
  when Array, Hash then true
  else false
  end
end
374
+ end
375
+ end
376
+ end
377
+ end