html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127):
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,457 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module Html2rss
6
+ class AutoSource
7
+ module Scraper
8
+ ##
9
+ # Scrapes JSON state blobs embedded in script tags such as Next.js, Nuxt,
10
+ # or custom window globals. The scraper searches `<script type="application/json">`
11
+ # tags and well-known JavaScript globals for arrays of article-like hashes
12
+ # and normalises them to a structure compatible with HtmlExtractor.
13
+ class JsonState
14
+ include Enumerable
15
+
16
+ # Selector for JSON-only script tags.
17
+ JSON_SCRIPT_SELECTOR = 'script[type="application/json"]'
18
+ # Regex patterns for known global JavaScript state assignments.
19
+ GLOBAL_ASSIGNMENT_PATTERNS = [
20
+ /(?:window|self|globalThis)\.__NEXT_DATA__\s*=\s*/m,
21
+ /(?:window|self|globalThis)\.__NUXT__\s*=\s*/m,
22
+ /(?:window|self|globalThis)\.STATE\s*=\s*/m,
23
+ /(?:window|self|globalThis)\.__REDUX_STATE__\s*=\s*/m,
24
+ /(?:window|self|globalThis)\.__PRELOADED_STATE__\s*=\s*/m,
25
+ /(?:window|self|globalThis)\.__APOLLO_STATE__\s*=\s*/m,
26
+ /(?:window|self|globalThis)\.__remixContext\s*=\s*/m,
27
+ /(?:window|self|globalThis)\.__sveltekit_data\s*=\s*/m,
28
+ /(?:window|self|globalThis)\.GATSBY_STATE\s*=\s*/m,
29
+ /(?:window|self|globalThis)\.__ember_meta\s*=\s*/m,
30
+ /(?:window|self|globalThis)\.angular\s*=\s*/m
31
+ ].freeze
32
+
33
+ # Preferred keys when extracting title-like values from state payloads.
34
+ TITLE_KEYS = %i[title headline name text].freeze
35
+ # Preferred keys when extracting URL-like values from state payloads.
36
+ URL_KEYS = %i[url link href permalink slug path canonicalUrl shortUrl].freeze
37
+ # Preferred keys when extracting description-like values from state payloads.
38
+ DESCRIPTION_KEYS = %i[description summary excerpt dek subheading].freeze
39
+ # Preferred keys when extracting image-like values from state payloads.
40
+ IMAGE_KEYS = %i[image imageUrl thumbnailUrl thumbnail src featuredImage coverImage heroImage].freeze
41
+ # Preferred keys when extracting publication timestamps from state payloads.
42
+ PUBLISHED_AT_KEYS = %i[published_at publishedAt datePublished date publicationDate pubDate updatedAt updated_at
43
+ createdAt created_at].freeze
44
+ # Preferred keys when extracting category-like values from state payloads.
45
+ CATEGORY_KEYS = %i[categories tags section sections topic topics channel].freeze
46
+ # Preferred keys when extracting identifier-like values from state payloads.
47
+ ID_KEYS = %i[id guid uuid slug key].freeze
48
+
49
+ # Scans DOM nodes for JSON payloads containing article data.
50
+ module DocumentScanner
51
+ module_function
52
+
53
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
54
+ # @return [Array<Hash, Array>] parsed JSON documents discovered in scripts
55
+ def json_documents(parsed_body)
56
+ script_documents(parsed_body) + assignment_documents(parsed_body)
57
+ end
58
+
59
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
60
+ # @return [Array<Hash, Array>] JSON documents extracted from JSON script tags
61
+ def script_documents(parsed_body)
62
+ parsed_body.css(JSON_SCRIPT_SELECTOR).filter_map { parse_json(_1.text) }
63
+ end
64
+
65
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
66
+ # @return [Array<Hash, Array>] JSON documents extracted from global assignments
67
+ def assignment_documents(parsed_body)
68
+ parsed_body.css('script').filter_map { parse_assignment(_1.text) }
69
+ end
70
+
71
+ # @param text [String] script text that may contain a global assignment
72
+ # @return [Hash, Array, nil] parsed assignment payload when available
73
+ def parse_assignment(text)
74
+ payload = assignment_payload(text)
75
+ parse_json(payload) if payload
76
+ end
77
+
78
+ # @param text [String] script text to inspect for known assignment patterns
79
+ # @return [String, nil] extracted JSON-like assignment payload
80
+ def assignment_payload(text)
81
+ trimmed = text.to_s.strip
82
+ return if trimmed.empty?
83
+
84
+ GLOBAL_ASSIGNMENT_PATTERNS.each do |pattern|
85
+ next unless trimmed.match?(pattern)
86
+
87
+ payload = trimmed.sub(pattern, '')
88
+ return extract_assignment_payload(payload)
89
+ end
90
+
91
+ nil
92
+ end
93
+
94
+ # @param text [String] text potentially containing JSON-like payloads
95
+ # @return [String, nil] normalized assignment payload
96
+ def extract_assignment_payload(text)
97
+ extract_json_block(text) || text
98
+ end
99
+
100
+ # @param text [String] text potentially containing JSON blocks
101
+ # @return [String, nil] extracted JSON block spanning balanced brackets
102
+ def extract_json_block(text)
103
+ start_index = text.index(/[\[{]/)
104
+ return unless start_index
105
+
106
+ stop_index = scan_for_json_end(text, start_index)
107
+ text[start_index..stop_index] if stop_index
108
+ end
109
+
110
+ # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
111
+ # @param text [String] text starting with a JSON object/array opening token
112
+ # @param start_index [Integer] index where JSON-like content starts
113
+ # @return [Integer, nil] index where the balanced JSON payload ends
114
+ def scan_for_json_end(text, start_index)
115
+ stack = []
116
+ in_string = false
117
+ escape = false
118
+
119
+ text.each_char.with_index do |char, index|
120
+ next if index < start_index
121
+
122
+ if in_string
123
+ if escape
124
+ escape = false
125
+ elsif char == '\\'
126
+ escape = true
127
+ elsif char == '"'
128
+ in_string = false
129
+ end
130
+ next
131
+ end
132
+
133
+ case char
134
+ when '"'
135
+ in_string = true
136
+ when '{'
137
+ stack << '}'
138
+ when '['
139
+ stack << ']'
140
+ when '}', ']'
141
+ expected = stack.pop
142
+ return index if expected == char && stack.empty?
143
+ end
144
+ end
145
+
146
+ nil
147
+ end
148
+ # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
149
+
150
+ # @param payload [String, nil] JSON payload to parse
151
+ # @return [Hash, Array, nil] parsed payload or nil when parsing fails
152
+ def parse_json(payload)
153
+ return unless payload
154
+
155
+ JSON.parse(payload, symbolize_names: true)
156
+ rescue JSON::ParserError => error
157
+ parse_js_object(payload, error)
158
+ end
159
+
160
+ # @param payload [String] JavaScript object-literal payload
161
+ # @param _original_error [JSON::ParserError] original JSON parse error
162
+ # @return [Hash, Array, nil] parsed payload after JavaScript coercion
163
+ def parse_js_object(payload, _original_error)
164
+ coerced = coerce_javascript_object(payload)
165
+ return unless coerced
166
+
167
+ # Some sites emit JavaScript object literals (unquoted keys, trailing commas).
168
+ # Coerce those payloads into valid JSON so we keep the same parsing pipeline.
169
+ JSON.parse(coerced, symbolize_names: true)
170
+ rescue JSON::ParserError => error
171
+ Html2rss::Log.debug("#{name}: failed to parse coerced JavaScript object (#{error.message})")
172
+ nil
173
+ end
174
+
175
+ # @param payload [String] JavaScript object-literal payload
176
+ # @return [String] JSON-compatible payload string
177
+ def coerce_javascript_object(payload)
178
+ string = payload.dup
179
+
180
+ # KISS approach: mutate common JS literal quirks instead of a full parser.
181
+ strip_trailing_commas(quote_unquoted_keys(string))
182
+ end
183
+
184
+ # @param jsonish [String] JSON-like string with potentially unquoted keys
185
+ # @return [String] payload with unquoted object keys quoted
186
+ def quote_unquoted_keys(jsonish)
187
+ jsonish.gsub(/(\A\s*|[{,\[]\s*)([A-Za-z_]\w*)(\s*:)/) do
188
+ "#{Regexp.last_match(1)}\"#{Regexp.last_match(2)}\"#{Regexp.last_match(3)}"
189
+ end
190
+ end
191
+
192
+ # @param jsonish [String] JSON-like string with potential trailing commas
193
+ # @return [String] payload without trailing commas before closing tokens
194
+ def strip_trailing_commas(jsonish)
195
+ jsonish.gsub(/,(\s*[\]}])/, '\1')
196
+ end
197
+ end
198
+ private_constant :DocumentScanner
199
+
200
+ # Retrieves values from heterogeneous objects by probing multiple keys.
201
+ module ValueFinder
202
+ module_function
203
+
204
+ # @param object [Hash, Array] candidate container traversed during key lookup
205
+ # @param keys [Array<Symbol>] keys to probe in order
206
+ # @return [Object, nil] first matching value
207
+ def fetch(object, keys)
208
+ case object
209
+ when Hash then fetch_from_hash(object, keys)
210
+ when Array then fetch_from_array(object, keys)
211
+ end
212
+ end
213
+
214
+ # @param hash [Hash] hash candidate traversed during key lookup
215
+ # @param keys [Array<Symbol>] keys to probe in order
216
+ # @return [Object, nil] first matching value from hash or nested metadata
217
+ def fetch_from_hash(hash, keys)
218
+ keys.each do |key|
219
+ return hash[key] if hash.key?(key)
220
+ end
221
+
222
+ fetch_nested(hash[:attributes], keys) ||
223
+ fetch_nested(hash[:data], keys)
224
+ end
225
+
226
+ # @param array [Array] array whose entries may contain target keys
227
+ # @param keys [Array<Symbol>] keys to probe in order
228
+ # @return [Object, nil] first matching value from array entries
229
+ def fetch_from_array(array, keys)
230
+ array.each do |entry|
231
+ result = fetch(entry, keys)
232
+ return result if result
233
+ end
234
+
235
+ nil
236
+ end
237
+
238
+ # @param value [Hash, Array, nil] nested value to recurse into
239
+ # @param keys [Array<Symbol>] keys to probe in order
240
+ # @return [Object, nil] matching nested value
241
+ def fetch_nested(value, keys)
242
+ fetch(value, keys) if value
243
+ end
244
+ end
245
+ private_constant :ValueFinder
246
+
247
+ # Identifies arrays that look like collections of article hashes.
248
+ module CandidateDetector
249
+ module_function
250
+
251
+ # @param document [Hash, Array, Object] candidate document node
252
+ # @return [Boolean] whether the node contains article-like arrays
253
+ def candidate_array?(document)
254
+ case document
255
+ when Array
256
+ return true if array_of_articles?(document)
257
+
258
+ document.any? { traversable_candidate?(_1) }
259
+ when Hash then document.each_value.any? { candidate_array?(_1) }
260
+ else false
261
+ end
262
+ end
263
+
264
+ # @param value [Hash, Array, Object] candidate nested value
265
+ # @return [Boolean] whether nested value should be traversed for article candidates
266
+ def traversable_candidate?(value)
267
+ case value
268
+ when Array, Hash then candidate_array?(value)
269
+ else false
270
+ end
271
+ end
272
+
273
+ # @param array [Array<Object>] candidate list of entries
274
+ # @return [Boolean] whether array includes hash entries with title and URL fields
275
+ def array_of_articles?(array)
276
+ array.any? do |element|
277
+ next unless element.is_a?(Hash)
278
+
279
+ title_from(element) && url_from(element)
280
+ end
281
+ end
282
+
283
+ # @param object [Hash] article candidate object
284
+ # @return [Object, nil] detected title-like value
285
+ def title_from(object)
286
+ ValueFinder.fetch(object, TITLE_KEYS)
287
+ end
288
+
289
+ # @param object [Hash] article candidate object
290
+ # @return [Object, nil] detected URL-like value
291
+ def url_from(object)
292
+ ValueFinder.fetch(object, URL_KEYS)
293
+ end
294
+ end
295
+ private_constant :CandidateDetector
296
+
297
+ # Shapes raw entries into the structure required downstream.
298
+ module ArticleNormalizer
299
+ module_function
300
+
301
+ # rubocop:disable Metrics/MethodLength
302
+ # @param entry [Hash] raw article entry candidate
303
+ # @param base_url [String, Html2rss::Url] base URL for relative link resolution
304
+ # @return [Hash{Symbol => Object}, nil] normalized article hash for downstream extraction
305
+ def normalise(entry, base_url:)
306
+ return unless entry.is_a?(Hash)
307
+
308
+ title = string(ValueFinder.fetch(entry, TITLE_KEYS))
309
+ description = string(ValueFinder.fetch(entry, DESCRIPTION_KEYS))
310
+ article_url = resolve_link(entry, keys: URL_KEYS, base_url:,
311
+ log_key: 'JsonState: invalid URL encountered')
312
+ return unless article_url
313
+ return if title.nil? && description.nil?
314
+
315
+ {
316
+ title:,
317
+ description:,
318
+ url: article_url,
319
+ image: resolve_link(entry, keys: IMAGE_KEYS, base_url:,
320
+ log_key: 'JsonState: invalid image URL encountered'),
321
+ published_at: string(ValueFinder.fetch(entry, PUBLISHED_AT_KEYS)),
322
+ categories: categories(entry),
323
+ id: identifier(entry, article_url)
324
+ }.compact
325
+ end
326
+ # rubocop:enable Metrics/MethodLength
327
+
328
+ # @param value [Object] candidate scalar value
329
+ # @return [String, nil] normalized non-empty string value
330
+ def string(value)
331
+ trimmed = value.to_s.strip
332
+ trimmed unless trimmed.empty?
333
+ end
334
+
335
+ # @param entry [Hash] raw article entry candidate
336
+ # @param keys [Array<Symbol>] preferred link keys
337
+ # @param base_url [String, Html2rss::Url] base URL for relative link resolution
338
+ # @param log_key [String] structured log message key
339
+ # @return [Html2rss::Url, nil] resolved absolute URL
340
+ def resolve_link(entry, keys:, base_url:, log_key:)
341
+ value = ValueFinder.fetch(entry, keys)
342
+ value = ValueFinder.fetch(value, keys) if value.is_a?(Hash)
343
+ string = string(value)
344
+ return unless string
345
+
346
+ Url.from_relative(string, base_url)
347
+ rescue ArgumentError
348
+ Log.debug(log_key, url: string)
349
+ nil
350
+ end
351
+
352
+ # rubocop:disable Metrics/MethodLength
353
+ # @param entry [Hash] raw article entry candidate
354
+ # @return [Array<String>, nil] normalized unique categories
355
+ def categories(entry)
356
+ raw = ValueFinder.fetch(entry, CATEGORY_KEYS)
357
+ names = case raw
358
+ when Array then raw
359
+ when Hash then raw.values
360
+ when String then [raw]
361
+ else []
362
+ end
363
+
364
+ result = names.flat_map do |value|
365
+ case value
366
+ when Hash
367
+ string(ValueFinder.fetch(value, %i[name title label]))
368
+ else
369
+ string(value)
370
+ end
371
+ end.compact
372
+
373
+ result.uniq!
374
+ result unless result.empty?
375
+ end
376
+ # rubocop:enable Metrics/MethodLength
377
+
378
+ # @param entry [Hash] raw article entry candidate
379
+ # @param article_url [Html2rss::Url] resolved article URL
380
+ # @return [String] stable article identifier, falling back to the resolved URL
381
+ def identifier(entry, article_url)
382
+ value = ValueFinder.fetch(entry, ID_KEYS)
383
+ value = ValueFinder.fetch(value, ID_KEYS) if value.is_a?(Hash)
384
+ string(value) || article_url.to_s
385
+ end
386
+ end
387
+ private_constant :ArticleNormalizer
388
+
389
+ # @return [Symbol] scraper config key
390
+ def self.options_key = :json_state
391
+
392
+ class << self
393
+ # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
394
+ def articles?(parsed_body)
395
+ return false unless parsed_body
396
+
397
+ DocumentScanner.json_documents(parsed_body).any? { CandidateDetector.candidate_array?(_1) }
398
+ end
399
+
400
+ # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
401
+ # @return [Array<Hash, Array>] parsed JSON documents discovered in the response body
402
+ def json_documents(parsed_body)
403
+ DocumentScanner.json_documents(parsed_body)
404
+ end
405
+ end
406
+
407
+ # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
408
+ # @param url [String, Html2rss::Url] page URL used to resolve relative links
409
+ # @param _opts [Hash] scraper-specific options
410
+ # @option _opts [Object] :_reserved reserved for future scraper-specific options
411
+ def initialize(parsed_body, url:, **_opts)
412
+ @parsed_body = parsed_body
413
+ @url = url
414
+ end
415
+
416
+ attr_reader :parsed_body
417
+
418
+ # @yield [Hash{Symbol => Object}] normalized article hash
419
+ # @return [Enumerator, void] article enumerator when no block is given
420
+ def each
421
+ return enum_for(:each) unless block_given?
422
+
423
+ DocumentScanner.json_documents(parsed_body).each do |document|
424
+ discover_articles(document) do |article|
425
+ yield article if article
426
+ end
427
+ end
428
+ end
429
+
430
+ private
431
+
432
+ attr_reader :url
433
+
434
+ def discover_articles(document, &block)
435
+ case document
436
+ when Array then handle_array(document, &block)
437
+ when Hash then document.each_value { discover_articles(_1, &block) if traversable?(_1) }
438
+ end
439
+ end
440
+
441
+ def handle_array(array, &block)
442
+ if CandidateDetector.array_of_articles?(array)
443
+ array.each do |entry|
444
+ yield(ArticleNormalizer.normalise(entry, base_url: url))
445
+ end
446
+ else
447
+ array.each { discover_articles(_1, &block) if traversable?(_1) }
448
+ end
449
+ end
450
+
451
+ def traversable?(value)
452
+ value.is_a?(Array) || value.is_a?(Hash)
453
+ end
454
+ end
455
+ end
456
+ end
457
+ end