html2rss 0.18.0 → 0.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +60 -1
  3. data/lib/html2rss/articles/deduplicator.rb +1 -0
  4. data/lib/html2rss/auto_source/cleanup.rb +11 -0
  5. data/lib/html2rss/auto_source/scraper/html.rb +5 -0
  6. data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
  7. data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
  8. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
  9. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
  10. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
  11. data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
  12. data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
  13. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
  14. data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
  15. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
  16. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
  17. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
  18. data/lib/html2rss/auto_source/scraper.rb +19 -1
  19. data/lib/html2rss/auto_source.rb +4 -0
  20. data/lib/html2rss/blocked_surface.rb +1 -0
  21. data/lib/html2rss/category_extractor.rb +2 -2
  22. data/lib/html2rss/cli.rb +30 -6
  23. data/lib/html2rss/config/class_methods.rb +24 -35
  24. data/lib/html2rss/config/dynamic_params.rb +6 -4
  25. data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
  26. data/lib/html2rss/config/request_headers.rb +9 -3
  27. data/lib/html2rss/config/schema.rb +33 -1
  28. data/lib/html2rss/config/validator.rb +40 -2
  29. data/lib/html2rss/config.rb +19 -13
  30. data/lib/html2rss/error.rb +25 -0
  31. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  32. data/lib/html2rss/feed_pipeline.rb +127 -0
  33. data/lib/html2rss/hash_util.rb +101 -0
  34. data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
  35. data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
  36. data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
  37. data/lib/html2rss/html_extractor.rb +5 -0
  38. data/lib/html2rss/html_navigator.rb +8 -0
  39. data/lib/html2rss/json_feed_builder.rb +1 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +8 -3
  41. data/lib/html2rss/rendering/description_builder.rb +0 -1
  42. data/lib/html2rss/rendering/image_renderer.rb +17 -7
  43. data/lib/html2rss/rendering/media_renderer.rb +4 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
  45. data/lib/html2rss/rendering/video_renderer.rb +8 -3
  46. data/lib/html2rss/rendering.rb +11 -2
  47. data/lib/html2rss/request_controls.rb +16 -21
  48. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  49. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  50. data/lib/html2rss/request_service/context.rb +14 -2
  51. data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
  52. data/lib/html2rss/request_service/policy.rb +4 -0
  53. data/lib/html2rss/request_service/response.rb +9 -1
  54. data/lib/html2rss/request_service.rb +19 -0
  55. data/lib/html2rss/request_session/runtime_input.rb +16 -2
  56. data/lib/html2rss/request_session/runtime_policy.rb +7 -0
  57. data/lib/html2rss/request_session.rb +13 -9
  58. data/lib/html2rss/rss_builder/article.rb +22 -1
  59. data/lib/html2rss/rss_builder/channel.rb +11 -2
  60. data/lib/html2rss/rss_builder/enclosure.rb +15 -1
  61. data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
  62. data/lib/html2rss/rss_builder.rb +4 -0
  63. data/lib/html2rss/selectors/config.rb +1 -0
  64. data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
  65. data/lib/html2rss/selectors/extractors/href.rb +2 -0
  66. data/lib/html2rss/selectors/extractors/html.rb +1 -0
  67. data/lib/html2rss/selectors/extractors/static.rb +2 -1
  68. data/lib/html2rss/selectors/extractors/text.rb +1 -0
  69. data/lib/html2rss/selectors/extractors.rb +2 -1
  70. data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
  71. data/lib/html2rss/selectors/post_processors/base.rb +13 -7
  72. data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
  73. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
  74. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
  75. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
  76. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
  77. data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
  78. data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
  79. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
  80. data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
  81. data/lib/html2rss/selectors/post_processors/template.rb +3 -0
  82. data/lib/html2rss/selectors/post_processors.rb +5 -0
  83. data/lib/html2rss/selectors.rb +7 -0
  84. data/lib/html2rss/url.rb +27 -23
  85. data/lib/html2rss/version.rb +2 -1
  86. data/lib/html2rss.rb +15 -78
  87. data/schema/html2rss-config.schema.json +83 -1
  88. metadata +7 -2
@@ -7,31 +7,43 @@ module Html2rss
7
7
  class Microdata
8
8
  include Enumerable
9
9
 
10
+ # Selector matching nodes that define a microdata item scope.
10
11
  ITEM_SELECTOR = '[itemscope][itemtype]'
12
+ # Schema.org types supported for article extraction via Microdata.
11
13
  SUPPORTED_TYPES = (Schema::Thing::SUPPORTED_TYPES | Set['Product']).freeze
14
+ # Attribute names checked first for microdata property values.
12
15
  VALUE_ATTRIBUTES = %w[content datetime href src data value].freeze
13
16
 
17
+ # @return [Symbol] scraper config key
14
18
  def self.options_key = :microdata
15
19
 
16
20
  class << self
21
+ # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
17
22
  def articles?(parsed_body)
18
23
  supported_roots(parsed_body).any?
19
24
  end
20
25
 
26
+ # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
27
+ # @return [Array<Nokogiri::XML::Element>] top-level supported Microdata roots
21
28
  def supported_roots(parsed_body)
22
29
  return [] unless parsed_body
23
30
 
24
31
  parsed_body.css(ITEM_SELECTOR).select { supported_root?(_1) }
25
32
  end
26
33
 
34
+ # @param node [Nokogiri::XML::Element] itemscope candidate node
27
35
  def supported_root?(node)
28
36
  supported_type_name(node) && top_level_item?(node)
29
37
  end
30
38
 
39
+ # @param node [Nokogiri::XML::Element] itemscope candidate node
40
+ # @return [String, nil] supported schema type name when present
31
41
  def supported_type_name(node)
32
42
  normalized_types(node['itemtype']).find { SUPPORTED_TYPES.include?(_1) }
33
43
  end
34
44
 
45
+ # @param itemtype [String, nil] raw itemtype attribute value
46
+ # @return [Array<String>] normalized schema type names
35
47
  def normalized_types(itemtype)
36
48
  itemtype.to_s.split.filter_map do |value|
37
49
  type = value.split('/').last.to_s.split('#').last.to_s
@@ -39,6 +51,7 @@ module Html2rss
39
51
  end
40
52
  end
41
53
 
54
+ # @param node [Nokogiri::XML::Element] itemscope candidate node
42
55
  def top_level_item?(node)
43
56
  return false if node.attribute('itemprop')
44
57
 
@@ -53,6 +66,7 @@ module Html2rss
53
66
  # the parsed response body to inspect for top-level Microdata items.
54
67
  # @param url [Html2rss::Url] the absolute page URL used to resolve relative links.
55
68
  # @param _opts [Hash] unused scraper-specific options.
69
+ # @option _opts [Object] :_reserved reserved for future scraper-specific options
56
70
  # @return [void]
57
71
  def initialize(parsed_body, url:, **_opts)
58
72
  @parsed_body = parsed_body
@@ -62,7 +76,7 @@ module Html2rss
62
76
  ##
63
77
  # Iterates over normalized article hashes extracted from supported Microdata roots.
64
78
  #
65
- # @yieldparam article [Hash<Symbol, Object>] the normalized article attributes.
79
+ # @yieldparam article [Hash{Symbol => Object}] the normalized article attributes.
66
80
  # @return [Enumerator, void] an enumerator when no block is given.
67
81
  def each
68
82
  return enum_for(:each) unless block_given?
@@ -77,6 +91,8 @@ module Html2rss
77
91
 
78
92
  attr_reader :parsed_body, :url
79
93
 
94
+ # @param root [Nokogiri::XML::Element] supported Microdata root node
95
+ # @return [Hash{Symbol => Object}, nil] normalized article hash
80
96
  def article_from(root)
81
97
  schema_object = SchemaObjectBuilder.call(root)
82
98
  return unless schema_object
@@ -87,6 +103,8 @@ module Html2rss
87
103
  article
88
104
  end
89
105
 
106
+ # @param article [Hash{Symbol => Object}] normalized article hash
107
+ # @return [Boolean] whether article contains required fields
90
108
  def valid_article?(article)
91
109
  return false unless article[:url]
92
110
 
@@ -97,12 +115,17 @@ module Html2rss
97
115
  module ItemParser
98
116
  module_function
99
117
 
118
+ # @param root [Nokogiri::XML::Element] microdata root node
119
+ # @return [Hash{Symbol => Object}] extracted direct properties
100
120
  def call(root)
101
121
  {}.tap do |properties|
102
122
  direct_properties(root).each { append_properties!(properties, _1) }
103
123
  end
104
124
  end
105
125
 
126
+ # @param properties [Hash{Symbol => Object}] accumulator hash for parsed properties
127
+ # @param node [Nokogiri::XML::Element] itemprop node
128
+ # @return [void]
106
129
  def append_properties!(properties, node)
107
130
  value = property_value(node)
108
131
  return if blank_value?(value)
@@ -112,16 +135,23 @@ module Html2rss
112
135
  end
113
136
  end
114
137
 
138
+ # @param root [Nokogiri::XML::Element] microdata root node
139
+ # @return [Array<Nokogiri::XML::Element>] direct property nodes for the root
115
140
  def direct_properties(root)
116
141
  root.css('[itemprop]').select { direct_property?(root, _1) }
117
142
  end
118
143
 
144
+ # @param root [Nokogiri::XML::Element] microdata root node
145
+ # @param node [Nokogiri::XML::Element] candidate itemprop node
146
+ # @return [Boolean] whether the node belongs directly to the current root item
119
147
  def direct_property?(root, node)
120
148
  return false if node == root
121
149
 
122
150
  node.ancestors.take_while { _1 != root }.none? { |ancestor| ancestor.attribute('itemscope') }
123
151
  end
124
152
 
153
+ # @param node [Nokogiri::XML::Element] itemprop node
154
+ # @return [Array<String>] normalized property names
125
155
  def property_names(node)
126
156
  node['itemprop'].to_s.split.filter_map do |name|
127
157
  stripped = name.strip
@@ -129,6 +159,8 @@ module Html2rss
129
159
  end
130
160
  end
131
161
 
162
+ # @param node [Nokogiri::XML::Element] itemprop node
163
+ # @return [Object, nil] parsed property value
132
164
  def property_value(node)
133
165
  value = if node.attribute('itemscope')
134
166
  nested_item(node)
@@ -139,6 +171,8 @@ module Html2rss
139
171
  value unless blank_value?(value)
140
172
  end
141
173
 
174
+ # @param node [Nokogiri::XML::Element] nested itemscope node
175
+ # @return [Hash{Symbol => Object}] nested parsed microdata item
142
176
  def nested_item(node)
143
177
  item = call(node)
144
178
  itemtype = node['itemtype']
@@ -148,6 +182,8 @@ module Html2rss
148
182
  item
149
183
  end
150
184
 
185
+ # @param node [Nokogiri::XML::Element] itemprop node
186
+ # @return [String, nil] first present attribute value
151
187
  def attribute_value(node)
152
188
  VALUE_ATTRIBUTES.each do |attribute|
153
189
  value = node[attribute]
@@ -157,11 +193,17 @@ module Html2rss
157
193
  nil
158
194
  end
159
195
 
196
+ # @param node [Nokogiri::XML::Element] itemprop node
197
+ # @return [String, nil] normalized text content
160
198
  def text_value(node)
161
199
  value = node.text.to_s.strip
162
200
  value unless value.empty?
163
201
  end
164
202
 
203
+ # @param properties [Hash{Symbol => Object}] accumulator hash for parsed properties
204
+ # @param key [Symbol] target property key
205
+ # @param value [Object] parsed property value to assign for the key
206
+ # @return [void]
165
207
  def append(properties, key, value)
166
208
  return if blank_value?(value)
167
209
 
@@ -173,6 +215,8 @@ module Html2rss
173
215
  properties[key] = Array(properties[key]) << value
174
216
  end
175
217
 
218
+ # @param value [Object] candidate value
219
+ # @return [Boolean] whether value is blank for microdata extraction purposes
176
220
  def blank_value?(value)
177
221
  case value
178
222
  when nil then true
@@ -182,6 +226,8 @@ module Html2rss
182
226
  end
183
227
  end
184
228
 
229
+ # @param value [Object] candidate value
230
+ # @return [Boolean] whether value is present for microdata extraction purposes
185
231
  def present?(value)
186
232
  !blank_value?(value)
187
233
  end
@@ -192,6 +238,8 @@ module Html2rss
192
238
  module ValueNormalizer
193
239
  module_function
194
240
 
241
+ # @param values [Array<Object>] value candidates
242
+ # @return [String, nil] first URL-like value converted to string
195
243
  def url_value(*values)
196
244
  values.each do |value|
197
245
  candidate = extract_nested_value(value, :url, :@id)
@@ -201,6 +249,8 @@ module Html2rss
201
249
  nil
202
250
  end
203
251
 
252
+ # @param values [Array<Object>] value candidates
253
+ # @return [String, Hash, nil] first normalized image candidate
204
254
  def image_value(*values)
205
255
  values.each do |value|
206
256
  candidate = normalize_image(value)
@@ -210,6 +260,8 @@ module Html2rss
210
260
  nil
211
261
  end
212
262
 
263
+ # @param value [Object] image candidate value
264
+ # @return [String, Hash, nil] normalized image-like value
213
265
  def normalize_image(value)
214
266
  candidate = unwrap(value)
215
267
  return unless present?(candidate)
@@ -219,6 +271,8 @@ module Html2rss
219
271
  candidate.to_s
220
272
  end
221
273
 
274
+ # @param value [Object] about candidate value
275
+ # @return [Array<String, Hash>, nil] normalized about values
222
276
  def normalize_about(value)
223
277
  candidate = unwrap(value)
224
278
  items = candidate.is_a?(Array) ? candidate : [candidate]
@@ -226,6 +280,8 @@ module Html2rss
226
280
  values unless values.empty?
227
281
  end
228
282
 
283
+ # @param item [Object] single about item
284
+ # @return [String, Hash, nil] normalized about item
229
285
  def normalize_about_item(item)
230
286
  case item
231
287
  when Hash
@@ -235,6 +291,8 @@ module Html2rss
235
291
  end
236
292
  end
237
293
 
294
+ # @param value [Object] scalar or array candidate
295
+ # @return [String, Array<String>, nil] normalized scalar or string array
238
296
  def string_or_array(value)
239
297
  candidate = unwrap(value)
240
298
  return unless present?(candidate)
@@ -245,15 +303,21 @@ module Html2rss
245
303
  result unless result.empty?
246
304
  end
247
305
 
306
+ # @param values [Array<Object>] value candidates
307
+ # @return [Array<String>, nil] normalized unique string values
248
308
  def array_value(*values)
249
309
  result = values.flat_map { string_values(Array(unwrap(_1))) }.uniq
250
310
  result unless result.empty?
251
311
  end
252
312
 
313
+ # @param values [Array<Object>] candidate scalar values collected from microdata arrays
314
+ # @return [Array<String>] normalized string values
253
315
  def string_values(values)
254
316
  values.filter_map { stringify(_1) }
255
317
  end
256
318
 
319
+ # @param values [Array<Object>] value candidates
320
+ # @return [String, nil] first present string-like value
257
321
  def first_string(*values)
258
322
  values.each do |value|
259
323
  candidate = stringify(unwrap(value))
@@ -263,6 +327,9 @@ module Html2rss
263
327
  nil
264
328
  end
265
329
 
330
+ # @param value [Object] nested container or scalar
331
+ # @param keys [Array<Symbol>] nested keys to probe in order
332
+ # @return [Object, nil] first matching nested value
266
333
  def extract_nested_value(value, *keys)
267
334
  candidate = unwrap(value)
268
335
  return candidate unless candidate.is_a?(Hash)
@@ -275,10 +342,14 @@ module Html2rss
275
342
  nil
276
343
  end
277
344
 
345
+ # @param value [Object] scalar or array candidate
346
+ # @return [Object] first array element or the original value
278
347
  def unwrap(value)
279
348
  value.is_a?(Array) ? value.first : value
280
349
  end
281
350
 
351
+ # @param value [Object] scalar candidate normalized to string output
352
+ # @return [String, nil] normalized string representation
282
353
  def stringify(value)
283
354
  return unless present?(value)
284
355
  return value if value.is_a?(String)
@@ -287,6 +358,8 @@ module Html2rss
287
358
  value.to_s
288
359
  end
289
360
 
361
+ # @param value [Object] candidate value
362
+ # @return [Boolean] whether value is present
290
363
  def present?(value)
291
364
  case value
292
365
  when nil then false
@@ -304,6 +377,8 @@ module Html2rss
304
377
 
305
378
  extend ValueNormalizer
306
379
 
380
+ # @param root [Nokogiri::XML::Element] supported microdata root node
381
+ # @return [Hash{Symbol => Object}, nil] compact schema-like object
307
382
  def call(root)
308
383
  type = Microdata.supported_type_name(root)
309
384
  return unless type
@@ -311,12 +386,20 @@ module Html2rss
311
386
  compact_object(type, root, ItemParser.call(root))
312
387
  end
313
388
 
389
+ # @param type [String] schema type inferred from itemtype
390
+ # @param root [Nokogiri::XML::Element] supported microdata root node
391
+ # @param properties [Hash{Symbol => Object}] parsed microdata properties
392
+ # @return [Hash{Symbol => Object}] normalized schema-like object
314
393
  def compact_object(type, root, properties)
315
394
  object = base_attributes(type, root, properties)
316
395
  merge_categories!(object, properties)
317
396
  object.compact
318
397
  end
319
398
 
399
+ # @param type [String] schema type inferred from itemtype
400
+ # @param root [Nokogiri::XML::Element] supported microdata root node
401
+ # @param properties [Hash{Symbol => Object}] parsed microdata properties
402
+ # @return [Hash{Symbol => Object}] base schema attributes before category merging
320
403
  def base_attributes(type, root, properties)
321
404
  identifier = first_string(root['itemid'], properties.delete(:identifier))
322
405
 
@@ -328,10 +411,14 @@ module Html2rss
328
411
  .merge(media_attributes(properties))
329
412
  end
330
413
 
414
+ # @param properties [Hash{Symbol => Object}] parsed microdata properties
415
+ # @return [String, nil] normalized title
331
416
  def title(properties)
332
417
  first_string(properties.delete(:headline), properties.delete(:title), properties.delete(:name))
333
418
  end
334
419
 
420
+ # @param properties [Hash{Symbol => Object}] parsed microdata properties
421
+ # @return [Hash{Symbol => Object}] normalized text attributes
335
422
  def text_attributes(properties)
336
423
  {
337
424
  title: title(properties),
@@ -342,18 +429,26 @@ module Html2rss
342
429
  }
343
430
  end
344
431
 
432
+ # @param properties [Hash{Symbol => Object}] parsed microdata properties
433
+ # @param identifier [String, nil] identifier candidate for fallback URL handling
434
+ # @return [Hash{Symbol => Object}] normalized link attributes
345
435
  def link_attributes(properties, identifier)
346
436
  {
347
437
  url: url(properties, identifier)
348
438
  }
349
439
  end
350
440
 
441
+ # @param properties [Hash{Symbol => Object}] parsed microdata properties
442
+ # @return [Hash{Symbol => Object}] normalized media attributes
351
443
  def media_attributes(properties)
352
444
  {
353
445
  image: image_value(properties.delete(:image), properties.delete(:thumbnailUrl))
354
446
  }
355
447
  end
356
448
 
449
+ # @param properties [Hash{Symbol => Object}] parsed microdata properties
450
+ # @param fallback_id [String, nil] identifier candidate for fallback URL handling
451
+ # @return [String, nil] normalized URL candidate
357
452
  def url(properties, fallback_id)
358
453
  url_value(
359
454
  properties.delete(:url),
@@ -362,6 +457,8 @@ module Html2rss
362
457
  )
363
458
  end
364
459
 
460
+ # @param fallback_id [String, nil] identifier candidate for fallback URL handling
461
+ # @return [String, nil] fallback URL candidate when identifier looks URL-like
365
462
  def url_fallback(fallback_id)
366
463
  value = first_string(fallback_id)
367
464
  return unless value
@@ -371,6 +468,8 @@ module Html2rss
371
468
  nil
372
469
  end
373
470
 
471
+ # @param properties [Hash{Symbol => Object}] parsed microdata properties
472
+ # @return [String, nil] normalized published-at value
374
473
  def published_at(properties)
375
474
  first_string(
376
475
  properties.delete(:datePublished),
@@ -380,6 +479,9 @@ module Html2rss
380
479
  )
381
480
  end
382
481
 
482
+ # @param object [Hash{Symbol => Object}] schema-like output object
483
+ # @param properties [Hash{Symbol => Object}] parsed microdata properties
484
+ # @return [void]
383
485
  def merge_categories!(object, properties)
384
486
  categories = array_value(properties.delete(:categories), properties.delete(:articleSection))
385
487
  assign_if_present(object, :categories, categories)
@@ -388,6 +490,10 @@ module Html2rss
388
490
  assign_if_present(object, :about, normalize_about(properties.delete(:about)))
389
491
  end
390
492
 
493
+ # @param object [Hash{Symbol => Object}] schema-like output object
494
+ # @param key [Symbol] target attribute key
495
+ # @param value [Object] value to assign when present
496
+ # @return [void]
391
497
  def assign_if_present(object, key, value)
392
498
  object[key] = value if value
393
499
  end
@@ -90,7 +90,7 @@ module Html2rss
90
90
  ##
91
91
  # Extracts categories from a string by splitting on separators.
92
92
  #
93
- # @param string [String] The string to process
93
+ # @param string [String] source string that may contain category delimiters
94
94
  # @return [Set<String>] Set of category strings
95
95
  def self.extract_string_categories(string)
96
96
  Set.new(string.split(/[,;|]/).map(&:strip).reject(&:empty?))
@@ -11,6 +11,7 @@ module Html2rss
11
11
  #
12
12
  # @see https://schema.org/ItemList
13
13
  class ItemList < Thing
14
+ # Schema.org type names handled by the ItemList extractor.
14
15
  SUPPORTED_TYPES = Set['ItemList']
15
16
 
16
17
  # @return [Array<Hash>] the scraped article hashes with DEFAULT_ATTRIBUTES
@@ -5,11 +5,13 @@ module Html2rss
5
5
  module Scraper
6
6
  class Schema
7
7
  ##
8
- #
9
8
  # @see https://schema.org/ListItem
10
9
  class ListItem < Thing
10
+ # @return [String, nil] stable list-item identifier
11
11
  def id = (id = (schema_object.dig(:item, :@id) || super).to_s).empty? ? nil : id
12
+ # @return [String, nil] list-item title
12
13
  def title = schema_object.dig(:item, :name) || super || url&.titleized
14
+ # @return [String, nil] list-item description
13
15
  def description = schema_object.dig(:item, :description) || super
14
16
 
15
17
  # @return [Html2rss::Url, nil]
@@ -11,6 +11,7 @@ module Html2rss
11
11
  #
12
12
  # @see https://schema.org/Thing
13
13
  class Thing
14
+ # Supported Schema.org `@type` values mapped to article extraction.
14
15
  SUPPORTED_TYPES = %w[
15
16
  AdvertiserContentArticle
16
17
  AnalysisNewsArticle
@@ -32,8 +33,11 @@ module Html2rss
32
33
  TechArticle
33
34
  ].to_set.freeze
34
35
 
36
+ # Attributes exposed by `#call` in generated article hashes.
35
37
  DEFAULT_ATTRIBUTES = %i[id title description url image published_at categories].freeze
36
38
 
39
+ # @param schema_object [Hash{Symbol => Object}] parsed schema.org object
40
+ # @param url [String, Html2rss::Url, nil] base URL used for relative normalization
37
41
  def initialize(schema_object, url:)
38
42
  @schema_object = schema_object
39
43
  @base_url = normalized_base_url(url)
@@ -46,6 +50,7 @@ module Html2rss
46
50
  end
47
51
  end
48
52
 
53
+ # @return [String, nil] stable schema object identifier
49
54
  def id
50
55
  return @id if defined?(@id)
51
56
 
@@ -56,8 +61,10 @@ module Html2rss
56
61
  @id = id
57
62
  end
58
63
 
64
+ # @return [String, nil] article title
59
65
  def title = schema_object[:title]
60
66
 
67
+ # @return [String, nil] longest available description field
61
68
  def description
62
69
  schema_object.values_at(:description, :schema_object_body, :abstract)
63
70
  .max_by { |string| string.to_s.size }
@@ -74,14 +81,17 @@ module Html2rss
74
81
  Url.from_relative(url, base_url || url)
75
82
  end
76
83
 
84
+ # @return [Html2rss::Url, nil] normalized article image URL
77
85
  def image
78
86
  if (image_url = image_urls.first)
79
87
  Url.from_relative(image_url, base_url || image_url)
80
88
  end
81
89
  end
82
90
 
91
+ # @return [String, nil] published-at timestamp string
83
92
  def published_at = schema_object[:datePublished]
84
93
 
94
+ # @return [Array<String>, nil] extracted category labels
85
95
  def categories
86
96
  return @categories if defined?(@categories)
87
97
 
@@ -90,6 +100,7 @@ module Html2rss
90
100
 
91
101
  attr_reader :schema_object, :base_url
92
102
 
103
+ # @return [Array<String>] normalized image URL candidates
93
104
  def image_urls
94
105
  schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
95
106
  next unless object
@@ -102,6 +113,9 @@ module Html2rss
102
113
  end
103
114
  end
104
115
 
116
+ # @param value [String, Symbol, nil] candidate schema identifier
117
+ # @param reference_url [Html2rss::Url, nil] URL used for same-origin normalization
118
+ # @return [String, nil] normalized identifier value
105
119
  def normalized_id(value, reference_url:)
106
120
  text = value.to_s
107
121
  return if text.empty?
@@ -114,6 +128,9 @@ module Html2rss
114
128
  text
115
129
  end
116
130
 
131
+ # @param text [String] raw identifier text
132
+ # @param reference_url [Html2rss::Url, nil] URL used to resolve relative IDs
133
+ # @return [Html2rss::Url] normalized identifier URL
117
134
  def normalized_id_url(text, reference_url:)
118
135
  if text.start_with?('/')
119
136
  Url.from_relative(text, reference_url || text)
@@ -122,6 +139,8 @@ module Html2rss
122
139
  end
123
140
  end
124
141
 
142
+ # @param url [Html2rss::Url] normalized identifier URL
143
+ # @return [String, nil] path/query portion used as stable ID
125
144
  def normalized_id_value(url)
126
145
  path = url.path.to_s
127
146
  return "#{path}?#{url.query}" if (path.empty? || path == '/') && !url.query.to_s.empty?
@@ -130,6 +149,8 @@ module Html2rss
130
149
  url.query
131
150
  end
132
151
 
152
+ # @param url [String, Html2rss::Url, nil] candidate page URL
153
+ # @return [Html2rss::Url, nil] normalized absolute URL for schema resolution
133
154
  def normalized_base_url(url)
134
155
  return if url.to_s.strip.empty?
135
156
 
@@ -8,24 +8,28 @@ module Html2rss
8
8
  module Scraper
9
9
  ##
10
10
  # Scrapes articles from Schema.org objects, by looking for the objects in:
11
-
12
11
  # <script type="application/ld+json"> "schema" tags.
13
12
  #
14
- # See:
15
- # 1. https://schema.org/docs/full.html
16
- # 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
13
+ # @see https://schema.org/docs/full.html
14
+ # @see https://developers.google.com/search/docs/appearance/structured-data/article#microdata
17
15
  class Schema
18
16
  include Enumerable
19
17
 
18
+ # Selector for JSON-LD script tags containing Schema.org objects.
20
19
  TAG_SELECTOR = 'script[type="application/ld+json"]'
21
20
 
21
+ # @return [Symbol] scraper config key
22
22
  def self.options_key = :schema
23
23
 
24
24
  class << self
25
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
26
+ # @return [Boolean] whether the page includes supported schema types
25
27
  def articles?(parsed_body)
26
28
  parsed_body.css(TAG_SELECTOR).any? { |script| supported_schema_type?(script) }
27
29
  end
28
30
 
31
+ # @param script [Nokogiri::XML::Element] schema JSON-LD script tag
32
+ # @return [Boolean] whether the tag references a supported schema type
29
33
  def supported_schema_type?(script)
30
34
  supported_types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES
31
35
  supported_types.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
@@ -52,11 +56,14 @@ module Html2rss
52
56
  end
53
57
  end
54
58
 
59
+ # @param object [Hash{Symbol => Object}] schema candidate object
60
+ # @return [Boolean] whether an extractor exists for the candidate object
55
61
  def supported_schema_object?(object)
56
62
  scraper_for_schema_object(object) ? true : false
57
63
  end
58
64
 
59
65
  ##
66
+ # @param schema_object [Hash{Symbol => Object}] schema object with an @type key
60
67
  # @return [Scraper::Schema::Thing, Scraper::Schema::ItemList, nil] a class responding to `#call`
61
68
  def scraper_for_schema_object(schema_object)
62
69
  type = schema_object[:@type]
@@ -81,6 +88,10 @@ module Html2rss
81
88
  end
82
89
  end
83
90
 
91
+ # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
92
+ # @param url [String, Html2rss::Url] base page URL
93
+ # @param opts [Hash] scraper-specific options
94
+ # @option opts [Object] :_reserved reserved for future scraper-specific options
84
95
  def initialize(parsed_body, url:, **opts)
85
96
  @parsed_body = parsed_body
86
97
  @url = url
@@ -25,7 +25,9 @@ module Html2rss
25
25
  :score
26
26
  )
27
27
 
28
+ # Comma-separated heading selector used for heading/anchor matching.
28
29
  HEADING_SELECTOR = HtmlExtractor::HEADING_TAGS.join(',').freeze
30
+ # Path segments that usually represent utility navigation rather than article content.
29
31
  UTILITY_PATH_SEGMENTS = %w[
30
32
  about account author category comment comments contact feedback help
31
33
  login newsletter profile register search settings share signup subscribe
@@ -40,11 +42,14 @@ module Html2rss
40
42
  logout
41
43
  user users
42
44
  ].to_set.freeze
45
+ # Path segments that signal content-like destinations.
43
46
  CONTENT_PATH_SEGMENTS = %w[
44
47
  article articles news post posts story stories update updates
45
48
  ].to_set.freeze
49
+ # Ancestor tags that usually indicate navigation/utility regions.
46
50
  UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze
47
51
 
52
+ # @param base_url [String, Html2rss::Url] page URL used to normalize href destinations
48
53
  def initialize(base_url)
49
54
  @base_url = base_url
50
55
  end
@@ -20,8 +20,10 @@ module Html2rss
20
20
  class SemanticHtml
21
21
  include Enumerable
22
22
 
23
+ # Container plus selected anchor chosen for extraction.
23
24
  Entry = Data.define(:container, :selected_anchor)
24
25
 
26
+ # Candidate semantic container selectors used to locate extractable blocks.
25
27
  CONTAINER_SELECTORS = [
26
28
  'article:not(:has(article))',
27
29
  'section:not(:has(section))',
@@ -45,6 +47,8 @@ module Html2rss
45
47
  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
46
48
  # @param url [String, Html2rss::Url] base url
47
49
  # @param extractor [Class] extractor class used for article extraction
50
+ # @param _opts [Hash] scraper-specific options
51
+ # @option _opts [Object] :_reserved reserved for future scraper-specific options
48
52
  def initialize(parsed_body, url:, extractor: HtmlExtractor, **_opts)
49
53
  @parsed_body = parsed_body
50
54
  @url = url