html2rss 0.17.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/README.md +90 -639
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +50 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +44 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +123 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
  20. data/lib/html2rss/auto_source/scraper.rb +160 -8
  21. data/lib/html2rss/auto_source.rb +123 -47
  22. data/lib/html2rss/blocked_surface.rb +65 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +194 -23
  25. data/lib/html2rss/config/class_methods.rb +178 -0
  26. data/lib/html2rss/config/dynamic_params.rb +70 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
  28. data/lib/html2rss/config/request_headers.rb +136 -0
  29. data/lib/html2rss/config/schema.rb +240 -0
  30. data/lib/html2rss/config/validator.rb +146 -0
  31. data/lib/html2rss/config.rb +118 -61
  32. data/lib/html2rss/error.rb +31 -0
  33. data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
  34. data/lib/html2rss/feed_pipeline.rb +127 -0
  35. data/lib/html2rss/hash_util.rb +101 -0
  36. data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
  37. data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
  38. data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
  39. data/lib/html2rss/html_extractor.rb +141 -0
  40. data/lib/html2rss/html_navigator.rb +54 -0
  41. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  42. data/lib/html2rss/json_feed_builder.rb +59 -0
  43. data/lib/html2rss/rendering/audio_renderer.rb +36 -0
  44. data/lib/html2rss/rendering/description_builder.rb +87 -0
  45. data/lib/html2rss/rendering/image_renderer.rb +41 -0
  46. data/lib/html2rss/rendering/media_renderer.rb +37 -0
  47. data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
  48. data/lib/html2rss/rendering/video_renderer.rb +36 -0
  49. data/lib/html2rss/rendering.rb +23 -0
  50. data/lib/html2rss/request_controls.rb +123 -0
  51. data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
  52. data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
  53. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  54. data/lib/html2rss/request_service/budget.rb +39 -0
  55. data/lib/html2rss/request_service/context.rb +77 -21
  56. data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
  57. data/lib/html2rss/request_service/policy.rb +252 -0
  58. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  59. data/lib/html2rss/request_service/response.rb +51 -3
  60. data/lib/html2rss/request_service/response_guard.rb +62 -0
  61. data/lib/html2rss/request_service.rb +50 -15
  62. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  63. data/lib/html2rss/request_session/runtime_input.rb +71 -0
  64. data/lib/html2rss/request_session/runtime_policy.rb +83 -0
  65. data/lib/html2rss/request_session.rb +122 -0
  66. data/lib/html2rss/rss_builder/article.rb +187 -0
  67. data/lib/html2rss/rss_builder/channel.rb +105 -11
  68. data/lib/html2rss/rss_builder/enclosure.rb +62 -0
  69. data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
  70. data/lib/html2rss/rss_builder.rb +76 -71
  71. data/lib/html2rss/selectors/config.rb +123 -0
  72. data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
  73. data/lib/html2rss/selectors/extractors/href.rb +55 -0
  74. data/lib/html2rss/selectors/extractors/html.rb +49 -0
  75. data/lib/html2rss/selectors/extractors/static.rb +42 -0
  76. data/lib/html2rss/selectors/extractors/text.rb +47 -0
  77. data/lib/html2rss/selectors/extractors.rb +53 -0
  78. data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
  79. data/lib/html2rss/selectors/post_processors/base.rb +80 -0
  80. data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
  81. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
  82. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
  83. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
  84. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
  85. data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
  86. data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
  87. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
  88. data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
  89. data/lib/html2rss/selectors/post_processors/template.rb +76 -0
  90. data/lib/html2rss/selectors/post_processors.rb +48 -0
  91. data/lib/html2rss/selectors.rb +301 -0
  92. data/lib/html2rss/url.rb +266 -0
  93. data/lib/html2rss/version.rb +2 -1
  94. data/lib/html2rss.rb +67 -71
  95. data/lib/tasks/config_schema.rake +17 -0
  96. data/schema/html2rss-config.schema.json +551 -0
  97. metadata +120 -38
  98. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  99. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  100. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  101. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  102. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  103. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  104. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  105. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  106. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  107. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  108. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  109. data/lib/html2rss/attribute_post_processors.rb +0 -44
  110. data/lib/html2rss/auto_source/article.rb +0 -127
  111. data/lib/html2rss/auto_source/channel.rb +0 -78
  112. data/lib/html2rss/auto_source/reducer.rb +0 -48
  113. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  114. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  115. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  116. data/lib/html2rss/config/channel.rb +0 -125
  117. data/lib/html2rss/config/selectors.rb +0 -103
  118. data/lib/html2rss/item.rb +0 -186
  119. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  120. data/lib/html2rss/item_extractors/href.rb +0 -52
  121. data/lib/html2rss/item_extractors/html.rb +0 -46
  122. data/lib/html2rss/item_extractors/static.rb +0 -39
  123. data/lib/html2rss/item_extractors/text.rb +0 -44
  124. data/lib/html2rss/item_extractors.rb +0 -88
  125. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  126. data/lib/html2rss/rss_builder/item.rb +0 -83
  127. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,505 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class AutoSource
5
+ module Scraper
6
+ # Scrapes Schema.org Microdata items embedded directly in HTML markup.
7
+ class Microdata
8
+ include Enumerable
9
+
10
+ # Selector matching nodes that define a microdata item scope.
11
+ ITEM_SELECTOR = '[itemscope][itemtype]'
12
+ # Schema.org types supported for article extraction via Microdata.
13
+ SUPPORTED_TYPES = (Schema::Thing::SUPPORTED_TYPES | Set['Product']).freeze
14
+ # Attribute names checked first for microdata property values.
15
+ VALUE_ATTRIBUTES = %w[content datetime href src data value].freeze
16
+
17
# Config key naming this scraper.
#
# @return [Symbol] always +:microdata+
def self.options_key
  :microdata
end
19
+
20
class << self
  # Tells whether the document contains at least one supported top-level item.
  #
  # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
  # @return [Boolean] true when {supported_roots} is not empty
  def articles?(parsed_body)
    !supported_roots(parsed_body).empty?
  end

  # Selects the nodes matching ITEM_SELECTOR that pass {supported_root?}.
  #
  # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
  # @return [Array<Nokogiri::XML::Element>] top-level supported Microdata roots
  def supported_roots(parsed_body)
    if parsed_body
      parsed_body.css(ITEM_SELECTOR).select { |candidate| supported_root?(candidate) }
    else
      []
    end
  end

  # A root must declare a supported type and must not be nested as a property.
  #
  # @param node [Nokogiri::XML::Element] itemscope candidate node
  # @return [Boolean, nil] truthy when the node qualifies as a root
  def supported_root?(node)
    return unless supported_type_name(node)

    top_level_item?(node)
  end

  # @param node [Nokogiri::XML::Element] itemscope candidate node
  # @return [String, nil] first type name from `itemtype` found in SUPPORTED_TYPES
  def supported_type_name(node)
    type_names = normalized_types(node['itemtype'])
    type_names.find { |type_name| SUPPORTED_TYPES.include?(type_name) }
  end

  # Reduces each whitespace-separated itemtype entry to the text after its
  # last `/` and last `#`, discarding entries that end up empty.
  #
  # @param itemtype [String, nil] raw itemtype attribute value
  # @return [Array<String>] normalized schema type names
  def normalized_types(itemtype)
    itemtype
      .to_s
      .split
      .map { |entry| entry.split('/').last.to_s.split('#').last.to_s }
      .reject(&:empty?)
  end

  # A node is top-level when it has no `itemprop` itself and no ancestor that
  # carries both `itemscope` and `itemprop`.
  #
  # @param node [Nokogiri::XML::Element] itemscope candidate node
  # @return [Boolean] whether the node is a top-level item
  def top_level_item?(node)
    nested = node.attribute('itemprop') ||
             node.ancestors.any? { |ancestor| ancestor.attribute('itemscope') && ancestor.attribute('itemprop') }

    !nested
  end
end
61
+
62
##
# Keeps the parsed document and the page URL for later extraction.
#
# @param parsed_body [Nokogiri::HTML5::Document, Nokogiri::HTML4::Document, Nokogiri::XML::Node, nil]
#   the parsed response body to inspect for top-level Microdata items.
# @param url [Html2rss::Url] the absolute page URL used to resolve relative links.
# @param _opts [Hash] scraper-specific options; accepted but not used.
# @return [void]
def initialize(parsed_body, url:, **_opts)
  @url = url
  @parsed_body = parsed_body
end
75
+
76
##
# Yields one normalized article hash per supported Microdata root, skipping
# roots for which no valid article could be built.
#
# @yieldparam article [Hash{Symbol => Object}] the normalized article attributes.
# @return [Enumerator, void] an enumerator when no block is given.
def each
  return to_enum(:each) unless block_given?

  roots = self.class.supported_roots(parsed_body)
  roots.each do |root|
    candidate = article_from(root)
    next unless candidate

    yield candidate
  end
end
89
+
90
+ private
91
+
92
+ attr_reader :parsed_body, :url
93
+
94
# Builds a schema-like object from the root, runs it through Schema::Thing,
# and keeps the compacted result only when it passes {valid_article?}.
#
# @param root [Nokogiri::XML::Element] supported Microdata root node
# @return [Hash{Symbol => Object}, nil] normalized article hash
def article_from(root)
  built = SchemaObjectBuilder.call(root)
  return unless built

  candidate = Schema::Thing.new(built, url: url).call.compact
  candidate if valid_article?(candidate)
end
105
+
106
# An article is usable when it has a URL and at least a title or a description.
#
# Always returns a strict boolean, never the title/description value itself.
#
# @param article [Hash{Symbol => Object}] normalized article hash
# @return [Boolean] whether article contains required fields
def valid_article?(article)
  return false unless article[:url]

  article.values_at(:title, :description).any?
end
113
+
114
# Extracts direct Microdata itemprop values for a single item root.
#
# All helpers are module functions. A property name seen once maps to its
# value; a name seen several times maps to an Array of its values. Nodes that
# carry `itemscope` are parsed recursively into nested Hashes.
module ItemParser
  module_function

  # Collects every direct property of the root into one Hash.
  #
  # @param root [Nokogiri::XML::Element] microdata root node
  # @return [Hash{Symbol => Object}] extracted direct properties
  def call(root)
    direct_properties(root).each_with_object({}) do |node, properties|
      append_properties!(properties, node)
    end
  end

  # Stores the node's value under every name listed in its `itemprop`.
  #
  # @param properties [Hash{Symbol => Object}] accumulator hash for parsed properties
  # @param node [Nokogiri::XML::Element] itemprop node
  # @return [void]
  def append_properties!(properties, node)
    value = property_value(node)
    return if blank_value?(value)

    property_names(node).each { |name| append(properties, name.to_sym, value) }
  end

  # @param root [Nokogiri::XML::Element] microdata root node
  # @return [Array<Nokogiri::XML::Element>] direct property nodes for the root
  def direct_properties(root)
    root.css('[itemprop]').select { |node| direct_property?(root, node) }
  end

  # A node is a direct property when no element between it and the root
  # opens its own `itemscope`.
  #
  # @param root [Nokogiri::XML::Element] microdata root node
  # @param node [Nokogiri::XML::Element] candidate itemprop node
  # @return [Boolean] whether the node belongs directly to the current root item
  def direct_property?(root, node)
    return false if node == root

    node.ancestors.take_while { |ancestor| ancestor != root }
        .none? { |ancestor| ancestor.attribute('itemscope') }
  end

  # @param node [Nokogiri::XML::Element] itemprop node
  # @return [Array<String>] whitespace-separated property names
  def property_names(node)
    # String#split without arguments already drops surrounding whitespace and
    # never yields empty tokens, so no further filtering is needed.
    node['itemprop'].to_s.split
  end

  # @param node [Nokogiri::XML::Element] itemprop node
  # @return [Object, nil] nested item Hash, attribute value, or text; nil when blank
  def property_value(node)
    value = if node.attribute('itemscope')
              nested_item(node)
            else
              attribute_value(node) || text_value(node)
            end

    value unless blank_value?(value)
  end

  # Parses a nested item and records its first normalized type and its itemid.
  #
  # @param node [Nokogiri::XML::Element] nested itemscope node
  # @return [Hash{Symbol => Object}] nested parsed microdata item
  def nested_item(node)
    item = call(node)
    itemtype = node['itemtype']
    itemid = node['itemid']
    item[:@type] = Microdata.normalized_types(itemtype).first if itemtype
    item[:@id] = itemid if present?(itemid)
    item
  end

  # @param node [Nokogiri::XML::Element] itemprop node
  # @return [String, nil] value of the first VALUE_ATTRIBUTES entry that is present
  def attribute_value(node)
    name = VALUE_ATTRIBUTES.find { |attribute| present?(node[attribute]) }
    node[name] if name
  end

  # @param node [Nokogiri::XML::Element] itemprop node
  # @return [String, nil] stripped text content, nil when empty
  def text_value(node)
    value = node.text.to_s.strip
    value unless value.empty?
  end

  # Adds a value for the key, promoting the entry to an Array on repetition.
  #
  # @param properties [Hash{Symbol => Object}] accumulator hash for parsed properties
  # @param key [Symbol] target property key
  # @param value [Object] parsed property value to assign for the key
  # @return [void]
  def append(properties, key, value)
    return if blank_value?(value)

    unless properties.key?(key)
      properties[key] = value
      return
    end

    existing = properties[key]
    # Kernel#Array would turn a nested-item Hash into [key, value] pairs, so
    # wrap non-Array values explicitly before appending.
    properties[key] = (existing.is_a?(Array) ? existing : [existing]) << value
  end

  # @param value [Object] candidate value
  # @return [Boolean] whether value is blank for microdata extraction purposes
  def blank_value?(value)
    case value
    when nil then true
    when String then value.strip.empty?
    when Array, Hash then value.empty?
    else false
    end
  end

  # @param value [Object] candidate value
  # @return [Boolean] whether value is present for microdata extraction purposes
  def present?(value)
    !blank_value?(value)
  end
end
235
+ private_constant :ItemParser
236
+
237
# Shared value normalization helpers for Microdata property conversion.
#
# Parsed properties may be scalars, nested-item Hashes, or Arrays of either
# (when an itemprop repeats). Single-value helpers look at the first element
# of an Array; multi-value helpers keep every element.
module ValueNormalizer
  module_function

  # @param values [Array<Object>] value candidates, probed in order
  # @return [String, nil] first URL-like value converted to string
  def url_value(*values)
    values.each do |value|
      # A nested `url`/`@id` may itself be repeated; take its first entry and
      # ignore containers rather than emitting their inspect output.
      candidate = stringify(unwrap(extract_nested_value(value, :url, :@id)))
      return candidate if candidate
    end

    nil
  end

  # @param values [Array<Object>] value candidates, probed in order
  # @return [String, Hash, nil] first normalized image candidate
  def image_value(*values)
    values.each do |value|
      candidate = normalize_image(value)
      return candidate if present?(candidate)
    end

    nil
  end

  # @param value [Object] image candidate value
  # @return [String, Hash, nil] normalized image-like value
  def normalize_image(value)
    candidate = unwrap(value)
    return unless present?(candidate)

    return candidate if candidate.is_a?(String) || candidate.is_a?(Hash)

    candidate.to_s
  end

  # Normalizes every `about` entry, not just the first one.
  #
  # @param value [Object] about candidate value (scalar, Hash, or Array of them)
  # @return [Array<String, Hash>, nil] normalized about values
  def normalize_about(value)
    items = value.is_a?(Array) ? value : [value]
    values = items.filter_map { |item| normalize_about_item(item) }
    values unless values.empty?
  end

  # @param item [Object] single about item
  # @return [String, Hash, nil] the String itself, `{ name: ... }` for a named Hash, else nil
  def normalize_about_item(item)
    case item
    when Hash
      name = item[:name]
      { name: name.to_s } if name
    when String then item
    end
  end

  # Returns a String for a single usable value and an Array when a repeated
  # property yields several usable values.
  #
  # @param value [Object] scalar or array candidate
  # @return [String, Array<String>, nil] normalized scalar or string array
  def string_or_array(value)
    return stringify(value) unless value.is_a?(Array)

    result = string_values(value)
    result.size > 1 ? result : result.first
  end

  # @param values [Array<Object>] value candidates; each may be a scalar or an Array
  # @return [Array<String>, nil] normalized unique string values
  def array_value(*values)
    result = values.flat_map { |value| string_values(value.is_a?(Array) ? value : [value]) }.uniq
    result unless result.empty?
  end

  # @param values [Array<Object>] candidate scalar values collected from microdata arrays
  # @return [Array<String>] normalized string values
  def string_values(values)
    values.filter_map { |value| stringify(value) }
  end

  # @param values [Array<Object>] value candidates, probed in order
  # @return [String, nil] first present string-like value
  def first_string(*values)
    values.each do |value|
      candidate = stringify(unwrap(value))
      return candidate if present?(candidate)
    end

    nil
  end

  # @param value [Object] nested container or scalar
  # @param keys [Array<Symbol>] nested keys to probe in order
  # @return [Object, nil] the unwrapped scalar, or the first present nested value of a Hash
  def extract_nested_value(value, *keys)
    candidate = unwrap(value)
    return candidate unless candidate.is_a?(Hash)

    keys.each do |key|
      nested_value = candidate[key]
      return nested_value if present?(nested_value)
    end

    nil
  end

  # @param value [Object] scalar or array candidate
  # @return [Object] first array element or the original value
  def unwrap(value)
    value.is_a?(Array) ? value.first : value
  end

  # @param value [Object] scalar candidate normalized to string output
  # @return [String, nil] the value as a String; nil for blank values and containers
  def stringify(value)
    return unless present?(value)
    return value if value.is_a?(String)
    return if value.is_a?(Hash) || value.is_a?(Array)

    value.to_s
  end

  # @param value [Object] candidate value
  # @return [Boolean] whether value is present
  def present?(value)
    case value
    when nil then false
    when String then !value.strip.empty?
    when Array, Hash then !value.empty?
    else true
    end
  end
end
372
+ private_constant :ValueNormalizer
373
+
374
# Normalizes raw Microdata properties into the schema-like shape used downstream.
#
# Each helper removes the keys it recognises from the parsed properties Hash
# (via `delete`) and returns its slice of the resulting object.
module SchemaObjectBuilder
  module_function

  extend ValueNormalizer

  # @param root [Nokogiri::XML::Element] supported microdata root node
  # @return [Hash{Symbol => Object}, nil] compact schema-like object, nil without a supported type
  def call(root)
    type = Microdata.supported_type_name(root)
    compact_object(type, root, ItemParser.call(root)) if type
  end

  # Assembles base attributes plus category fields and drops nil entries.
  #
  # @param type [String] schema type inferred from itemtype
  # @param root [Nokogiri::XML::Element] supported microdata root node
  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [Hash{Symbol => Object}] normalized schema-like object
  def compact_object(type, root, properties)
    base_attributes(type, root, properties)
      .tap { |attributes| merge_categories!(attributes, properties) }
      .compact
  end

  # @param type [String] schema type inferred from itemtype
  # @param root [Nokogiri::XML::Element] supported microdata root node
  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [Hash{Symbol => Object}] base schema attributes before category merging
  def base_attributes(type, root, properties)
    identifier = first_string(root['itemid'], properties.delete(:identifier))

    attributes = { '@type': type, '@id': identifier }
    attributes.update(text_attributes(properties))
    attributes.update(link_attributes(properties, identifier))
    attributes.update(media_attributes(properties))
    attributes
  end

  # Title preference: headline, then title, then name.
  #
  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [String, nil] normalized title
  def title(properties)
    candidates = %i[headline title name].map { |key| properties.delete(key) }
    first_string(*candidates)
  end

  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [Hash{Symbol => Object}] normalized text attributes
  def text_attributes(properties)
    heading = title(properties)
    summary = first_string(properties.delete(:description))
    body = first_string(properties.delete(:articleBody))
    abstract = first_string(properties.delete(:abstract))
    published = published_at(properties)

    { title: heading, description: summary, schema_object_body: body, abstract: abstract, datePublished: published }
  end

  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @param identifier [String, nil] identifier candidate for fallback URL handling
  # @return [Hash{Symbol => Object}] normalized link attributes
  def link_attributes(properties, identifier)
    resolved = url(properties, identifier)
    { url: resolved }
  end

  # Image preference: image, then thumbnailUrl.
  #
  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [Hash{Symbol => Object}] normalized media attributes
  def media_attributes(properties)
    candidates = %i[image thumbnailUrl].map { |key| properties.delete(key) }
    { image: image_value(*candidates) }
  end

  # URL preference: url, then mainEntityOfPage, then a URL-like identifier.
  #
  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @param fallback_id [String, nil] identifier candidate for fallback URL handling
  # @return [String, nil] normalized URL candidate
  def url(properties, fallback_id)
    explicit = properties.delete(:url)
    main_entity = properties.delete(:mainEntityOfPage)

    url_value(explicit, main_entity, url_fallback(fallback_id))
  end

  # Accepts the identifier only when it starts with `/` or an http(s) scheme.
  #
  # @param fallback_id [String, nil] identifier candidate for fallback URL handling
  # @return [String, nil] fallback URL candidate when identifier looks URL-like
  def url_fallback(fallback_id)
    value = first_string(fallback_id)
    value if value&.start_with?('/') || value&.match?(%r{\Ahttps?://})
  end

  # Date preference: datePublished, dateCreated, dateModified, uploadDate.
  #
  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [String, nil] normalized published-at value
  def published_at(properties)
    candidates = %i[datePublished dateCreated dateModified uploadDate].map { |key| properties.delete(key) }
    first_string(*candidates)
  end

  # Copies category-like fields onto the object when they carry a value.
  #
  # @param object [Hash{Symbol => Object}] schema-like output object
  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [void]
  def merge_categories!(object, properties)
    listed = properties.delete(:categories)
    sections = properties.delete(:articleSection)
    assign_if_present(object, :categories, array_value(listed, sections))

    %i[keywords tags].each do |key|
      assign_if_present(object, key, string_or_array(properties.delete(key)))
    end

    assign_if_present(object, :about, normalize_about(properties.delete(:about)))
  end

  # @param object [Hash{Symbol => Object}] schema-like output object
  # @param key [Symbol] target attribute key
  # @param value [Object] value to assign when present
  # @return [void]
  def assign_if_present(object, key, value)
    return unless value

    object[key] = value
  end
end
501
+ private_constant :SchemaObjectBuilder
502
+ end
503
+ end
504
+ end
505
+ end
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class AutoSource
5
+ module Scraper
6
+ class Schema
7
##
# Extracts categories from Schema.org structured data.
#
# Category names are stripped of surrounding whitespace and blank names are
# dropped, regardless of which field they came from.
module CategoryExtractor
  ##
  # Extracts categories from a schema object.
  #
  # @param schema_object [Hash] The schema object
  # @return [Array<String>] Array of unique category strings, field categories first
  def self.call(schema_object)
    field_categories = extract_field_categories(schema_object)
    about_categories = extract_about_categories(schema_object)

    (field_categories | about_categories).to_a
  end

  ##
  # Extracts categories from keywords, categories, and tags fields.
  #
  # @param schema_object [Hash] The schema object
  # @return [Set<String>] Set of category strings
  def self.extract_field_categories(schema_object)
    %w[keywords categories tags].each_with_object(Set.new) do |field, categories|
      categories.merge(extract_field_value(schema_object, field))
    end
  end

  ##
  # Extracts categories from the about field.
  #
  # Accepts an Array of items, a single object (Hash with a +:name+), or a
  # delimiter-separated String.
  #
  # @param schema_object [Hash] The schema object
  # @return [Set<String>] Set of category strings
  def self.extract_about_categories(schema_object)
    about = schema_object[:about]

    case about
    when Array then extract_about_array(about)
    when Hash then extract_about_array([about]) # single object instead of a list
    when String then extract_string_categories(about)
    else Set.new
    end
  end

  ##
  # Extracts categories from a single field value.
  #
  # Array entries that are Hashes contribute their +:name+; other entries are
  # converted with +to_s+.
  #
  # @param schema_object [Hash] The schema object
  # @param field [String] The field name
  # @return [Set<String>] Set of category strings
  def self.extract_field_value(schema_object, field)
    value = schema_object[field.to_sym]

    case value
    when Array
      Set.new(value.filter_map { |item| clean(item.is_a?(Hash) ? item[:name] : item) })
    when String then extract_string_categories(value)
    else Set.new
    end
  end

  ##
  # Extracts categories from an about array.
  #
  # Only Strings and Hashes with a +:name+ contribute; other entries are ignored.
  #
  # @param about [Array] The about array
  # @return [Set<String>] Set of category strings
  def self.extract_about_array(about)
    names = about.filter_map do |item|
      case item
      when Hash then clean(item[:name])
      when String then clean(item)
      end
    end

    Set.new(names)
  end

  ##
  # Extracts categories from a string by splitting on separators.
  #
  # @param string [String] source string that may contain `,`, `;` or `|` delimiters
  # @return [Set<String>] Set of category strings
  def self.extract_string_categories(string)
    Set.new(string.split(/[,;|]/).filter_map { |part| clean(part) })
  end

  ##
  # Normalizes one category candidate.
  #
  # @param value [Object] raw category candidate
  # @return [String, nil] stripped string, nil when blank
  def self.clean(value)
    text = value.to_s.strip
    text unless text.empty?
  end
  private_class_method :clean
end
99
+ end
100
+ end
101
+ end
102
+ end
@@ -11,18 +11,19 @@ module Html2rss
11
11
  #
12
12
  # @see https://schema.org/ItemList
13
13
  class ItemList < Thing
14
+ # Schema.org type names handled by the ItemList extractor.
14
15
  SUPPORTED_TYPES = Set['ItemList']
15
16
 
16
17
  # @return [Array<Hash>] the scraped article hashes with DEFAULT_ATTRIBUTES
17
18
  def call
18
19
  hashes = [super]
19
20
 
20
- return hashes if (elements = @schema_object[:itemListElement]).nil?
21
+ return hashes unless (elements = @schema_object[:itemListElement])
21
22
 
22
23
  elements = [elements] unless elements.is_a?(Array)
23
24
 
24
25
  elements.each do |schema_object|
25
- hashes << ListItem.new(schema_object, url: @url).call
26
+ hashes << ListItem.new(schema_object, url: base_url || '').call
26
27
  end
27
28
 
28
29
  hashes
@@ -5,18 +5,20 @@ module Html2rss
5
5
  module Scraper
6
6
  class Schema
7
7
  ##
8
- #
9
8
  # @see https://schema.org/ListItem
10
9
  class ListItem < Thing
10
+ # @return [String, nil] stable list-item identifier
11
11
  def id = (id = (schema_object.dig(:item, :@id) || super).to_s).empty? ? nil : id
12
- def title = schema_object.dig(:item, :name) || super || (url ? Utils.titleized_url(url) : nil)
12
+ # @return [String, nil] list-item title
13
+ def title = schema_object.dig(:item, :name) || super || url&.titleized
14
+ # @return [String, nil] list-item description
13
15
  def description = schema_object.dig(:item, :description) || super
14
16
 
15
- # @return [Addressable::URI, nil]
17
+ # @return [Html2rss::Url, nil]
16
18
  def url
17
19
  url = schema_object.dig(:item, :url) || super
18
20
 
19
- Utils.build_absolute_url_from_relative(url, @url) if url
21
+ Url.from_relative(url, base_url || url) if url
20
22
  end
21
23
  end
22
24
  end