html2rss 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +48 -656
  3. data/exe/html2rss +1 -1
  4. data/html2rss.gemspec +5 -2
  5. data/lib/html2rss/articles/deduplicator.rb +49 -0
  6. data/lib/html2rss/auto_source/cleanup.rb +33 -5
  7. data/lib/html2rss/auto_source/scraper/html.rb +118 -43
  8. data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
  9. data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
  10. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
  11. data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
  12. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
  13. data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
  14. data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
  15. data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
  16. data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
  17. data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
  18. data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
  19. data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
  20. data/lib/html2rss/auto_source/scraper.rb +142 -8
  21. data/lib/html2rss/auto_source.rb +119 -47
  22. data/lib/html2rss/blocked_surface.rb +64 -0
  23. data/lib/html2rss/category_extractor.rb +82 -0
  24. data/lib/html2rss/cli.rb +170 -23
  25. data/lib/html2rss/config/class_methods.rb +189 -0
  26. data/lib/html2rss/config/dynamic_params.rb +68 -0
  27. data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
  28. data/lib/html2rss/config/request_headers.rb +130 -0
  29. data/lib/html2rss/config/schema.rb +208 -0
  30. data/lib/html2rss/config/validator.rb +108 -0
  31. data/lib/html2rss/config.rb +112 -61
  32. data/lib/html2rss/error.rb +6 -0
  33. data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
  34. data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
  35. data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
  36. data/lib/html2rss/html_extractor.rb +136 -0
  37. data/lib/html2rss/html_navigator.rb +46 -0
  38. data/lib/html2rss/json_feed_builder/item.rb +94 -0
  39. data/lib/html2rss/json_feed_builder.rb +58 -0
  40. data/lib/html2rss/rendering/audio_renderer.rb +31 -0
  41. data/lib/html2rss/rendering/description_builder.rb +88 -0
  42. data/lib/html2rss/rendering/image_renderer.rb +31 -0
  43. data/lib/html2rss/rendering/media_renderer.rb +33 -0
  44. data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
  45. data/lib/html2rss/rendering/video_renderer.rb +31 -0
  46. data/lib/html2rss/rendering.rb +14 -0
  47. data/lib/html2rss/request_controls.rb +128 -0
  48. data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
  49. data/lib/html2rss/request_service/budget.rb +39 -0
  50. data/lib/html2rss/request_service/context.rb +64 -20
  51. data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
  52. data/lib/html2rss/request_service/policy.rb +248 -0
  53. data/lib/html2rss/request_service/puppet_commander.rb +212 -13
  54. data/lib/html2rss/request_service/response.rb +42 -2
  55. data/lib/html2rss/request_service/response_guard.rb +62 -0
  56. data/lib/html2rss/request_service.rb +31 -15
  57. data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
  58. data/lib/html2rss/request_session/runtime_input.rb +57 -0
  59. data/lib/html2rss/request_session/runtime_policy.rb +76 -0
  60. data/lib/html2rss/request_session.rb +118 -0
  61. data/lib/html2rss/rss_builder/article.rb +166 -0
  62. data/lib/html2rss/rss_builder/channel.rb +96 -11
  63. data/lib/html2rss/rss_builder/enclosure.rb +48 -0
  64. data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
  65. data/lib/html2rss/rss_builder.rb +72 -71
  66. data/lib/html2rss/selectors/config.rb +122 -0
  67. data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
  68. data/lib/html2rss/selectors/extractors/href.rb +53 -0
  69. data/lib/html2rss/selectors/extractors/html.rb +48 -0
  70. data/lib/html2rss/selectors/extractors/static.rb +41 -0
  71. data/lib/html2rss/selectors/extractors/text.rb +46 -0
  72. data/lib/html2rss/selectors/extractors.rb +52 -0
  73. data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
  74. data/lib/html2rss/selectors/post_processors/base.rb +74 -0
  75. data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
  76. data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
  77. data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
  78. data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
  79. data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
  80. data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
  81. data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
  82. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
  83. data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
  84. data/lib/html2rss/selectors/post_processors/template.rb +73 -0
  85. data/lib/html2rss/selectors/post_processors.rb +43 -0
  86. data/lib/html2rss/selectors.rb +294 -0
  87. data/lib/html2rss/url.rb +262 -0
  88. data/lib/html2rss/version.rb +1 -1
  89. data/lib/html2rss.rb +129 -70
  90. data/lib/tasks/config_schema.rake +17 -0
  91. data/schema/html2rss-config.schema.json +469 -0
  92. metadata +115 -38
  93. data/lib/html2rss/attribute_post_processors/base.rb +0 -74
  94. data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
  95. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
  96. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
  97. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
  98. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
  99. data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
  100. data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
  101. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
  102. data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
  103. data/lib/html2rss/attribute_post_processors/template.rb +0 -101
  104. data/lib/html2rss/attribute_post_processors.rb +0 -44
  105. data/lib/html2rss/auto_source/article.rb +0 -127
  106. data/lib/html2rss/auto_source/channel.rb +0 -78
  107. data/lib/html2rss/auto_source/reducer.rb +0 -48
  108. data/lib/html2rss/auto_source/rss_builder.rb +0 -70
  109. data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
  110. data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
  111. data/lib/html2rss/config/channel.rb +0 -125
  112. data/lib/html2rss/config/selectors.rb +0 -103
  113. data/lib/html2rss/item.rb +0 -186
  114. data/lib/html2rss/item_extractors/attribute.rb +0 -50
  115. data/lib/html2rss/item_extractors/href.rb +0 -52
  116. data/lib/html2rss/item_extractors/html.rb +0 -46
  117. data/lib/html2rss/item_extractors/static.rb +0 -39
  118. data/lib/html2rss/item_extractors/text.rb +0 -44
  119. data/lib/html2rss/item_extractors.rb +0 -88
  120. data/lib/html2rss/object_to_xml_converter.rb +0 -56
  121. data/lib/html2rss/rss_builder/item.rb +0 -83
  122. data/lib/html2rss/utils.rb +0 -113
@@ -0,0 +1,399 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class AutoSource
5
+ module Scraper
6
+ # Scrapes Schema.org Microdata items embedded directly in HTML markup.
7
+ class Microdata
8
+ include Enumerable
9
+
10
+ ITEM_SELECTOR = '[itemscope][itemtype]'
11
+ SUPPORTED_TYPES = (Schema::Thing::SUPPORTED_TYPES | Set['Product']).freeze
12
+ VALUE_ATTRIBUTES = %w[content datetime href src data value].freeze
13
+
14
+ def self.options_key = :microdata
15
+
16
class << self
  # Reports whether the document holds at least one supported top-level item.
  #
  # @param parsed_body [#css, nil] the parsed document to probe
  # @return [Boolean]
  def articles?(parsed_body)
    supported_roots(parsed_body).any?
  end

  # Lists the item nodes that this scraper is able to turn into articles.
  #
  # @param parsed_body [#css, nil] the parsed document to search
  # @return [Array] matching nodes; empty when no document was given
  def supported_roots(parsed_body)
    return [] unless parsed_body

    candidates = parsed_body.css(ITEM_SELECTOR)
    candidates.select { |node| supported_root?(node) }
  end

  # A root qualifies when it carries a supported type and is not nested as a property.
  #
  # @param node [#[], #attribute, #ancestors] an element matching ITEM_SELECTOR
  # @return [Boolean, nil] nil when the node has no supported type
  def supported_root?(node)
    type_name = supported_type_name(node)
    type_name && top_level_item?(node)
  end

  # Picks the first type listed in the node's itemtype that is in SUPPORTED_TYPES.
  #
  # @param node [#[]] an element exposing its attributes via #[]
  # @return [String, nil]
  def supported_type_name(node)
    normalized_types(node['itemtype']).find { |type_name| SUPPORTED_TYPES.include?(type_name) }
  end

  # Reduces each whitespace-separated itemtype IRI to its trailing name, i.e. the
  # part after the last '/' and, within that, after the last '#'.
  #
  # @param itemtype [String, nil] raw itemtype attribute value
  # @return [Array<String>] non-empty type names in document order
  def normalized_types(itemtype)
    names = itemtype.to_s.split.map do |iri|
      path_tail = iri.split('/').last.to_s
      path_tail.split('#').last.to_s
    end

    names.reject(&:empty?)
  end

  # An item is top-level when it is not itself an itemprop and no ancestor is
  # an itemscope that is also an itemprop.
  #
  # @param node [#attribute, #ancestors]
  # @return [Boolean]
  def top_level_item?(node)
    !node.attribute('itemprop') &&
      node.ancestors.none? { |parent| parent.attribute('itemscope') && parent.attribute('itemprop') }
  end
end
48
+
49
+ ##
50
+ # Builds a Microdata scraper for an already parsed response body.
51
+ #
52
+ # @param parsed_body [Nokogiri::HTML5::Document, Nokogiri::HTML4::Document, Nokogiri::XML::Node, nil]
53
+ # the parsed response body to inspect for top-level Microdata items.
54
+ # @param url [Html2rss::Url] the absolute page URL used to resolve relative links.
55
+ # @param _opts [Hash] unused scraper-specific options.
56
+ # @return [void]
57
def initialize(parsed_body, url:, **_opts)
  # Extra scraper options are accepted for interface parity and ignored.
  @parsed_body, @url = parsed_body, url
end
61
+
62
+ ##
63
+ # Iterates over normalized article hashes extracted from supported Microdata roots.
64
+ #
65
+ # @yieldparam article [Hash<Symbol, Object>] the normalized article attributes.
66
+ # @return [Enumerator, void] an enumerator when no block is given.
67
def each(&block)
  return to_enum(:each) unless block

  roots = self.class.supported_roots(parsed_body)
  roots.each do |root|
    # Roots that do not produce a valid article are skipped silently.
    candidate = article_from(root)
    block.call(candidate) if candidate
  end
end
75
+
76
+ private
77
+
78
+ attr_reader :parsed_body, :url
79
+
80
# Converts one Microdata root into a normalized article hash.
#
# @param root [Object] a supported top-level item node
# @return [Hash, nil] nil when no schema object could be built or the article is invalid
def article_from(root)
  attributes = SchemaObjectBuilder.call(root)
  return unless attributes

  candidate = Schema::Thing.new(attributes, url: url).call.compact
  candidate if valid_article?(candidate)
end
89
+
90
# An article is usable only when it has a URL plus a title or a description.
#
# @param article [Hash<Symbol, Object>]
# @return [Object, false, nil] the title or description when valid; false without a URL
def valid_article?(article)
  article[:url] ? (article[:title] || article[:description]) : false
end
95
+
96
+ # Extracts direct Microdata itemprop values for a single item root.
97
module ItemParser
  module_function

  # Collects the direct properties of +root+ into a Hash keyed by property name.
  # A property that occurs once maps to its value; a repeated property maps to
  # an Array of values in document order.
  #
  # @param root [#css] the item element whose properties are read
  # @return [Hash<Symbol, Object>]
  def call(root)
    {}.tap do |properties|
      direct_properties(root).each { append_properties!(properties, _1) }
    end
  end

  # Stores the value of +node+ under every name listed in its itemprop attribute.
  #
  # @param properties [Hash<Symbol, Object>] accumulator, mutated in place
  # @param node [Object] an element carrying an itemprop attribute
  # @return [void]
  def append_properties!(properties, node)
    value = property_value(node)
    return if blank_value?(value)

    property_names(node).each do |name|
      append(properties, name.to_sym, value)
    end
  end

  # @param root [#css]
  # @return [Array] itemprop elements that belong to +root+ itself
  def direct_properties(root)
    root.css('[itemprop]').select { direct_property?(root, _1) }
  end

  # A property is direct when no itemscope element sits between it and +root+.
  #
  # @return [Boolean]
  def direct_property?(root, node)
    return false if node == root

    node.ancestors.take_while { _1 != root }.none? { |ancestor| ancestor.attribute('itemscope') }
  end

  # @param node [#[]]
  # @return [Array<String>] the whitespace-separated names in the itemprop attribute
  def property_names(node)
    # String#split without arguments already trims and drops empty tokens.
    node['itemprop'].to_s.split
  end

  # Resolves the value of a property element: a nested item Hash when the element
  # is itself an itemscope, otherwise the first value attribute or the text content.
  #
  # @return [Hash, String, nil] nil when the value is blank
  def property_value(node)
    value = if node.attribute('itemscope')
              nested_item(node)
            else
              attribute_value(node) || text_value(node)
            end

    value unless blank_value?(value)
  end

  # Parses a nested item and tags it with :@type / :@id taken from the element.
  #
  # @return [Hash<Symbol, Object>]
  def nested_item(node)
    item = call(node)
    itemtype = node['itemtype']
    itemid = node['itemid']
    item[:@type] = Microdata.normalized_types(itemtype).first if itemtype
    item[:@id] = itemid if present?(itemid)
    item
  end

  # @return [String, nil] the first non-blank attribute among VALUE_ATTRIBUTES
  def attribute_value(node)
    VALUE_ATTRIBUTES.each do |attribute|
      value = node[attribute]
      return value if present?(value)
    end

    nil
  end

  # @return [String, nil] stripped text content, nil when empty
  def text_value(node)
    value = node.text.to_s.strip
    value unless value.empty?
  end

  # Adds +value+ under +key+, promoting the entry to an Array on the second value.
  #
  # The existing entry is wrapped explicitly instead of going through Kernel#Array:
  # Array(hash) turns a Hash into [key, value] pairs, which would corrupt a nested
  # item as soon as the same property appears a second time.
  #
  # @param properties [Hash<Symbol, Object>] accumulator, mutated in place
  # @param key [Symbol]
  # @param value [Object] ignored when blank
  # @return [void]
  def append(properties, key, value)
    return if blank_value?(value)

    unless properties.key?(key)
      properties[key] = value
      return
    end

    existing = properties[key]
    properties[key] = (existing.is_a?(Array) ? existing : [existing]) << value
  end

  # @return [Boolean] true for nil, whitespace-only Strings, and empty Arrays/Hashes
  def blank_value?(value)
    case value
    when nil then true
    when String then value.strip.empty?
    when Array, Hash then value.empty?
    else false
    end
  end

  # @return [Boolean] the negation of {blank_value?}
  def present?(value)
    !blank_value?(value)
  end
end
189
+ private_constant :ItemParser
190
+
191
+ # Shared value normalization helpers for Microdata property conversion.
192
module ValueNormalizer
  module_function

  # Returns the first usable URL among +values+. Hash candidates are resolved
  # through their :url, then :@id entry.
  #
  # @return [String, nil]
  def url_value(*values)
    values.each do |value|
      candidate = extract_nested_value(value, :url, :@id)
      return candidate.to_s if present?(candidate)
    end

    nil
  end

  # Returns the first usable image among +values+.
  #
  # @return [String, Hash, nil]
  def image_value(*values)
    values.each do |value|
      candidate = normalize_image(value)
      return candidate if present?(candidate)
    end

    nil
  end

  # Strings and Hashes pass through unchanged; other values are stringified.
  #
  # @return [String, Hash, nil]
  def normalize_image(value)
    candidate = unwrap(value)
    return unless present?(candidate)

    return candidate if candidate.is_a?(String) || candidate.is_a?(Hash)

    candidate.to_s
  end

  # Normalizes every `about` entry, not only the first one, so repeated
  # itemprops all survive.
  #
  # @param value [Array, Hash, String, nil]
  # @return [Array<Hash, String>, nil] nil when nothing usable remains
  def normalize_about(value)
    values = listify(value).filter_map { normalize_about_item(_1) }
    values unless values.empty?
  end

  # @return [Hash, String, nil] `{ name: ... }` for named Hashes, the String itself, else nil
  def normalize_about_item(item)
    case item
    when Hash
      name = item[:name]
      { name: name.to_s } if name
    when String then item
    end
  end

  # Returns a single String for one value and an Array for several, so that
  # repeated properties (e.g. multiple keywords) are not truncated to the first.
  #
  # @param value [Array, Object, nil]
  # @return [String, Array<String>, nil]
  def string_or_array(value)
    strings = string_values(listify(value)).uniq
    return if strings.empty?

    strings.one? ? strings.first : strings
  end

  # Flattens all +values+ into one de-duplicated list of Strings.
  #
  # @return [Array<String>, nil] nil when no String could be derived
  def array_value(*values)
    result = values.flat_map { string_values(listify(_1)) }.uniq
    result unless result.empty?
  end

  # @param values [Array]
  # @return [Array<String>] entries that {stringify} accepts
  def string_values(values)
    values.filter_map { stringify(_1) }
  end

  # Returns the first non-blank String among +values+; Arrays contribute their first element.
  #
  # @return [String, nil]
  def first_string(*values)
    values.each do |value|
      candidate = stringify(unwrap(value))
      return candidate if present?(candidate)
    end

    nil
  end

  # For Hash values returns the first present entry among +keys+; other values
  # are returned as-is.
  #
  # @return [Object, nil]
  def extract_nested_value(value, *keys)
    candidate = unwrap(value)
    return candidate unless candidate.is_a?(Hash)

    keys.each do |key|
      nested_value = candidate[key]
      return nested_value if present?(nested_value)
    end

    nil
  end

  # Reduces a multi-valued property to its first value. Only meant for
  # single-valued fields (title, url, image, dates); list-like fields go
  # through {listify} instead.
  def unwrap(value)
    value.is_a?(Array) ? value.first : value
  end

  # Coerces a property value into a flat Array without exploding Hashes
  # (Kernel#Array would turn a Hash into [key, value] pairs).
  #
  # @return [Array]
  def listify(value)
    case value
    when nil then []
    when Array then value.flatten
    else [value]
    end
  end

  # @return [String, nil] nil for blank values and for Hashes/Arrays
  def stringify(value)
    return unless present?(value)
    return value if value.is_a?(String)
    return if value.is_a?(Hash) || value.is_a?(Array)

    value.to_s
  end

  # @return [Boolean] false for nil, whitespace-only Strings, and empty Arrays/Hashes
  def present?(value)
    case value
    when nil then false
    when String then !value.strip.empty?
    when Array, Hash then !value.empty?
    else true
    end
  end
end
299
+ private_constant :ValueNormalizer
300
+
301
+ # Normalizes raw Microdata properties into the schema-like shape used downstream.
302
module SchemaObjectBuilder
  module_function

  extend ValueNormalizer

  TITLE_KEYS = %i[headline title name].freeze
  DATE_KEYS = %i[datePublished dateCreated dateModified uploadDate].freeze
  private_constant :TITLE_KEYS, :DATE_KEYS

  # Builds the schema-like Hash for a Microdata root.
  #
  # @param root [Object] the item element
  # @return [Hash<Symbol, Object>, nil] nil when the root has no supported type
  def call(root)
    type = Microdata.supported_type_name(root)
    compact_object(type, root, ItemParser.call(root)) if type
  end

  # Assembles all attributes and drops the nil entries.
  def compact_object(type, root, properties)
    base_attributes(type, root, properties).tap do |object|
      merge_categories!(object, properties)
    end.compact
  end

  # Identity, text, link and media attributes, in that key order.
  def base_attributes(type, root, properties)
    identifier = first_string(root['itemid'], properties.delete(:identifier))

    {
      '@type': type,
      '@id': identifier,
      **text_attributes(properties),
      **link_attributes(properties, identifier),
      **media_attributes(properties)
    }
  end

  # First non-blank value among headline, title and name; all three are consumed.
  def title(properties)
    first_string(*TITLE_KEYS.map { |key| properties.delete(key) })
  end

  def text_attributes(properties)
    {
      title: title(properties),
      description: first_string(properties.delete(:description)),
      schema_object_body: first_string(properties.delete(:articleBody)),
      abstract: first_string(properties.delete(:abstract)),
      datePublished: published_at(properties)
    }
  end

  def link_attributes(properties, identifier)
    { url: url(properties, identifier) }
  end

  def media_attributes(properties)
    image = properties.delete(:image)
    thumbnail = properties.delete(:thumbnailUrl)

    { image: image_value(image, thumbnail) }
  end

  # Prefers url, then mainEntityOfPage, then an identifier that looks like a URL.
  def url(properties, fallback_id)
    explicit = properties.delete(:url)
    main_entity = properties.delete(:mainEntityOfPage)

    url_value(explicit, main_entity, url_fallback(fallback_id))
  end

  # Accepts the identifier only when it is a root-relative path or an http(s) URL.
  def url_fallback(fallback_id)
    candidate = first_string(fallback_id)
    candidate if candidate && (candidate.start_with?('/') || candidate.match?(%r{\Ahttps?://}))
  end

  # First non-blank date among the supported date properties; all are consumed.
  def published_at(properties)
    first_string(*DATE_KEYS.map { |key| properties.delete(key) })
  end

  # Copies category-like properties onto +object+ when they carry a value.
  def merge_categories!(object, properties)
    sections = array_value(properties.delete(:categories), properties.delete(:articleSection))
    assign_if_present(object, :categories, sections)

    %i[keywords tags].each do |key|
      assign_if_present(object, key, string_or_array(properties.delete(key)))
    end

    assign_if_present(object, :about, normalize_about(properties.delete(:about)))
  end

  def assign_if_present(object, key, value)
    return unless value

    object[key] = value
  end
end
395
+ private_constant :SchemaObjectBuilder
396
+ end
397
+ end
398
+ end
399
+ end
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class AutoSource
5
+ module Scraper
6
+ class Schema
7
+ ##
8
+ # Extracts categories from Schema.org structured data.
9
module CategoryExtractor
  ##
  # Extracts categories from a schema object.
  #
  # @param schema_object [Hash] The schema object
  # @return [Array<String>] Array of unique category strings
  def self.call(schema_object)
    (extract_field_categories(schema_object) | extract_about_categories(schema_object)).to_a
  end

  ##
  # Extracts categories from keywords, categories, and tags fields.
  #
  # @param schema_object [Hash] The schema object
  # @return [Set<String>] Set of category strings
  def self.extract_field_categories(schema_object)
    %w[keywords categories tags].each_with_object(Set.new) do |field, categories|
      categories.merge(extract_field_value(schema_object, field))
    end
  end

  ##
  # Extracts categories from the about field.
  #
  # A single object (Hash) is handled like a one-element list, so its :name
  # is picked up as well.
  #
  # @param schema_object [Hash] The schema object
  # @return [Set<String>] Set of category strings
  def self.extract_about_categories(schema_object)
    case (about = schema_object[:about])
    when Array then extract_about_array(about)
    when Hash then extract_about_array([about])
    when String then extract_string_categories(about)
    else Set.new
    end
  end

  ##
  # Extracts categories from a single field value.
  #
  # Array entries contribute their stripped string form; Hash entries
  # contribute their :name instead of a stringified Hash.
  #
  # @param schema_object [Hash] The schema object
  # @param field [String] The field name
  # @return [Set<String>] Set of category strings
  def self.extract_field_value(schema_object, field)
    case (value = schema_object[field.to_sym])
    when Array then Set.new(value.filter_map { |item| category_name(item) })
    when String then extract_string_categories(value)
    else Set.new
    end
  end

  ##
  # Extracts categories from an about array.
  #
  # Only Hash entries (via :name) and String entries are considered.
  #
  # @param about [Array] The about array
  # @return [Set<String>] Set of category strings
  def self.extract_about_array(about)
    usable = about.select { |item| item.is_a?(Hash) || item.is_a?(String) }
    Set.new(usable.filter_map { |item| category_name(item) })
  end

  ##
  # Extracts categories from a string by splitting on separators.
  #
  # @param string [String] The string to process
  # @return [Set<String>] Set of category strings
  def self.extract_string_categories(string)
    Set.new(string.split(/[,;|]/).map(&:strip).reject(&:empty?))
  end

  ##
  # Derives one category name from a list entry.
  #
  # @param item [Object] a Hash with :name, or any value with a string form
  # @return [String, nil] the stripped name, nil when blank
  def self.category_name(item)
    raw = item.is_a?(Hash) ? item[:name] : item
    name = raw.to_s.strip
    name unless name.empty?
  end
  private_class_method :category_name
end
99
+ end
100
+ end
101
+ end
102
+ end
@@ -17,12 +17,12 @@ module Html2rss
17
17
  def call
18
18
  hashes = [super]
19
19
 
20
- return hashes if (elements = @schema_object[:itemListElement]).nil?
20
+ return hashes unless (elements = @schema_object[:itemListElement])
21
21
 
22
22
  elements = [elements] unless elements.is_a?(Array)
23
23
 
24
24
  elements.each do |schema_object|
25
- hashes << ListItem.new(schema_object, url: @url).call
25
+ hashes << ListItem.new(schema_object, url: base_url || '').call
26
26
  end
27
27
 
28
28
  hashes
@@ -9,14 +9,14 @@ module Html2rss
9
9
  # @see https://schema.org/ListItem
10
10
  class ListItem < Thing
11
11
  def id = (id = (schema_object.dig(:item, :@id) || super).to_s).empty? ? nil : id
12
- def title = schema_object.dig(:item, :name) || super || (url ? Utils.titleized_url(url) : nil)
12
+ def title = schema_object.dig(:item, :name) || super || url&.titleized
13
13
  def description = schema_object.dig(:item, :description) || super
14
14
 
15
- # @return [Addressable::URI, nil]
15
+ # @return [Html2rss::Url, nil]
16
16
  def url
17
17
  url = schema_object.dig(:item, :url) || super
18
18
 
19
- Utils.build_absolute_url_from_relative(url, @url) if url
19
+ Url.from_relative(url, base_url || url) if url
20
20
  end
21
21
  end
22
22
  end
@@ -32,11 +32,11 @@ module Html2rss
32
32
  TechArticle
33
33
  ].to_set.freeze
34
34
 
35
- DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
35
+ DEFAULT_ATTRIBUTES = %i[id title description url image published_at categories].freeze
36
36
 
37
37
  def initialize(schema_object, url:)
38
38
  @schema_object = schema_object
39
- @url = url
39
+ @base_url = normalized_base_url(url)
40
40
  end
41
41
 
42
42
  # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
@@ -49,7 +49,7 @@ module Html2rss
49
49
  def id
50
50
  return @id if defined?(@id)
51
51
 
52
- id = (schema_object[:@id] || url&.path).to_s
52
+ id = normalized_id(schema_object[:@id], reference_url: url || base_url) || url&.path.to_s
53
53
 
54
54
  return if id.empty?
55
55
 
@@ -63,7 +63,7 @@ module Html2rss
63
63
  .max_by { |string| string.to_s.size }
64
64
  end
65
65
 
66
- # @return [Addressable::URI, nil] the URL of the schema object
66
+ # @return [Html2rss::Url, nil] the URL of the schema object
67
67
  def url
68
68
  url = schema_object[:url]
69
69
  if url.to_s.empty?
@@ -71,20 +71,24 @@ module Html2rss
71
71
  return
72
72
  end
73
73
 
74
- Utils.build_absolute_url_from_relative(url, @url)
74
+ Url.from_relative(url, base_url || url)
75
75
  end
76
76
 
77
77
  def image
78
78
  if (image_url = image_urls.first)
79
- Utils.build_absolute_url_from_relative(image_url, @url)
79
+ Url.from_relative(image_url, base_url || image_url)
80
80
  end
81
81
  end
82
82
 
83
83
  def published_at = schema_object[:datePublished]
84
84
 
85
- private
85
+ def categories
86
+ return @categories if defined?(@categories)
86
87
 
87
- attr_reader :schema_object
88
+ @categories = CategoryExtractor.call(schema_object)
89
+ end
90
+
91
+ attr_reader :schema_object, :base_url
88
92
 
89
93
  def image_urls
90
94
  schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
@@ -97,6 +101,42 @@ module Html2rss
97
101
  end
98
102
  end
99
103
  end
104
+
105
+ def normalized_id(value, reference_url:)
106
+ text = value.to_s
107
+ return if text.empty?
108
+
109
+ normalized_url = normalized_id_url(text, reference_url:)
110
+ return text unless reference_url && normalized_url.host == reference_url.host
111
+
112
+ normalized_id_value(normalized_url)
113
+ rescue ArgumentError
114
+ text
115
+ end
116
+
117
+ def normalized_id_url(text, reference_url:)
118
+ if text.start_with?('/')
119
+ Url.from_relative(text, reference_url || text)
120
+ else
121
+ Url.from_absolute(text)
122
+ end
123
+ end
124
+
125
+ def normalized_id_value(url)
126
+ path = url.path.to_s
127
+ return "#{path}?#{url.query}" if (path.empty? || path == '/') && !url.query.to_s.empty?
128
+ return path unless path.empty?
129
+
130
+ url.query
131
+ end
132
+
133
+ def normalized_base_url(url)
134
+ return if url.to_s.strip.empty?
135
+
136
+ Url.from_absolute(url)
137
+ rescue ArgumentError
138
+ nil
139
+ end
100
140
  end
101
141
  end
102
142
  end
@@ -19,13 +19,16 @@ module Html2rss
19
19
 
20
20
  TAG_SELECTOR = 'script[type="application/ld+json"]'
21
21
 
22
+ def self.options_key = :schema
23
+
22
24
  class << self
23
25
  def articles?(parsed_body)
24
- parsed_body.css(TAG_SELECTOR).any? do |script|
25
- (Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES).any? do |type|
26
- script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/)
27
- end
28
- end
26
+ parsed_body.css(TAG_SELECTOR).any? { |script| supported_schema_type?(script) }
27
+ end
28
+
29
+ def supported_schema_type?(script)
30
+ supported_types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES
31
+ supported_types.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
29
32
  end
30
33
 
31
34
  ##
@@ -63,7 +66,7 @@ module Html2rss
63
66
  elsif ItemList::SUPPORTED_TYPES.member?(type)
64
67
  ItemList
65
68
  else
66
- Log.warn("Schema#scraper_for_schema_object: Unsupported schema object @type: #{type}")
69
+ Log.debug("#{name}: unsupported schema object @type=#{type.inspect}")
67
70
  nil
68
71
  end
69
72
  end
@@ -73,14 +76,15 @@ module Html2rss
73
76
  def parse_script_tag(script_tag)
74
77
  JSON.parse(script_tag.text, symbolize_names: true)
75
78
  rescue JSON::ParserError => error
76
- Log.warn('Schema#schema_objects: Failed to parse JSON', error: error.message)
79
+ Log.warn("#{name}: failed to parse JSON", error: error.message)
77
80
  []
78
81
  end
79
82
  end
80
83
 
81
- def initialize(parsed_body, url:)
84
+ def initialize(parsed_body, url:, **opts)
82
85
  @parsed_body = parsed_body
83
86
  @url = url
87
+ @opts = opts
84
88
  end
85
89
 
86
90
  ##