html2rss 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +60 -1
- data/lib/html2rss/articles/deduplicator.rb +1 -0
- data/lib/html2rss/auto_source/cleanup.rb +11 -0
- data/lib/html2rss/auto_source/scraper/html.rb +5 -0
- data/lib/html2rss/auto_source/scraper/json_state.rb +96 -16
- data/lib/html2rss/auto_source/scraper/microdata.rb +107 -1
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +1 -1
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +1 -0
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -1
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +21 -0
- data/lib/html2rss/auto_source/scraper/schema.rb +15 -4
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +5 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +4 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +60 -10
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +3 -2
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +19 -12
- data/lib/html2rss/auto_source/scraper.rb +19 -1
- data/lib/html2rss/auto_source.rb +4 -0
- data/lib/html2rss/blocked_surface.rb +1 -0
- data/lib/html2rss/category_extractor.rb +2 -2
- data/lib/html2rss/cli.rb +30 -6
- data/lib/html2rss/config/class_methods.rb +24 -35
- data/lib/html2rss/config/dynamic_params.rb +6 -4
- data/lib/html2rss/config/multiple_feeds_config.rb +3 -2
- data/lib/html2rss/config/request_headers.rb +9 -3
- data/lib/html2rss/config/schema.rb +33 -1
- data/lib/html2rss/config/validator.rb +40 -2
- data/lib/html2rss/config.rb +19 -13
- data/lib/html2rss/error.rb +25 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +1 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +9 -0
- data/lib/html2rss/html_extractor.rb +5 -0
- data/lib/html2rss/html_navigator.rb +8 -0
- data/lib/html2rss/json_feed_builder.rb +1 -0
- data/lib/html2rss/rendering/audio_renderer.rb +8 -3
- data/lib/html2rss/rendering/description_builder.rb +0 -1
- data/lib/html2rss/rendering/image_renderer.rb +17 -7
- data/lib/html2rss/rendering/media_renderer.rb +4 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +11 -5
- data/lib/html2rss/rendering/video_renderer.rb +8 -3
- data/lib/html2rss/rendering.rb +11 -2
- data/lib/html2rss/request_controls.rb +16 -21
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/context.rb +14 -2
- data/lib/html2rss/request_service/faraday_strategy.rb +6 -4
- data/lib/html2rss/request_service/policy.rb +4 -0
- data/lib/html2rss/request_service/response.rb +9 -1
- data/lib/html2rss/request_service.rb +19 -0
- data/lib/html2rss/request_session/runtime_input.rb +16 -2
- data/lib/html2rss/request_session/runtime_policy.rb +7 -0
- data/lib/html2rss/request_session.rb +13 -9
- data/lib/html2rss/rss_builder/article.rb +22 -1
- data/lib/html2rss/rss_builder/channel.rb +11 -2
- data/lib/html2rss/rss_builder/enclosure.rb +15 -1
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -0
- data/lib/html2rss/rss_builder.rb +4 -0
- data/lib/html2rss/selectors/config.rb +1 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +2 -0
- data/lib/html2rss/selectors/extractors/href.rb +2 -0
- data/lib/html2rss/selectors/extractors/html.rb +1 -0
- data/lib/html2rss/selectors/extractors/static.rb +2 -1
- data/lib/html2rss/selectors/extractors/text.rb +1 -0
- data/lib/html2rss/selectors/extractors.rb +2 -1
- data/lib/html2rss/selectors/object_to_xml_converter.rb +1 -0
- data/lib/html2rss/selectors/post_processors/base.rb +13 -7
- data/lib/html2rss/selectors/post_processors/gsub.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +3 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +9 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +6 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +3 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +5 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +3 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +5 -1
- data/lib/html2rss/selectors/post_processors/substring.rb +3 -0
- data/lib/html2rss/selectors/post_processors/template.rb +3 -0
- data/lib/html2rss/selectors/post_processors.rb +5 -0
- data/lib/html2rss/selectors.rb +7 -0
- data/lib/html2rss/url.rb +27 -23
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +15 -78
- data/schema/html2rss-config.schema.json +83 -1
- metadata +7 -2
|
@@ -7,31 +7,43 @@ module Html2rss
|
|
|
7
7
|
class Microdata
|
|
8
8
|
include Enumerable
|
|
9
9
|
|
|
10
|
+
# Selector matching nodes that define a microdata item scope.
|
|
10
11
|
ITEM_SELECTOR = '[itemscope][itemtype]'
|
|
12
|
+
# Schema.org types supported for article extraction via Microdata.
|
|
11
13
|
SUPPORTED_TYPES = (Schema::Thing::SUPPORTED_TYPES | Set['Product']).freeze
|
|
14
|
+
# Attribute names checked first for microdata property values.
|
|
12
15
|
VALUE_ATTRIBUTES = %w[content datetime href src data value].freeze
|
|
13
16
|
|
|
17
|
+
# @return [Symbol] scraper config key
|
|
14
18
|
def self.options_key = :microdata
|
|
15
19
|
|
|
16
20
|
class << self
|
|
21
|
+
# @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
|
|
17
22
|
def articles?(parsed_body)
|
|
18
23
|
supported_roots(parsed_body).any?
|
|
19
24
|
end
|
|
20
25
|
|
|
26
|
+
# @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
|
|
27
|
+
# @return [Array<Nokogiri::XML::Element>] top-level supported Microdata roots
|
|
21
28
|
def supported_roots(parsed_body)
|
|
22
29
|
return [] unless parsed_body
|
|
23
30
|
|
|
24
31
|
parsed_body.css(ITEM_SELECTOR).select { supported_root?(_1) }
|
|
25
32
|
end
|
|
26
33
|
|
|
34
|
+
# @param node [Nokogiri::XML::Element] itemscope candidate node
|
|
27
35
|
def supported_root?(node)
|
|
28
36
|
supported_type_name(node) && top_level_item?(node)
|
|
29
37
|
end
|
|
30
38
|
|
|
39
|
+
# @param node [Nokogiri::XML::Element] itemscope candidate node
|
|
40
|
+
# @return [String, nil] supported schema type name when present
|
|
31
41
|
def supported_type_name(node)
|
|
32
42
|
normalized_types(node['itemtype']).find { SUPPORTED_TYPES.include?(_1) }
|
|
33
43
|
end
|
|
34
44
|
|
|
45
|
+
# @param itemtype [String, nil] raw itemtype attribute value
|
|
46
|
+
# @return [Array<String>] normalized schema type names
|
|
35
47
|
def normalized_types(itemtype)
|
|
36
48
|
itemtype.to_s.split.filter_map do |value|
|
|
37
49
|
type = value.split('/').last.to_s.split('#').last.to_s
|
|
@@ -39,6 +51,7 @@ module Html2rss
|
|
|
39
51
|
end
|
|
40
52
|
end
|
|
41
53
|
|
|
54
|
+
# @param node [Nokogiri::XML::Element] itemscope candidate node
|
|
42
55
|
def top_level_item?(node)
|
|
43
56
|
return false if node.attribute('itemprop')
|
|
44
57
|
|
|
@@ -53,6 +66,7 @@ module Html2rss
|
|
|
53
66
|
# the parsed response body to inspect for top-level Microdata items.
|
|
54
67
|
# @param url [Html2rss::Url] the absolute page URL used to resolve relative links.
|
|
55
68
|
# @param _opts [Hash] unused scraper-specific options.
|
|
69
|
+
# @option _opts [Object] :_reserved reserved for future scraper-specific options
|
|
56
70
|
# @return [void]
|
|
57
71
|
def initialize(parsed_body, url:, **_opts)
|
|
58
72
|
@parsed_body = parsed_body
|
|
@@ -62,7 +76,7 @@ module Html2rss
|
|
|
62
76
|
##
|
|
63
77
|
# Iterates over normalized article hashes extracted from supported Microdata roots.
|
|
64
78
|
#
|
|
65
|
-
# @yieldparam article [Hash
|
|
79
|
+
# @yieldparam article [Hash{Symbol => Object}] the normalized article attributes.
|
|
66
80
|
# @return [Enumerator, void] an enumerator when no block is given.
|
|
67
81
|
def each
|
|
68
82
|
return enum_for(:each) unless block_given?
|
|
@@ -77,6 +91,8 @@ module Html2rss
|
|
|
77
91
|
|
|
78
92
|
attr_reader :parsed_body, :url
|
|
79
93
|
|
|
94
|
+
# @param root [Nokogiri::XML::Element] supported Microdata root node
|
|
95
|
+
# @return [Hash{Symbol => Object}, nil] normalized article hash
|
|
80
96
|
def article_from(root)
|
|
81
97
|
schema_object = SchemaObjectBuilder.call(root)
|
|
82
98
|
return unless schema_object
|
|
@@ -87,6 +103,8 @@ module Html2rss
|
|
|
87
103
|
article
|
|
88
104
|
end
|
|
89
105
|
|
|
106
|
+
# @param article [Hash{Symbol => Object}] normalized article hash
|
|
107
|
+
# @return [Boolean] whether article contains required fields
|
|
90
108
|
def valid_article?(article)
|
|
91
109
|
return false unless article[:url]
|
|
92
110
|
|
|
@@ -97,12 +115,17 @@ module Html2rss
|
|
|
97
115
|
module ItemParser
|
|
98
116
|
module_function
|
|
99
117
|
|
|
118
|
+
# @param root [Nokogiri::XML::Element] microdata root node
|
|
119
|
+
# @return [Hash{Symbol => Object}] extracted direct properties
|
|
100
120
|
def call(root)
|
|
101
121
|
{}.tap do |properties|
|
|
102
122
|
direct_properties(root).each { append_properties!(properties, _1) }
|
|
103
123
|
end
|
|
104
124
|
end
|
|
105
125
|
|
|
126
|
+
# @param properties [Hash{Symbol => Object}] accumulator hash for parsed properties
|
|
127
|
+
# @param node [Nokogiri::XML::Element] itemprop node
|
|
128
|
+
# @return [void]
|
|
106
129
|
def append_properties!(properties, node)
|
|
107
130
|
value = property_value(node)
|
|
108
131
|
return if blank_value?(value)
|
|
@@ -112,16 +135,23 @@ module Html2rss
|
|
|
112
135
|
end
|
|
113
136
|
end
|
|
114
137
|
|
|
138
|
+
# @param root [Nokogiri::XML::Element] microdata root node
|
|
139
|
+
# @return [Array<Nokogiri::XML::Element>] direct property nodes for the root
|
|
115
140
|
def direct_properties(root)
|
|
116
141
|
root.css('[itemprop]').select { direct_property?(root, _1) }
|
|
117
142
|
end
|
|
118
143
|
|
|
144
|
+
# @param root [Nokogiri::XML::Element] microdata root node
|
|
145
|
+
# @param node [Nokogiri::XML::Element] candidate itemprop node
|
|
146
|
+
# @return [Boolean] whether the node belongs directly to the current root item
|
|
119
147
|
def direct_property?(root, node)
|
|
120
148
|
return false if node == root
|
|
121
149
|
|
|
122
150
|
node.ancestors.take_while { _1 != root }.none? { |ancestor| ancestor.attribute('itemscope') }
|
|
123
151
|
end
|
|
124
152
|
|
|
153
|
+
# @param node [Nokogiri::XML::Element] itemprop node
|
|
154
|
+
# @return [Array<String>] normalized property names
|
|
125
155
|
def property_names(node)
|
|
126
156
|
node['itemprop'].to_s.split.filter_map do |name|
|
|
127
157
|
stripped = name.strip
|
|
@@ -129,6 +159,8 @@ module Html2rss
|
|
|
129
159
|
end
|
|
130
160
|
end
|
|
131
161
|
|
|
162
|
+
# @param node [Nokogiri::XML::Element] itemprop node
|
|
163
|
+
# @return [Object, nil] parsed property value
|
|
132
164
|
def property_value(node)
|
|
133
165
|
value = if node.attribute('itemscope')
|
|
134
166
|
nested_item(node)
|
|
@@ -139,6 +171,8 @@ module Html2rss
|
|
|
139
171
|
value unless blank_value?(value)
|
|
140
172
|
end
|
|
141
173
|
|
|
174
|
+
# @param node [Nokogiri::XML::Element] nested itemscope node
|
|
175
|
+
# @return [Hash{Symbol => Object}] nested parsed microdata item
|
|
142
176
|
def nested_item(node)
|
|
143
177
|
item = call(node)
|
|
144
178
|
itemtype = node['itemtype']
|
|
@@ -148,6 +182,8 @@ module Html2rss
|
|
|
148
182
|
item
|
|
149
183
|
end
|
|
150
184
|
|
|
185
|
+
# @param node [Nokogiri::XML::Element] itemprop node
|
|
186
|
+
# @return [String, nil] first present attribute value
|
|
151
187
|
def attribute_value(node)
|
|
152
188
|
VALUE_ATTRIBUTES.each do |attribute|
|
|
153
189
|
value = node[attribute]
|
|
@@ -157,11 +193,17 @@ module Html2rss
|
|
|
157
193
|
nil
|
|
158
194
|
end
|
|
159
195
|
|
|
196
|
+
# @param node [Nokogiri::XML::Element] itemprop node
|
|
197
|
+
# @return [String, nil] normalized text content
|
|
160
198
|
def text_value(node)
|
|
161
199
|
value = node.text.to_s.strip
|
|
162
200
|
value unless value.empty?
|
|
163
201
|
end
|
|
164
202
|
|
|
203
|
+
# @param properties [Hash{Symbol => Object}] accumulator hash for parsed properties
|
|
204
|
+
# @param key [Symbol] target property key
|
|
205
|
+
# @param value [Object] parsed property value to assign for the key
|
|
206
|
+
# @return [void]
|
|
165
207
|
def append(properties, key, value)
|
|
166
208
|
return if blank_value?(value)
|
|
167
209
|
|
|
@@ -173,6 +215,8 @@ module Html2rss
|
|
|
173
215
|
properties[key] = Array(properties[key]) << value
|
|
174
216
|
end
|
|
175
217
|
|
|
218
|
+
# @param value [Object] candidate value
|
|
219
|
+
# @return [Boolean] whether value is blank for microdata extraction purposes
|
|
176
220
|
def blank_value?(value)
|
|
177
221
|
case value
|
|
178
222
|
when nil then true
|
|
@@ -182,6 +226,8 @@ module Html2rss
|
|
|
182
226
|
end
|
|
183
227
|
end
|
|
184
228
|
|
|
229
|
+
# @param value [Object] candidate value
|
|
230
|
+
# @return [Boolean] whether value is present for microdata extraction purposes
|
|
185
231
|
def present?(value)
|
|
186
232
|
!blank_value?(value)
|
|
187
233
|
end
|
|
@@ -192,6 +238,8 @@ module Html2rss
|
|
|
192
238
|
module ValueNormalizer
|
|
193
239
|
module_function
|
|
194
240
|
|
|
241
|
+
# @param values [Array<Object>] value candidates
|
|
242
|
+
# @return [String, nil] first URL-like value converted to string
|
|
195
243
|
def url_value(*values)
|
|
196
244
|
values.each do |value|
|
|
197
245
|
candidate = extract_nested_value(value, :url, :@id)
|
|
@@ -201,6 +249,8 @@ module Html2rss
|
|
|
201
249
|
nil
|
|
202
250
|
end
|
|
203
251
|
|
|
252
|
+
# @param values [Array<Object>] value candidates
|
|
253
|
+
# @return [String, Hash, nil] first normalized image candidate
|
|
204
254
|
def image_value(*values)
|
|
205
255
|
values.each do |value|
|
|
206
256
|
candidate = normalize_image(value)
|
|
@@ -210,6 +260,8 @@ module Html2rss
|
|
|
210
260
|
nil
|
|
211
261
|
end
|
|
212
262
|
|
|
263
|
+
# @param value [Object] image candidate value
|
|
264
|
+
# @return [String, Hash, nil] normalized image-like value
|
|
213
265
|
def normalize_image(value)
|
|
214
266
|
candidate = unwrap(value)
|
|
215
267
|
return unless present?(candidate)
|
|
@@ -219,6 +271,8 @@ module Html2rss
|
|
|
219
271
|
candidate.to_s
|
|
220
272
|
end
|
|
221
273
|
|
|
274
|
+
# @param value [Object] about candidate value
|
|
275
|
+
# @return [Array<String, Hash>, nil] normalized about values
|
|
222
276
|
def normalize_about(value)
|
|
223
277
|
candidate = unwrap(value)
|
|
224
278
|
items = candidate.is_a?(Array) ? candidate : [candidate]
|
|
@@ -226,6 +280,8 @@ module Html2rss
|
|
|
226
280
|
values unless values.empty?
|
|
227
281
|
end
|
|
228
282
|
|
|
283
|
+
# @param item [Object] single about item
|
|
284
|
+
# @return [String, Hash, nil] normalized about item
|
|
229
285
|
def normalize_about_item(item)
|
|
230
286
|
case item
|
|
231
287
|
when Hash
|
|
@@ -235,6 +291,8 @@ module Html2rss
|
|
|
235
291
|
end
|
|
236
292
|
end
|
|
237
293
|
|
|
294
|
+
# @param value [Object] scalar or array candidate
|
|
295
|
+
# @return [String, Array<String>, nil] normalized scalar or string array
|
|
238
296
|
def string_or_array(value)
|
|
239
297
|
candidate = unwrap(value)
|
|
240
298
|
return unless present?(candidate)
|
|
@@ -245,15 +303,21 @@ module Html2rss
|
|
|
245
303
|
result unless result.empty?
|
|
246
304
|
end
|
|
247
305
|
|
|
306
|
+
# @param values [Array<Object>] value candidates
|
|
307
|
+
# @return [Array<String>, nil] normalized unique string values
|
|
248
308
|
def array_value(*values)
|
|
249
309
|
result = values.flat_map { string_values(Array(unwrap(_1))) }.uniq
|
|
250
310
|
result unless result.empty?
|
|
251
311
|
end
|
|
252
312
|
|
|
313
|
+
# @param values [Array<Object>] candidate scalar values collected from microdata arrays
|
|
314
|
+
# @return [Array<String>] normalized string values
|
|
253
315
|
def string_values(values)
|
|
254
316
|
values.filter_map { stringify(_1) }
|
|
255
317
|
end
|
|
256
318
|
|
|
319
|
+
# @param values [Array<Object>] value candidates
|
|
320
|
+
# @return [String, nil] first present string-like value
|
|
257
321
|
def first_string(*values)
|
|
258
322
|
values.each do |value|
|
|
259
323
|
candidate = stringify(unwrap(value))
|
|
@@ -263,6 +327,9 @@ module Html2rss
|
|
|
263
327
|
nil
|
|
264
328
|
end
|
|
265
329
|
|
|
330
|
+
# @param value [Object] nested container or scalar
|
|
331
|
+
# @param keys [Array<Symbol>] nested keys to probe in order
|
|
332
|
+
# @return [Object, nil] first matching nested value
|
|
266
333
|
def extract_nested_value(value, *keys)
|
|
267
334
|
candidate = unwrap(value)
|
|
268
335
|
return candidate unless candidate.is_a?(Hash)
|
|
@@ -275,10 +342,14 @@ module Html2rss
|
|
|
275
342
|
nil
|
|
276
343
|
end
|
|
277
344
|
|
|
345
|
+
# @param value [Object] scalar or array candidate
|
|
346
|
+
# @return [Object] first array element or the original value
|
|
278
347
|
def unwrap(value)
|
|
279
348
|
value.is_a?(Array) ? value.first : value
|
|
280
349
|
end
|
|
281
350
|
|
|
351
|
+
# @param value [Object] scalar candidate normalized to string output
|
|
352
|
+
# @return [String, nil] normalized string representation
|
|
282
353
|
def stringify(value)
|
|
283
354
|
return unless present?(value)
|
|
284
355
|
return value if value.is_a?(String)
|
|
@@ -287,6 +358,8 @@ module Html2rss
|
|
|
287
358
|
value.to_s
|
|
288
359
|
end
|
|
289
360
|
|
|
361
|
+
# @param value [Object] candidate value
|
|
362
|
+
# @return [Boolean] whether value is present
|
|
290
363
|
def present?(value)
|
|
291
364
|
case value
|
|
292
365
|
when nil then false
|
|
@@ -304,6 +377,8 @@ module Html2rss
|
|
|
304
377
|
|
|
305
378
|
extend ValueNormalizer
|
|
306
379
|
|
|
380
|
+
# @param root [Nokogiri::XML::Element] supported microdata root node
|
|
381
|
+
# @return [Hash{Symbol => Object}, nil] compact schema-like object
|
|
307
382
|
def call(root)
|
|
308
383
|
type = Microdata.supported_type_name(root)
|
|
309
384
|
return unless type
|
|
@@ -311,12 +386,20 @@ module Html2rss
|
|
|
311
386
|
compact_object(type, root, ItemParser.call(root))
|
|
312
387
|
end
|
|
313
388
|
|
|
389
|
+
# @param type [String] schema type inferred from itemtype
|
|
390
|
+
# @param root [Nokogiri::XML::Element] supported microdata root node
|
|
391
|
+
# @param properties [Hash{Symbol => Object}] parsed microdata properties
|
|
392
|
+
# @return [Hash{Symbol => Object}] normalized schema-like object
|
|
314
393
|
def compact_object(type, root, properties)
|
|
315
394
|
object = base_attributes(type, root, properties)
|
|
316
395
|
merge_categories!(object, properties)
|
|
317
396
|
object.compact
|
|
318
397
|
end
|
|
319
398
|
|
|
399
|
+
# @param type [String] schema type inferred from itemtype
|
|
400
|
+
# @param root [Nokogiri::XML::Element] supported microdata root node
|
|
401
|
+
# @param properties [Hash{Symbol => Object}] parsed microdata properties
|
|
402
|
+
# @return [Hash{Symbol => Object}] base schema attributes before category merging
|
|
320
403
|
def base_attributes(type, root, properties)
|
|
321
404
|
identifier = first_string(root['itemid'], properties.delete(:identifier))
|
|
322
405
|
|
|
@@ -328,10 +411,14 @@ module Html2rss
|
|
|
328
411
|
.merge(media_attributes(properties))
|
|
329
412
|
end
|
|
330
413
|
|
|
414
|
+
# @param properties [Hash{Symbol => Object}] parsed microdata properties
|
|
415
|
+
# @return [String, nil] normalized title
|
|
331
416
|
def title(properties)
|
|
332
417
|
first_string(properties.delete(:headline), properties.delete(:title), properties.delete(:name))
|
|
333
418
|
end
|
|
334
419
|
|
|
420
|
+
# @param properties [Hash{Symbol => Object}] parsed microdata properties
|
|
421
|
+
# @return [Hash{Symbol => Object}] normalized text attributes
|
|
335
422
|
def text_attributes(properties)
|
|
336
423
|
{
|
|
337
424
|
title: title(properties),
|
|
@@ -342,18 +429,26 @@ module Html2rss
|
|
|
342
429
|
}
|
|
343
430
|
end
|
|
344
431
|
|
|
432
|
+
# @param properties [Hash{Symbol => Object}] parsed microdata properties
|
|
433
|
+
# @param identifier [String, nil] identifier candidate for fallback URL handling
|
|
434
|
+
# @return [Hash{Symbol => Object}] normalized link attributes
|
|
345
435
|
def link_attributes(properties, identifier)
|
|
346
436
|
{
|
|
347
437
|
url: url(properties, identifier)
|
|
348
438
|
}
|
|
349
439
|
end
|
|
350
440
|
|
|
441
|
+
# @param properties [Hash{Symbol => Object}] parsed microdata properties
|
|
442
|
+
# @return [Hash{Symbol => Object}] normalized media attributes
|
|
351
443
|
def media_attributes(properties)
|
|
352
444
|
{
|
|
353
445
|
image: image_value(properties.delete(:image), properties.delete(:thumbnailUrl))
|
|
354
446
|
}
|
|
355
447
|
end
|
|
356
448
|
|
|
449
|
+
# @param properties [Hash{Symbol => Object}] parsed microdata properties
|
|
450
|
+
# @param fallback_id [String, nil] identifier candidate for fallback URL handling
|
|
451
|
+
# @return [String, nil] normalized URL candidate
|
|
357
452
|
def url(properties, fallback_id)
|
|
358
453
|
url_value(
|
|
359
454
|
properties.delete(:url),
|
|
@@ -362,6 +457,8 @@ module Html2rss
|
|
|
362
457
|
)
|
|
363
458
|
end
|
|
364
459
|
|
|
460
|
+
# @param fallback_id [String, nil] identifier candidate for fallback URL handling
|
|
461
|
+
# @return [String, nil] fallback URL candidate when identifier looks URL-like
|
|
365
462
|
def url_fallback(fallback_id)
|
|
366
463
|
value = first_string(fallback_id)
|
|
367
464
|
return unless value
|
|
@@ -371,6 +468,8 @@ module Html2rss
|
|
|
371
468
|
nil
|
|
372
469
|
end
|
|
373
470
|
|
|
471
|
+
# @param properties [Hash{Symbol => Object}] parsed microdata properties
|
|
472
|
+
# @return [String, nil] normalized published-at value
|
|
374
473
|
def published_at(properties)
|
|
375
474
|
first_string(
|
|
376
475
|
properties.delete(:datePublished),
|
|
@@ -380,6 +479,9 @@ module Html2rss
|
|
|
380
479
|
)
|
|
381
480
|
end
|
|
382
481
|
|
|
482
|
+
# @param object [Hash{Symbol => Object}] schema-like output object
|
|
483
|
+
# @param properties [Hash{Symbol => Object}] parsed microdata properties
|
|
484
|
+
# @return [void]
|
|
383
485
|
def merge_categories!(object, properties)
|
|
384
486
|
categories = array_value(properties.delete(:categories), properties.delete(:articleSection))
|
|
385
487
|
assign_if_present(object, :categories, categories)
|
|
@@ -388,6 +490,10 @@ module Html2rss
|
|
|
388
490
|
assign_if_present(object, :about, normalize_about(properties.delete(:about)))
|
|
389
491
|
end
|
|
390
492
|
|
|
493
|
+
# @param object [Hash{Symbol => Object}] schema-like output object
|
|
494
|
+
# @param key [Symbol] target attribute key
|
|
495
|
+
# @param value [Object] value to assign when present
|
|
496
|
+
# @return [void]
|
|
391
497
|
def assign_if_present(object, key, value)
|
|
392
498
|
object[key] = value if value
|
|
393
499
|
end
|
|
@@ -90,7 +90,7 @@ module Html2rss
|
|
|
90
90
|
##
|
|
91
91
|
# Extracts categories from a string by splitting on separators.
|
|
92
92
|
#
|
|
93
|
-
# @param string [String]
|
|
93
|
+
# @param string [String] source string that may contain category delimiters
|
|
94
94
|
# @return [Set<String>] Set of category strings
|
|
95
95
|
def self.extract_string_categories(string)
|
|
96
96
|
Set.new(string.split(/[,;|]/).map(&:strip).reject(&:empty?))
|
|
@@ -11,6 +11,7 @@ module Html2rss
|
|
|
11
11
|
#
|
|
12
12
|
# @see https://schema.org/ItemList
|
|
13
13
|
class ItemList < Thing
|
|
14
|
+
# Schema.org type names handled by the ItemList extractor.
|
|
14
15
|
SUPPORTED_TYPES = Set['ItemList']
|
|
15
16
|
|
|
16
17
|
# @return [Array<Hash>] the scraped article hashes with DEFAULT_ATTRIBUTES
|
|
@@ -5,11 +5,13 @@ module Html2rss
|
|
|
5
5
|
module Scraper
|
|
6
6
|
class Schema
|
|
7
7
|
##
|
|
8
|
-
#
|
|
9
8
|
# @see https://schema.org/ListItem
|
|
10
9
|
class ListItem < Thing
|
|
10
|
+
# @return [String, nil] stable list-item identifier
|
|
11
11
|
def id = (id = (schema_object.dig(:item, :@id) || super).to_s).empty? ? nil : id
|
|
12
|
+
# @return [String, nil] list-item title
|
|
12
13
|
def title = schema_object.dig(:item, :name) || super || url&.titleized
|
|
14
|
+
# @return [String, nil] list-item description
|
|
13
15
|
def description = schema_object.dig(:item, :description) || super
|
|
14
16
|
|
|
15
17
|
# @return [Html2rss::Url, nil]
|
|
@@ -11,6 +11,7 @@ module Html2rss
|
|
|
11
11
|
#
|
|
12
12
|
# @see https://schema.org/Thing
|
|
13
13
|
class Thing
|
|
14
|
+
# Supported Schema.org `@type` values mapped to article extraction.
|
|
14
15
|
SUPPORTED_TYPES = %w[
|
|
15
16
|
AdvertiserContentArticle
|
|
16
17
|
AnalysisNewsArticle
|
|
@@ -32,8 +33,11 @@ module Html2rss
|
|
|
32
33
|
TechArticle
|
|
33
34
|
].to_set.freeze
|
|
34
35
|
|
|
36
|
+
# Attributes exposed by `#call` in generated article hashes.
|
|
35
37
|
DEFAULT_ATTRIBUTES = %i[id title description url image published_at categories].freeze
|
|
36
38
|
|
|
39
|
+
# @param schema_object [Hash{Symbol => Object}] parsed schema.org object
|
|
40
|
+
# @param url [String, Html2rss::Url, nil] base URL used for relative normalization
|
|
37
41
|
def initialize(schema_object, url:)
|
|
38
42
|
@schema_object = schema_object
|
|
39
43
|
@base_url = normalized_base_url(url)
|
|
@@ -46,6 +50,7 @@ module Html2rss
|
|
|
46
50
|
end
|
|
47
51
|
end
|
|
48
52
|
|
|
53
|
+
# @return [String, nil] stable schema object identifier
|
|
49
54
|
def id
|
|
50
55
|
return @id if defined?(@id)
|
|
51
56
|
|
|
@@ -56,8 +61,10 @@ module Html2rss
|
|
|
56
61
|
@id = id
|
|
57
62
|
end
|
|
58
63
|
|
|
64
|
+
# @return [String, nil] article title
|
|
59
65
|
def title = schema_object[:title]
|
|
60
66
|
|
|
67
|
+
# @return [String, nil] longest available description field
|
|
61
68
|
def description
|
|
62
69
|
schema_object.values_at(:description, :schema_object_body, :abstract)
|
|
63
70
|
.max_by { |string| string.to_s.size }
|
|
@@ -74,14 +81,17 @@ module Html2rss
|
|
|
74
81
|
Url.from_relative(url, base_url || url)
|
|
75
82
|
end
|
|
76
83
|
|
|
84
|
+
# @return [Html2rss::Url, nil] normalized article image URL
|
|
77
85
|
def image
|
|
78
86
|
if (image_url = image_urls.first)
|
|
79
87
|
Url.from_relative(image_url, base_url || image_url)
|
|
80
88
|
end
|
|
81
89
|
end
|
|
82
90
|
|
|
91
|
+
# @return [String, nil] published-at timestamp string
|
|
83
92
|
def published_at = schema_object[:datePublished]
|
|
84
93
|
|
|
94
|
+
# @return [Array<String>, nil] extracted category labels
|
|
85
95
|
def categories
|
|
86
96
|
return @categories if defined?(@categories)
|
|
87
97
|
|
|
@@ -90,6 +100,7 @@ module Html2rss
|
|
|
90
100
|
|
|
91
101
|
attr_reader :schema_object, :base_url
|
|
92
102
|
|
|
103
|
+
# @return [Array<String>] normalized image URL candidates
|
|
93
104
|
def image_urls
|
|
94
105
|
schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
|
|
95
106
|
next unless object
|
|
@@ -102,6 +113,9 @@ module Html2rss
|
|
|
102
113
|
end
|
|
103
114
|
end
|
|
104
115
|
|
|
116
|
+
# @param value [String, Symbol, nil] candidate schema identifier
|
|
117
|
+
# @param reference_url [Html2rss::Url, nil] URL used for same-origin normalization
|
|
118
|
+
# @return [String, nil] normalized identifier value
|
|
105
119
|
def normalized_id(value, reference_url:)
|
|
106
120
|
text = value.to_s
|
|
107
121
|
return if text.empty?
|
|
@@ -114,6 +128,9 @@ module Html2rss
|
|
|
114
128
|
text
|
|
115
129
|
end
|
|
116
130
|
|
|
131
|
+
# @param text [String] raw identifier text
|
|
132
|
+
# @param reference_url [Html2rss::Url, nil] URL used to resolve relative IDs
|
|
133
|
+
# @return [Html2rss::Url] normalized identifier URL
|
|
117
134
|
def normalized_id_url(text, reference_url:)
|
|
118
135
|
if text.start_with?('/')
|
|
119
136
|
Url.from_relative(text, reference_url || text)
|
|
@@ -122,6 +139,8 @@ module Html2rss
|
|
|
122
139
|
end
|
|
123
140
|
end
|
|
124
141
|
|
|
142
|
+
# @param url [Html2rss::Url] normalized identifier URL
|
|
143
|
+
# @return [String, nil] path/query portion used as stable ID
|
|
125
144
|
def normalized_id_value(url)
|
|
126
145
|
path = url.path.to_s
|
|
127
146
|
return "#{path}?#{url.query}" if (path.empty? || path == '/') && !url.query.to_s.empty?
|
|
@@ -130,6 +149,8 @@ module Html2rss
|
|
|
130
149
|
url.query
|
|
131
150
|
end
|
|
132
151
|
|
|
152
|
+
# @param url [String, Html2rss::Url, nil] candidate page URL
|
|
153
|
+
# @return [Html2rss::Url, nil] normalized absolute URL for schema resolution
|
|
133
154
|
def normalized_base_url(url)
|
|
134
155
|
return if url.to_s.strip.empty?
|
|
135
156
|
|
|
@@ -8,24 +8,28 @@ module Html2rss
|
|
|
8
8
|
module Scraper
|
|
9
9
|
##
|
|
10
10
|
# Scrapes articles from Schema.org objects, by looking for the objects in:
|
|
11
|
-
|
|
12
11
|
# <script type="application/ld+json"> "schema" tags.
|
|
13
12
|
#
|
|
14
|
-
#
|
|
15
|
-
#
|
|
16
|
-
# 2. https://developers.google.com/search/docs/appearance/structured-data/article#microdata
|
|
13
|
+
# @see https://schema.org/docs/full.html
|
|
14
|
+
# @see https://developers.google.com/search/docs/appearance/structured-data/article#microdata
|
|
17
15
|
class Schema
|
|
18
16
|
include Enumerable
|
|
19
17
|
|
|
18
|
+
# Selector for JSON-LD script tags containing Schema.org objects.
|
|
20
19
|
TAG_SELECTOR = 'script[type="application/ld+json"]'
|
|
21
20
|
|
|
21
|
+
# @return [Symbol] scraper config key
|
|
22
22
|
def self.options_key = :schema
|
|
23
23
|
|
|
24
24
|
class << self
|
|
25
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
26
|
+
# @return [Boolean] whether the page includes supported schema types
|
|
25
27
|
def articles?(parsed_body)
|
|
26
28
|
parsed_body.css(TAG_SELECTOR).any? { |script| supported_schema_type?(script) }
|
|
27
29
|
end
|
|
28
30
|
|
|
31
|
+
# @param script [Nokogiri::XML::Element] schema JSON-LD script tag
|
|
32
|
+
# @return [Boolean] whether the tag references a supported schema type
|
|
29
33
|
def supported_schema_type?(script)
|
|
30
34
|
supported_types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES
|
|
31
35
|
supported_types.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
|
|
@@ -52,11 +56,14 @@ module Html2rss
|
|
|
52
56
|
end
|
|
53
57
|
end
|
|
54
58
|
|
|
59
|
+
# @param object [Hash{Symbol => Object}] schema candidate object
|
|
60
|
+
# @return [Boolean] whether an extractor exists for the candidate object
|
|
55
61
|
def supported_schema_object?(object)
|
|
56
62
|
scraper_for_schema_object(object) ? true : false
|
|
57
63
|
end
|
|
58
64
|
|
|
59
65
|
##
|
|
66
|
+
# @param schema_object [Hash{Symbol => Object}] schema object with an @type key
|
|
60
67
|
# @return [Scraper::Schema::Thing, Scraper::Schema::ItemList, nil] a class responding to `#call`
|
|
61
68
|
def scraper_for_schema_object(schema_object)
|
|
62
69
|
type = schema_object[:@type]
|
|
@@ -81,6 +88,10 @@ module Html2rss
|
|
|
81
88
|
end
|
|
82
89
|
end
|
|
83
90
|
|
|
91
|
+
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
92
|
+
# @param url [String, Html2rss::Url] base page URL
|
|
93
|
+
# @param opts [Hash] scraper-specific options
|
|
94
|
+
# @option opts [Object] :_reserved reserved for future scraper-specific options
|
|
84
95
|
def initialize(parsed_body, url:, **opts)
|
|
85
96
|
@parsed_body = parsed_body
|
|
86
97
|
@url = url
|
|
@@ -25,7 +25,9 @@ module Html2rss
|
|
|
25
25
|
:score
|
|
26
26
|
)
|
|
27
27
|
|
|
28
|
+
# Comma-separated heading selector used for heading/anchor matching.
|
|
28
29
|
HEADING_SELECTOR = HtmlExtractor::HEADING_TAGS.join(',').freeze
|
|
30
|
+
# Path segments that usually represent utility navigation rather than article content.
|
|
29
31
|
UTILITY_PATH_SEGMENTS = %w[
|
|
30
32
|
about account author category comment comments contact feedback help
|
|
31
33
|
login newsletter profile register search settings share signup subscribe
|
|
@@ -40,11 +42,14 @@ module Html2rss
|
|
|
40
42
|
logout
|
|
41
43
|
user users
|
|
42
44
|
].to_set.freeze
|
|
45
|
+
# Path segments that signal content-like destinations.
|
|
43
46
|
CONTENT_PATH_SEGMENTS = %w[
|
|
44
47
|
article articles news post posts story stories update updates
|
|
45
48
|
].to_set.freeze
|
|
49
|
+
# Ancestor tags that usually indicate navigation/utility regions.
|
|
46
50
|
UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze
|
|
47
51
|
|
|
52
|
+
# @param base_url [String, Html2rss::Url] page URL used to normalize href destinations
|
|
48
53
|
def initialize(base_url)
|
|
49
54
|
@base_url = base_url
|
|
50
55
|
end
|
|
@@ -20,8 +20,10 @@ module Html2rss
|
|
|
20
20
|
class SemanticHtml
|
|
21
21
|
include Enumerable
|
|
22
22
|
|
|
23
|
+
# Container plus selected anchor chosen for extraction.
|
|
23
24
|
Entry = Data.define(:container, :selected_anchor)
|
|
24
25
|
|
|
26
|
+
# Candidate semantic container selectors used to locate extractable blocks.
|
|
25
27
|
CONTAINER_SELECTORS = [
|
|
26
28
|
'article:not(:has(article))',
|
|
27
29
|
'section:not(:has(section))',
|
|
@@ -45,6 +47,8 @@ module Html2rss
|
|
|
45
47
|
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
46
48
|
# @param url [String, Html2rss::Url] base url
|
|
47
49
|
# @param extractor [Class] extractor class used for article extraction
|
|
50
|
+
# @param _opts [Hash] scraper-specific options
|
|
51
|
+
# @option _opts [Object] :_reserved reserved for future scraper-specific options
|
|
48
52
|
def initialize(parsed_body, url:, extractor: HtmlExtractor, **_opts)
|
|
49
53
|
@parsed_body = parsed_body
|
|
50
54
|
@url = url
|