html2rss 0.17.0 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +48 -656
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +49 -0
- data/lib/html2rss/auto_source/cleanup.rb +33 -5
- data/lib/html2rss/auto_source/scraper/html.rb +118 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +377 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +399 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +2 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -3
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +48 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +12 -8
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +199 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +84 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +261 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +134 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +179 -0
- data/lib/html2rss/auto_source/scraper.rb +142 -8
- data/lib/html2rss/auto_source.rb +119 -47
- data/lib/html2rss/blocked_surface.rb +64 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +170 -23
- data/lib/html2rss/config/class_methods.rb +189 -0
- data/lib/html2rss/config/dynamic_params.rb +68 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +50 -0
- data/lib/html2rss/config/request_headers.rb +130 -0
- data/lib/html2rss/config/schema.rb +208 -0
- data/lib/html2rss/config/validator.rb +108 -0
- data/lib/html2rss/config.rb +112 -61
- data/lib/html2rss/error.rb +6 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +19 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +101 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +49 -0
- data/lib/html2rss/html_extractor.rb +136 -0
- data/lib/html2rss/html_navigator.rb +46 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +58 -0
- data/lib/html2rss/rendering/audio_renderer.rb +31 -0
- data/lib/html2rss/rendering/description_builder.rb +88 -0
- data/lib/html2rss/rendering/image_renderer.rb +31 -0
- data/lib/html2rss/rendering/media_renderer.rb +33 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +28 -0
- data/lib/html2rss/rendering/video_renderer.rb +31 -0
- data/lib/html2rss/rendering.rb +14 -0
- data/lib/html2rss/request_controls.rb +128 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +64 -20
- data/lib/html2rss/request_service/faraday_strategy.rb +135 -5
- data/lib/html2rss/request_service/policy.rb +248 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +42 -2
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +31 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +57 -0
- data/lib/html2rss/request_session/runtime_policy.rb +76 -0
- data/lib/html2rss/request_session.rb +118 -0
- data/lib/html2rss/rss_builder/article.rb +166 -0
- data/lib/html2rss/rss_builder/channel.rb +96 -11
- data/lib/html2rss/rss_builder/enclosure.rb +48 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +4 -4
- data/lib/html2rss/rss_builder.rb +72 -71
- data/lib/html2rss/selectors/config.rb +122 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +50 -0
- data/lib/html2rss/selectors/extractors/href.rb +53 -0
- data/lib/html2rss/selectors/extractors/html.rb +48 -0
- data/lib/html2rss/selectors/extractors/static.rb +41 -0
- data/lib/html2rss/selectors/extractors/text.rb +46 -0
- data/lib/html2rss/selectors/extractors.rb +52 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +61 -0
- data/lib/html2rss/selectors/post_processors/base.rb +74 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +85 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +45 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +35 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +47 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +52 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +73 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +40 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +150 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +74 -0
- data/lib/html2rss/selectors/post_processors/template.rb +73 -0
- data/lib/html2rss/selectors/post_processors.rb +43 -0
- data/lib/html2rss/selectors.rb +294 -0
- data/lib/html2rss/url.rb +262 -0
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +129 -70
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +469 -0
- metadata +115 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class AutoSource
|
|
5
|
+
module Scraper
|
|
6
|
+
# Scrapes Schema.org Microdata items embedded directly in HTML markup.
|
|
7
|
+
class Microdata
|
|
8
|
+
include Enumerable
|
|
9
|
+
|
|
10
|
+
ITEM_SELECTOR = '[itemscope][itemtype]'
|
|
11
|
+
SUPPORTED_TYPES = (Schema::Thing::SUPPORTED_TYPES | Set['Product']).freeze
|
|
12
|
+
VALUE_ATTRIBUTES = %w[content datetime href src data value].freeze
|
|
13
|
+
|
|
14
|
+
# @return [Symbol] the options key of this scraper (+:microdata+)
def self.options_key
  :microdata
end
|
|
15
|
+
|
|
16
|
+
class << self
  # @param parsed_body [#css, nil] the parsed document
  # @return [Boolean] whether the document holds at least one supported root item
  def articles?(parsed_body)
    supported_roots(parsed_body).any?
  end

  # Selects the elements matching ITEM_SELECTOR that qualify as supported roots.
  #
  # @param parsed_body [#css, nil] the parsed document
  # @return [Array] matching elements; empty when no document is given
  def supported_roots(parsed_body)
    candidates = parsed_body ? parsed_body.css(ITEM_SELECTOR) : []
    candidates.select { |node| supported_root?(node) }
  end

  # @param node [#[], #attribute, #ancestors] an element carrying itemscope/itemtype
  # @return [Boolean, nil] truthy when the node has a supported type and is top-level
  def supported_root?(node)
    type_name = supported_type_name(node)
    type_name && top_level_item?(node)
  end

  # @param node [#[]] an element whose `itemtype` attribute is read
  # @return [String, nil] the first normalized type contained in SUPPORTED_TYPES
  def supported_type_name(node)
    normalized_types(node['itemtype']).detect { |type| SUPPORTED_TYPES.include?(type) }
  end

  # Reduces each whitespace-separated itemtype entry to its trailing name:
  # the part after the last `/`, then after the last `#`.
  #
  # @param itemtype [String, nil] raw `itemtype` attribute value
  # @return [Array<String>] the non-empty type names, in attribute order
  def normalized_types(itemtype)
    itemtype.to_s.split
            .map { |entry| entry.split('/').last.to_s.split('#').last.to_s }
            .reject(&:empty?)
  end

  # A node is top-level when it is not itself a property and no ancestor is
  # a nested item (an element carrying both `itemscope` and `itemprop`).
  #
  # @param node [#attribute, #ancestors]
  # @return [Boolean]
  def top_level_item?(node)
    if node.attribute('itemprop')
      false
    else
      nested = node.ancestors.any? do |ancestor|
        ancestor.attribute('itemscope') && ancestor.attribute('itemprop')
      end
      !nested
    end
  end
end
|
|
48
|
+
|
|
49
|
+
##
# Creates a Microdata scraper bound to a parsed response body and its page URL.
#
# @param parsed_body [Nokogiri::HTML5::Document, Nokogiri::HTML4::Document, Nokogiri::XML::Node, nil]
#   the parsed response body that is searched for top-level Microdata items.
# @param url [Html2rss::Url] the absolute page URL used to resolve relative links.
# @param _opts [Hash] scraper-specific options; not used by this scraper.
# @return [void]
def initialize(parsed_body, url:, **_opts)
  @url = url
  @parsed_body = parsed_body
end
|
|
61
|
+
|
|
62
|
+
##
# Yields one normalized article hash per supported Microdata root.
# Roots for which no article can be built are skipped.
#
# @yieldparam article [Hash<Symbol, Object>] the normalized article attributes.
# @return [Enumerator, void] an enumerator when no block is given.
def each
  if block_given?
    self.class.supported_roots(parsed_body).each do |item_root|
      extracted = article_from(item_root)
      next unless extracted

      yield extracted
    end
  else
    enum_for(:each)
  end
end
|
|
75
|
+
|
|
76
|
+
private
|
|
77
|
+
|
|
78
|
+
attr_reader :parsed_body, :url
|
|
79
|
+
|
|
80
|
+
# Builds the article hash for one Microdata root.
#
# @param root [Object] a supported root element
# @return [Hash, nil] the compacted article, or nil when no schema object
#   could be built or the article fails {#valid_article?}
def article_from(root)
  built = SchemaObjectBuilder.call(root)
  return nil unless built

  candidate = Schema::Thing.new(built, url: url).call.compact
  candidate if valid_article?(candidate)
end
|
|
89
|
+
|
|
90
|
+
# An article is usable when it has a URL plus a title or a description.
#
# @param article [Hash] the article attributes
# @return [Object, false, nil] false without a URL; otherwise the title or,
#   failing that, the description (nil when both are missing)
def valid_article?(article)
  article[:url] ? (article[:title] || article[:description]) : false
end
|
|
95
|
+
|
|
96
|
+
# Extracts direct Microdata itemprop values for a single item root.
#
# Properties located inside a nested item (an element with `itemscope` below
# the root) are not merged into the root; a nested item that is itself a
# property is parsed recursively into its own Hash.
module ItemParser
  module_function

  # Collects every direct property of +root+.
  #
  # @param root [#css] the item root element
  # @return [Hash{Symbol => Object}] property name mapped to a String, a nested
  #   item Hash, or an Array of those when the property occurs more than once
  def call(root)
    {}.tap do |properties|
      direct_properties(root).each { append_properties!(properties, _1) }
    end
  end

  # Stores the value of +node+ under every name listed in its `itemprop`.
  #
  # @param properties [Hash] accumulator, mutated in place
  # @param node [Object] an element carrying `itemprop`
  # @return [void]
  def append_properties!(properties, node)
    value = property_value(node)
    return if blank_value?(value)

    property_names(node).each do |name|
      append(properties, name.to_sym, value)
    end
  end

  # @param root [#css] the item root element
  # @return [Array] the `[itemprop]` descendants that belong directly to +root+
  def direct_properties(root)
    root.css('[itemprop]').select { direct_property?(root, _1) }
  end

  # A property is direct when no element between it and +root+ opens
  # another item scope.
  #
  # @return [Boolean]
  def direct_property?(root, node)
    return false if node == root

    node.ancestors.take_while { _1 != root }.none? { |ancestor| ancestor.attribute('itemscope') }
  end

  # @param node [#[]] an element carrying `itemprop`
  # @return [Array<String>] the whitespace-separated property names
  def property_names(node)
    # String#split without arguments already discards surrounding whitespace
    # and never yields empty tokens.
    node['itemprop'].to_s.split
  end

  # @param node [Object] an element carrying `itemprop`
  # @return [String, Hash, nil] a nested item Hash for `itemscope` nodes,
  #   otherwise the first value attribute or the stripped text; nil when blank
  def property_value(node)
    value = if node.attribute('itemscope')
              nested_item(node)
            else
              attribute_value(node) || text_value(node)
            end

    value unless blank_value?(value)
  end

  # Parses a nested item and records its type and id under :@type / :@id.
  #
  # @param node [Object] an element carrying `itemscope`
  # @return [Hash{Symbol => Object}]
  def nested_item(node)
    item = call(node)
    itemtype = node['itemtype']
    itemid = node['itemid']
    item[:@type] = Microdata.normalized_types(itemtype).first if itemtype
    item[:@id] = itemid if present?(itemid)
    item
  end

  # @param node [#[]] the property element
  # @return [String, nil] the first non-blank attribute listed in
  #   VALUE_ATTRIBUTES, checked in that constant's order
  def attribute_value(node)
    VALUE_ATTRIBUTES.each do |attribute|
      value = node[attribute]
      return value if present?(value)
    end

    nil
  end

  # @param node [#text] the property element
  # @return [String, nil] the stripped text content, nil when empty
  def text_value(node)
    value = node.text.to_s.strip
    value unless value.empty?
  end

  # Adds +value+ under +key+, turning the entry into an Array once the same
  # key is seen a second time.
  #
  # @param properties [Hash] accumulator, mutated in place
  # @param key [Symbol] property name
  # @param value [String, Hash, nil] property value; blank values are ignored
  # @return [Array, nil] the accumulated Array when the key already existed
  def append(properties, key, value)
    return if blank_value?(value)

    unless properties.key?(key)
      properties[key] = value
      return
    end

    existing = properties[key]
    # Kernel#Array would explode a Hash (a nested item) into [key, value]
    # pairs and destroy it, so single values are wrapped explicitly instead.
    collected = existing.is_a?(Array) ? existing : [existing]
    properties[key] = collected << value
  end

  # @return [Boolean] true for nil, whitespace-only Strings and empty Arrays/Hashes
  def blank_value?(value)
    case value
    when nil then true
    when String then value.strip.empty?
    when Array, Hash then value.empty?
    else false
    end
  end

  # @return [Boolean] the negation of {#blank_value?}
  def present?(value)
    !blank_value?(value)
  end
end
|
|
189
|
+
private_constant :ItemParser
|
|
190
|
+
|
|
191
|
+
# Shared value normalization helpers for Microdata property conversion.
#
# Most helpers pass their input through {#unwrap} first, so for an Array
# input only its first element is considered.
module ValueNormalizer
  module_function

  # @param values [Array<Object>] candidates in priority order
  # @return [String, nil] the first present URL candidate, stringified;
  #   for a Hash candidate the :url entry, then the :@id entry, is used
  def url_value(*values)
    found = values.lazy
                  .map { |value| extract_nested_value(value, :url, :@id) }
                  .find { |candidate| present?(candidate) }
    found.nil? ? nil : found.to_s
  end

  # @param values [Array<Object>] candidates in priority order
  # @return [String, Hash, nil] the first present normalized image
  def image_value(*values)
    values.lazy
          .map { |value| normalize_image(value) }
          .find { |candidate| present?(candidate) }
  end

  # @return [String, Hash, nil] Strings and Hashes unchanged, other present
  #   values stringified, nil for blank input
  def normalize_image(value)
    first = unwrap(value)
    return nil unless present?(first)

    case first
    when String, Hash then first
    else first.to_s
    end
  end

  # @return [Array<String, Hash>, nil] normalized `about` entries, nil when none
  def normalize_about(value)
    first = unwrap(value)
    entries = first.is_a?(Array) ? first : [first]
    normalized = entries.map { |entry| normalize_about_item(entry) }.compact
    normalized.empty? ? nil : normalized
  end

  # @return [Hash, String, nil] `{ name: ... }` for a Hash with a :name,
  #   the String itself for Strings, nil otherwise
  def normalize_about_item(item)
    return item if item.is_a?(String)
    return nil unless item.is_a?(Hash)

    label = item[:name]
    label ? { name: label.to_s } : nil
  end

  # @return [String, Array<String>, nil] a String for scalar input, an Array of
  #   Strings when the unwrapped value is itself an Array, nil when blank
  def string_or_array(value)
    first = unwrap(value)
    return nil unless present?(first)

    if first.is_a?(Array)
      strings = string_values(first)
      strings.empty? ? nil : strings
    else
      stringify(first)
    end
  end

  # @param values [Array<Object>] sources to combine
  # @return [Array<String>, nil] unique strings taken from each unwrapped
  #   source, nil when there are none
  def array_value(*values)
    collected = values.map { |value| string_values(Array(unwrap(value))) }.flatten(1).uniq
    collected.empty? ? nil : collected
  end

  # @param values [Array<Object>]
  # @return [Array<String>] the stringifiable entries of +values+
  def string_values(values)
    values.map { |value| stringify(value) }.compact
  end

  # @param values [Array<Object>] candidates in priority order
  # @return [String, nil] the first candidate that stringifies to a present String
  def first_string(*values)
    values.lazy
          .map { |value| stringify(unwrap(value)) }
          .find { |candidate| present?(candidate) }
  end

  # @param value [Object] a scalar, Hash, or Array of those
  # @param keys [Array<Symbol>] keys to try, in order, when the value is a Hash
  # @return [Object, nil] the unwrapped value itself unless it is a Hash;
  #   for a Hash the first present entry among +keys+
  def extract_nested_value(value, *keys)
    first = unwrap(value)
    return first unless first.is_a?(Hash)

    keys.lazy.map { |key| first[key] }.find { |nested| present?(nested) }
  end

  # @return [Object] the first element of an Array, any other value unchanged
  def unwrap(value)
    return value unless value.is_a?(Array)

    value.first
  end

  # @return [String, nil] Strings unchanged, other present scalars via #to_s,
  #   nil for blank values, Hashes and Arrays
  def stringify(value)
    return nil unless present?(value)

    case value
    when String then value
    when Hash, Array then nil
    else value.to_s
    end
  end

  # @return [Boolean] false for nil, whitespace-only Strings and empty Arrays/Hashes
  def present?(value)
    return false if value.nil?
    return !value.strip.empty? if value.is_a?(String)
    return !value.empty? if value.is_a?(Array) || value.is_a?(Hash)

    true
  end
end
|
|
299
|
+
private_constant :ValueNormalizer
|
|
300
|
+
|
|
301
|
+
# Normalizes raw Microdata properties into the schema-like shape used downstream.
#
# The helpers remove every key they consume from the properties Hash they
# are handed.
module SchemaObjectBuilder
  extend ValueNormalizer

  module_function

  # @param root [Object] a Microdata root element
  # @return [Hash, nil] the compacted schema object, nil when the root has
  #   no supported type
  def call(root)
    type_name = Microdata.supported_type_name(root)
    type_name && compact_object(type_name, root, ItemParser.call(root))
  end

  # @return [Hash] base attributes plus category fields, without nil values
  def compact_object(type, root, properties)
    base_attributes(type, root, properties)
      .tap { |object| merge_categories!(object, properties) }
      .compact
  end

  # @return [Hash] :@type and :@id followed by text, link and media attributes
  def base_attributes(type, root, properties)
    identifier = first_string(root['itemid'], properties.delete(:identifier))
    header = { '@type': type, '@id': identifier }

    header.merge(
      text_attributes(properties),
      link_attributes(properties, identifier),
      media_attributes(properties)
    )
  end

  # @return [String, nil] the first present of headline, title and name
  def title(properties)
    candidates = %i[headline title name].map { |key| properties.delete(key) }
    first_string(*candidates)
  end

  # @return [Hash] title, description, body, abstract and publication date
  def text_attributes(properties)
    attributes = { title: title(properties) }
    attributes[:description] = first_string(properties.delete(:description))
    attributes[:schema_object_body] = first_string(properties.delete(:articleBody))
    attributes[:abstract] = first_string(properties.delete(:abstract))
    attributes[:datePublished] = published_at(properties)
    attributes
  end

  # @return [Hash] the resolved :url entry
  def link_attributes(properties, identifier)
    resolved = url(properties, identifier)
    { url: resolved }
  end

  # @return [Hash] the :image entry, taken from image or thumbnailUrl
  def media_attributes(properties)
    primary = properties.delete(:image)
    thumbnail = properties.delete(:thumbnailUrl)
    { image: image_value(primary, thumbnail) }
  end

  # @return [String, nil] url, then mainEntityOfPage, then a URL-like identifier
  def url(properties, fallback_id)
    explicit = properties.delete(:url)
    main_entity = properties.delete(:mainEntityOfPage)
    url_value(explicit, main_entity, url_fallback(fallback_id))
  end

  # @return [String, nil] the identifier when it starts with `/` or with
  #   `http://` / `https://`, nil otherwise
  def url_fallback(fallback_id)
    candidate = first_string(fallback_id)
    return unless candidate

    candidate if candidate.start_with?('/') || candidate.match?(%r{\Ahttps?://})
  end

  # @return [String, nil] the first present of datePublished, dateCreated,
  #   dateModified and uploadDate
  def published_at(properties)
    candidates = %i[datePublished dateCreated dateModified uploadDate].map { |key| properties.delete(key) }
    first_string(*candidates)
  end

  # Copies the category-like properties onto +object+ when they are present.
  def merge_categories!(object, properties)
    categories = array_value(properties.delete(:categories), properties.delete(:articleSection))
    keywords = string_or_array(properties.delete(:keywords))
    tags = string_or_array(properties.delete(:tags))
    about = normalize_about(properties.delete(:about))

    assign_if_present(object, :categories, categories)
    assign_if_present(object, :keywords, keywords)
    assign_if_present(object, :tags, tags)
    assign_if_present(object, :about, about)
  end

  # @return [Object, nil] the assigned value, nil when nothing was assigned
  def assign_if_present(object, key, value)
    return unless value

    object[key] = value
  end
end
|
|
395
|
+
private_constant :SchemaObjectBuilder
|
|
396
|
+
end
|
|
397
|
+
end
|
|
398
|
+
end
|
|
399
|
+
end
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class AutoSource
|
|
5
|
+
module Scraper
|
|
6
|
+
class Schema
|
|
7
|
+
##
# Extracts categories from Schema.org structured data.
module CategoryExtractor
  ##
  # Extracts categories from a schema object.
  #
  # @param schema_object [Hash] The schema object
  # @return [Array<String>] Array of unique category strings
  def self.call(schema_object)
    # Build union of all category sources
    field_categories = extract_field_categories(schema_object)
    about_categories = extract_about_categories(schema_object)

    (field_categories | about_categories).to_a
  end

  ##
  # Extracts categories from keywords, categories, and tags fields.
  #
  # @param schema_object [Hash] The schema object
  # @return [Set<String>] Set of category strings
  def self.extract_field_categories(schema_object)
    %w[keywords categories tags].each_with_object(Set.new) do |field, categories|
      categories.merge(extract_field_value(schema_object, field))
    end
  end

  ##
  # Extracts categories from the about field.
  #
  # A single object (Hash) is treated like a one-element array, so its name
  # is not silently dropped.
  #
  # @param schema_object [Hash] The schema object
  # @return [Set<String>] Set of category strings
  def self.extract_about_categories(schema_object)
    about = schema_object[:about]

    case about
    when Array then extract_about_array(about)
    when Hash then extract_about_array([about])
    when String then extract_string_categories(about)
    else Set.new
    end
  end

  ##
  # Extracts categories from a single field value.
  #
  # @param schema_object [Hash] The schema object
  # @param field [String] The field name
  # @return [Set<String>] Set of category strings
  def self.extract_field_value(schema_object, field)
    value = schema_object[field.to_sym]

    case value
    when Array then clean_categories(value.map(&:to_s))
    when String then extract_string_categories(value)
    else Set.new
    end
  end

  ##
  # Extracts categories from an about array.
  #
  # Hash items contribute their :name, String items contribute themselves;
  # any other item is ignored.
  #
  # @param about [Array] The about array
  # @return [Set<String>] Set of category strings
  def self.extract_about_array(about)
    names = about.filter_map do |item|
      case item
      when Hash then item[:name].to_s if item[:name]
      when String then item
      end
    end

    clean_categories(names)
  end

  ##
  # Extracts categories from a string by splitting on separators.
  #
  # @param string [String] The string to process
  # @return [Set<String>] Set of category strings
  def self.extract_string_categories(string)
    clean_categories(string.split(/[,;|]/))
  end

  ##
  # Strips every entry and drops the ones that end up empty, so all sources
  # apply the same blank filtering.
  #
  # @param values [Array<String>] raw category strings
  # @return [Set<String>] Set of category strings
  def self.clean_categories(values)
    Set.new(values.map(&:strip).reject(&:empty?))
  end
  private_class_method :clean_categories
end
|
|
99
|
+
end
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
end
|
|
@@ -17,12 +17,12 @@ module Html2rss
|
|
|
17
17
|
def call
|
|
18
18
|
hashes = [super]
|
|
19
19
|
|
|
20
|
-
return hashes
|
|
20
|
+
return hashes unless (elements = @schema_object[:itemListElement])
|
|
21
21
|
|
|
22
22
|
elements = [elements] unless elements.is_a?(Array)
|
|
23
23
|
|
|
24
24
|
elements.each do |schema_object|
|
|
25
|
-
hashes << ListItem.new(schema_object, url:
|
|
25
|
+
hashes << ListItem.new(schema_object, url: base_url || '').call
|
|
26
26
|
end
|
|
27
27
|
|
|
28
28
|
hashes
|
|
@@ -9,14 +9,14 @@ module Html2rss
|
|
|
9
9
|
# @see https://schema.org/ListItem
|
|
10
10
|
class ListItem < Thing
|
|
11
11
|
def id = (id = (schema_object.dig(:item, :@id) || super).to_s).empty? ? nil : id
|
|
12
|
-
def title = schema_object.dig(:item, :name) || super ||
|
|
12
|
+
def title = schema_object.dig(:item, :name) || super || url&.titleized
|
|
13
13
|
def description = schema_object.dig(:item, :description) || super
|
|
14
14
|
|
|
15
|
-
# @return [
|
|
15
|
+
# @return [Html2rss::Url, nil]
|
|
16
16
|
def url
|
|
17
17
|
url = schema_object.dig(:item, :url) || super
|
|
18
18
|
|
|
19
|
-
|
|
19
|
+
Url.from_relative(url, base_url || url) if url
|
|
20
20
|
end
|
|
21
21
|
end
|
|
22
22
|
end
|
|
@@ -32,11 +32,11 @@ module Html2rss
|
|
|
32
32
|
TechArticle
|
|
33
33
|
].to_set.freeze
|
|
34
34
|
|
|
35
|
-
DEFAULT_ATTRIBUTES = %i[id title description url image published_at].freeze
|
|
35
|
+
DEFAULT_ATTRIBUTES = %i[id title description url image published_at categories].freeze
|
|
36
36
|
|
|
37
37
|
def initialize(schema_object, url:)
|
|
38
38
|
@schema_object = schema_object
|
|
39
|
-
@
|
|
39
|
+
@base_url = normalized_base_url(url)
|
|
40
40
|
end
|
|
41
41
|
|
|
42
42
|
# @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
|
|
@@ -49,7 +49,7 @@ module Html2rss
|
|
|
49
49
|
def id
|
|
50
50
|
return @id if defined?(@id)
|
|
51
51
|
|
|
52
|
-
id = (schema_object[:@id] || url&.path
|
|
52
|
+
id = normalized_id(schema_object[:@id], reference_url: url || base_url) || url&.path.to_s
|
|
53
53
|
|
|
54
54
|
return if id.empty?
|
|
55
55
|
|
|
@@ -63,7 +63,7 @@ module Html2rss
|
|
|
63
63
|
.max_by { |string| string.to_s.size }
|
|
64
64
|
end
|
|
65
65
|
|
|
66
|
-
# @return [
|
|
66
|
+
# @return [Html2rss::Url, nil] the URL of the schema object
|
|
67
67
|
def url
|
|
68
68
|
url = schema_object[:url]
|
|
69
69
|
if url.to_s.empty?
|
|
@@ -71,20 +71,24 @@ module Html2rss
|
|
|
71
71
|
return
|
|
72
72
|
end
|
|
73
73
|
|
|
74
|
-
|
|
74
|
+
Url.from_relative(url, base_url || url)
|
|
75
75
|
end
|
|
76
76
|
|
|
77
77
|
def image
|
|
78
78
|
if (image_url = image_urls.first)
|
|
79
|
-
|
|
79
|
+
Url.from_relative(image_url, base_url || image_url)
|
|
80
80
|
end
|
|
81
81
|
end
|
|
82
82
|
|
|
83
83
|
def published_at = schema_object[:datePublished]
|
|
84
84
|
|
|
85
|
-
|
|
85
|
+
def categories
|
|
86
|
+
return @categories if defined?(@categories)
|
|
86
87
|
|
|
87
|
-
|
|
88
|
+
@categories = CategoryExtractor.call(schema_object)
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
attr_reader :schema_object, :base_url
|
|
88
92
|
|
|
89
93
|
def image_urls
|
|
90
94
|
schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
|
|
@@ -97,6 +101,42 @@ module Html2rss
|
|
|
97
101
|
end
|
|
98
102
|
end
|
|
99
103
|
end
|
|
104
|
+
|
|
105
|
+
# Normalizes a schema `@id` relative to a reference URL.
#
# @param value [Object] the raw id
# @param reference_url [#host, nil] URL whose host the id is compared against
# @return [String, nil] nil for an empty id; the result of
#   normalized_id_value when the id's host equals the reference host;
#   otherwise the id text unchanged (also when an ArgumentError is raised
#   while processing it)
def normalized_id(value, reference_url:)
  text = value.to_s
  return nil if text.empty?

  candidate = normalized_id_url(text, reference_url: reference_url)
  same_host = reference_url && candidate.host == reference_url.host
  same_host ? normalized_id_value(candidate) : text
rescue ArgumentError
  text
end
|
|
116
|
+
|
|
117
|
+
# Turns the id text into a Url: ids starting with `/` are resolved against
# the reference URL (or against themselves when none is given), all other
# ids are passed to Url.from_absolute.
#
# @param text [String] the non-empty id text
# @param reference_url [Object, nil] base for ids starting with `/`
# @return [Object] whatever Url.from_relative / Url.from_absolute return
def normalized_id_url(text, reference_url:)
  return Url.from_absolute(text) unless text.start_with?('/')

  Url.from_relative(text, reference_url || text)
end
|
|
124
|
+
|
|
125
|
+
# Derives the id string from a URL's path and query.
#
# @param url [#path, #query]
# @return [String, nil] "path?query" when the path is empty or "/" and a
#   query exists; the path when it is non-empty; otherwise the query
def normalized_id_value(url)
  path = url.path.to_s
  rootless = path.empty? || path == '/'
  has_query = !url.query.to_s.empty?

  if rootless && has_query
    "#{path}?#{url.query}"
  elsif path.empty?
    url.query
  else
    path
  end
end
|
|
132
|
+
|
|
133
|
+
# Converts the given page URL into a Url via Url.from_absolute.
#
# @param url [Object] the raw base URL
# @return [Object, nil] nil when the URL is blank or Url.from_absolute
#   raises ArgumentError
def normalized_base_url(url)
  blank = url.to_s.strip.empty?
  blank ? nil : Url.from_absolute(url)
rescue ArgumentError
  nil
end
|
|
100
140
|
end
|
|
101
141
|
end
|
|
102
142
|
end
|
|
@@ -19,13 +19,16 @@ module Html2rss
|
|
|
19
19
|
|
|
20
20
|
TAG_SELECTOR = 'script[type="application/ld+json"]'
|
|
21
21
|
|
|
22
|
+
def self.options_key = :schema
|
|
23
|
+
|
|
22
24
|
class << self
|
|
23
25
|
def articles?(parsed_body)
|
|
24
|
-
parsed_body.css(TAG_SELECTOR).any?
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
26
|
+
parsed_body.css(TAG_SELECTOR).any? { |script| supported_schema_type?(script) }
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# Checks whether a script tag's text mentions a supported `@type`.
#
# The text is matched with a regular expression of the form
# `"@type": "<Type>"`; it is not parsed as JSON here.
#
# @param script [#text] the script element
# @return [Boolean] true when any type from Thing::SUPPORTED_TYPES or
#   ItemList::SUPPORTED_TYPES is matched
def supported_schema_type?(script)
  # Read the element text once instead of once per supported type.
  json = script.text
  supported_types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES
  supported_types.any? { |type| json.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
end
|
|
30
33
|
|
|
31
34
|
##
|
|
@@ -63,7 +66,7 @@ module Html2rss
|
|
|
63
66
|
elsif ItemList::SUPPORTED_TYPES.member?(type)
|
|
64
67
|
ItemList
|
|
65
68
|
else
|
|
66
|
-
Log.
|
|
69
|
+
Log.debug("#{name}: unsupported schema object @type=#{type.inspect}")
|
|
67
70
|
nil
|
|
68
71
|
end
|
|
69
72
|
end
|
|
@@ -73,14 +76,15 @@ module Html2rss
|
|
|
73
76
|
def parse_script_tag(script_tag)
|
|
74
77
|
JSON.parse(script_tag.text, symbolize_names: true)
|
|
75
78
|
rescue JSON::ParserError => error
|
|
76
|
-
Log.warn(
|
|
79
|
+
Log.warn("#{name}: failed to parse JSON", error: error.message)
|
|
77
80
|
[]
|
|
78
81
|
end
|
|
79
82
|
end
|
|
80
83
|
|
|
81
|
-
def initialize(parsed_body, url
|
|
84
|
+
def initialize(parsed_body, url:, **opts)
|
|
82
85
|
@parsed_body = parsed_body
|
|
83
86
|
@url = url
|
|
87
|
+
@opts = opts
|
|
84
88
|
end
|
|
85
89
|
|
|
86
90
|
##
|