html2rss 0.17.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +90 -639
- data/exe/html2rss +1 -1
- data/html2rss.gemspec +5 -2
- data/lib/html2rss/articles/deduplicator.rb +50 -0
- data/lib/html2rss/auto_source/cleanup.rb +44 -5
- data/lib/html2rss/auto_source/scraper/html.rb +123 -43
- data/lib/html2rss/auto_source/scraper/json_state.rb +457 -0
- data/lib/html2rss/auto_source/scraper/microdata.rb +505 -0
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +102 -0
- data/lib/html2rss/auto_source/scraper/schema/item_list.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +6 -4
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +69 -8
- data/lib/html2rss/auto_source/scraper/schema.rb +27 -12
- data/lib/html2rss/auto_source/scraper/semantic_html/anchor_selector.rb +204 -0
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +88 -79
- data/lib/html2rss/auto_source/scraper/wordpress_api/page_scope.rb +311 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api/posts_endpoint.rb +135 -0
- data/lib/html2rss/auto_source/scraper/wordpress_api.rb +186 -0
- data/lib/html2rss/auto_source/scraper.rb +160 -8
- data/lib/html2rss/auto_source.rb +123 -47
- data/lib/html2rss/blocked_surface.rb +65 -0
- data/lib/html2rss/category_extractor.rb +82 -0
- data/lib/html2rss/cli.rb +194 -23
- data/lib/html2rss/config/class_methods.rb +178 -0
- data/lib/html2rss/config/dynamic_params.rb +70 -0
- data/lib/html2rss/config/multiple_feeds_config.rb +51 -0
- data/lib/html2rss/config/request_headers.rb +136 -0
- data/lib/html2rss/config/schema.rb +240 -0
- data/lib/html2rss/config/validator.rb +146 -0
- data/lib/html2rss/config.rb +118 -61
- data/lib/html2rss/error.rb +31 -0
- data/lib/html2rss/feed_pipeline/auto_fallback.rb +127 -0
- data/lib/html2rss/feed_pipeline.rb +127 -0
- data/lib/html2rss/hash_util.rb +101 -0
- data/lib/html2rss/html_extractor/date_extractor.rb +20 -0
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +120 -0
- data/lib/html2rss/html_extractor/image_extractor.rb +58 -0
- data/lib/html2rss/html_extractor.rb +141 -0
- data/lib/html2rss/html_navigator.rb +54 -0
- data/lib/html2rss/json_feed_builder/item.rb +94 -0
- data/lib/html2rss/json_feed_builder.rb +59 -0
- data/lib/html2rss/rendering/audio_renderer.rb +36 -0
- data/lib/html2rss/rendering/description_builder.rb +87 -0
- data/lib/html2rss/rendering/image_renderer.rb +41 -0
- data/lib/html2rss/rendering/media_renderer.rb +37 -0
- data/lib/html2rss/rendering/pdf_renderer.rb +34 -0
- data/lib/html2rss/rendering/video_renderer.rb +36 -0
- data/lib/html2rss/rendering.rb +23 -0
- data/lib/html2rss/request_controls.rb +123 -0
- data/lib/html2rss/request_service/botasaurus_contract.rb +161 -0
- data/lib/html2rss/request_service/botasaurus_strategy.rb +98 -0
- data/lib/html2rss/request_service/browserless_strategy.rb +103 -7
- data/lib/html2rss/request_service/budget.rb +39 -0
- data/lib/html2rss/request_service/context.rb +77 -21
- data/lib/html2rss/request_service/faraday_strategy.rb +137 -5
- data/lib/html2rss/request_service/policy.rb +252 -0
- data/lib/html2rss/request_service/puppet_commander.rb +212 -13
- data/lib/html2rss/request_service/response.rb +51 -3
- data/lib/html2rss/request_service/response_guard.rb +62 -0
- data/lib/html2rss/request_service.rb +50 -15
- data/lib/html2rss/request_session/rel_next_pager.rb +70 -0
- data/lib/html2rss/request_session/runtime_input.rb +71 -0
- data/lib/html2rss/request_session/runtime_policy.rb +83 -0
- data/lib/html2rss/request_session.rb +122 -0
- data/lib/html2rss/rss_builder/article.rb +187 -0
- data/lib/html2rss/rss_builder/channel.rb +105 -11
- data/lib/html2rss/rss_builder/enclosure.rb +62 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +8 -4
- data/lib/html2rss/rss_builder.rb +76 -71
- data/lib/html2rss/selectors/config.rb +123 -0
- data/lib/html2rss/selectors/extractors/attribute.rb +52 -0
- data/lib/html2rss/selectors/extractors/href.rb +55 -0
- data/lib/html2rss/selectors/extractors/html.rb +49 -0
- data/lib/html2rss/selectors/extractors/static.rb +42 -0
- data/lib/html2rss/selectors/extractors/text.rb +47 -0
- data/lib/html2rss/selectors/extractors.rb +53 -0
- data/lib/html2rss/selectors/object_to_xml_converter.rb +62 -0
- data/lib/html2rss/selectors/post_processors/base.rb +80 -0
- data/lib/html2rss/selectors/post_processors/gsub.rb +88 -0
- data/lib/html2rss/selectors/post_processors/html_to_markdown.rb +48 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/transform_urls_to_absolute_ones.rb +44 -0
- data/lib/html2rss/selectors/post_processors/html_transformers/wrap_img_in_a.rb +53 -0
- data/lib/html2rss/selectors/post_processors/markdown_to_html.rb +55 -0
- data/lib/html2rss/selectors/post_processors/parse_time.rb +78 -0
- data/lib/html2rss/selectors/post_processors/parse_uri.rb +43 -0
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +154 -0
- data/lib/html2rss/selectors/post_processors/substring.rb +77 -0
- data/lib/html2rss/selectors/post_processors/template.rb +76 -0
- data/lib/html2rss/selectors/post_processors.rb +48 -0
- data/lib/html2rss/selectors.rb +301 -0
- data/lib/html2rss/url.rb +266 -0
- data/lib/html2rss/version.rb +2 -1
- data/lib/html2rss.rb +67 -71
- data/lib/tasks/config_schema.rake +17 -0
- data/schema/html2rss-config.schema.json +551 -0
- metadata +120 -38
- data/lib/html2rss/attribute_post_processors/base.rb +0 -74
- data/lib/html2rss/attribute_post_processors/gsub.rb +0 -64
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +0 -43
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +0 -27
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +0 -41
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +0 -50
- data/lib/html2rss/attribute_post_processors/parse_time.rb +0 -46
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +0 -46
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +0 -115
- data/lib/html2rss/attribute_post_processors/substring.rb +0 -72
- data/lib/html2rss/attribute_post_processors/template.rb +0 -101
- data/lib/html2rss/attribute_post_processors.rb +0 -44
- data/lib/html2rss/auto_source/article.rb +0 -127
- data/lib/html2rss/auto_source/channel.rb +0 -78
- data/lib/html2rss/auto_source/reducer.rb +0 -48
- data/lib/html2rss/auto_source/rss_builder.rb +0 -70
- data/lib/html2rss/auto_source/scraper/semantic_html/extractor.rb +0 -136
- data/lib/html2rss/auto_source/scraper/semantic_html/image.rb +0 -54
- data/lib/html2rss/config/channel.rb +0 -125
- data/lib/html2rss/config/selectors.rb +0 -103
- data/lib/html2rss/item.rb +0 -186
- data/lib/html2rss/item_extractors/attribute.rb +0 -50
- data/lib/html2rss/item_extractors/href.rb +0 -52
- data/lib/html2rss/item_extractors/html.rb +0 -46
- data/lib/html2rss/item_extractors/static.rb +0 -39
- data/lib/html2rss/item_extractors/text.rb +0 -44
- data/lib/html2rss/item_extractors.rb +0 -88
- data/lib/html2rss/object_to_xml_converter.rb +0 -56
- data/lib/html2rss/rss_builder/item.rb +0 -83
- data/lib/html2rss/utils.rb +0 -113
|
@@ -0,0 +1,505 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class AutoSource
|
|
5
|
+
module Scraper
|
|
6
|
+
# Scrapes Schema.org Microdata items embedded directly in HTML markup.
|
|
7
|
+
class Microdata
|
|
8
|
+
include Enumerable
|
|
9
|
+
|
|
10
|
+
# Selector matching nodes that define a microdata item scope.
|
|
11
|
+
ITEM_SELECTOR = '[itemscope][itemtype]'
|
|
12
|
+
# Schema.org types supported for article extraction via Microdata.
|
|
13
|
+
SUPPORTED_TYPES = (Schema::Thing::SUPPORTED_TYPES | Set['Product']).freeze
|
|
14
|
+
# Attribute names checked first for microdata property values.
|
|
15
|
+
VALUE_ATTRIBUTES = %w[content datetime href src data value].freeze
|
|
16
|
+
|
|
17
|
+
# Key under which this scraper is addressed in the scraper configuration.
#
# @return [Symbol] always +:microdata+
def self.options_key
  :microdata
end
|
|
19
|
+
|
|
20
|
+
class << self
  # Tells whether the document contains at least one supported top-level item.
  #
  # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
  # @return [Boolean] false when +parsed_body+ is nil or no supported root exists
  def articles?(parsed_body) = supported_roots(parsed_body).any?

  # Collects every node matching ITEM_SELECTOR that passes {supported_root?}.
  #
  # @param parsed_body [Nokogiri::HTML::Document, nil] parsed HTML document
  # @return [Array<Nokogiri::XML::Element>] top-level supported Microdata roots
  def supported_roots(parsed_body)
    return [] unless parsed_body

    parsed_body.css(ITEM_SELECTOR).select { |node| supported_root?(node) }
  end

  # A root must carry a supported type and must not be nested as a property.
  #
  # @param node [Nokogiri::XML::Element] itemscope candidate node
  # @return [Boolean, nil] nil when the node has no supported type
  def supported_root?(node)
    return unless supported_type_name(node)

    top_level_item?(node)
  end

  # @param node [Nokogiri::XML::Element] itemscope candidate node
  # @return [String, nil] first type name of the node found in SUPPORTED_TYPES
  def supported_type_name(node)
    normalized_types(node['itemtype']).find { |type| SUPPORTED_TYPES.include?(type) }
  end

  # Reduces each whitespace-separated itemtype token to the text after its
  # last "/" and last "#", dropping tokens that reduce to an empty string.
  #
  # @param itemtype [String, nil] raw itemtype attribute value
  # @return [Array<String>] normalized schema type names
  def normalized_types(itemtype)
    itemtype.to_s.split
            .map { |token| token.split('/').last.to_s.split('#').last.to_s }
            .reject(&:empty?)
  end

  # A node is top-level when it is not itself an itemprop and none of its
  # ancestors is an item that is in turn a property (itemscope + itemprop).
  #
  # @param node [Nokogiri::XML::Element] itemscope candidate node
  # @return [Boolean]
  def top_level_item?(node)
    !node.attribute('itemprop') &&
      node.ancestors.none? { |parent| parent.attribute('itemscope') && parent.attribute('itemprop') }
  end
end
|
|
61
|
+
|
|
62
|
+
##
# Creates a Microdata scraper around an already parsed response body.
#
# Nothing is extracted here; the arguments are only stored for {#each}.
#
# @param parsed_body [Nokogiri::HTML5::Document, Nokogiri::HTML4::Document, Nokogiri::XML::Node, nil]
#   parsed response body that is later searched for top-level Microdata items.
# @param url [Html2rss::Url] absolute page URL, handed on when articles are built.
# @param _opts [Hash] scraper-specific options; accepted but ignored.
# @return [void]
def initialize(parsed_body, url:, **_opts)
  @parsed_body = parsed_body
  @url = url
end
|
|
75
|
+
|
|
76
|
+
##
# Yields one normalized article hash per supported Microdata root.
# Roots for which no valid article can be built are skipped.
#
# @yieldparam article [Hash{Symbol => Object}] the normalized article attributes.
# @return [Enumerator, void] an enumerator when no block is given.
def each
  return to_enum(:each) unless block_given?

  self.class.supported_roots(parsed_body).each do |root|
    article_from(root)&.then { |article| yield article }
  end
end
|
|
89
|
+
|
|
90
|
+
private
|
|
91
|
+
|
|
92
|
+
attr_reader :parsed_body, :url
|
|
93
|
+
|
|
94
|
+
# Builds the schema-like object for a root, converts it through
# Schema::Thing and keeps the result only when it passes {#valid_article?}.
#
# @param root [Nokogiri::XML::Element] supported Microdata root node
# @return [Hash{Symbol => Object}, nil] normalized article hash, or nil when
#   no schema object could be built or the article is not valid
def article_from(root)
  return unless (schema_object = SchemaObjectBuilder.call(root))

  candidate = Schema::Thing.new(schema_object, url:).call.compact
  candidate if valid_article?(candidate)
end
|
|
105
|
+
|
|
106
|
+
# An article is usable when it has a URL plus at least a title or a description.
#
# @param article [Hash{Symbol => Object}] normalized article hash
# @return [Boolean] whether article contains required fields (always true/false,
#   never the underlying title or description value)
def valid_article?(article)
  return false unless article[:url]

  # `any?` collapses the truthy String / nil into a real boolean.
  [article[:title], article[:description]].any?
end
|
|
113
|
+
|
|
114
|
+
# Extracts direct Microdata itemprop values for a single item root.
#
# The result maps property names to a String, a nested-item Hash, or an
# Array of those when the same property name occurs more than once.
module ItemParser
  module_function

  # @param root [Nokogiri::XML::Element] microdata root node
  # @return [Hash{Symbol => Object}] extracted direct properties
  def call(root)
    direct_properties(root).each_with_object({}) do |node, properties|
      append_properties!(properties, node)
    end
  end

  # Stores the node's value under every name listed in its itemprop attribute.
  #
  # @param properties [Hash{Symbol => Object}] accumulator hash for parsed properties
  # @param node [Nokogiri::XML::Element] itemprop node
  # @return [void]
  def append_properties!(properties, node)
    value = property_value(node)
    return if blank_value?(value)

    property_names(node).each { |name| append(properties, name.to_sym, value) }
  end

  # @param root [Nokogiri::XML::Element] microdata root node
  # @return [Array<Nokogiri::XML::Element>] direct property nodes for the root
  def direct_properties(root)
    root.css('[itemprop]').select { |node| direct_property?(root, node) }
  end

  # A property is direct when no other itemscope sits between it and the root.
  #
  # @param root [Nokogiri::XML::Element] microdata root node
  # @param node [Nokogiri::XML::Element] candidate itemprop node
  # @return [Boolean] whether the node belongs directly to the current root item
  def direct_property?(root, node)
    return false if node == root

    node.ancestors.take_while { _1 != root }.none? { |ancestor| ancestor.attribute('itemscope') }
  end

  # @param node [Nokogiri::XML::Element] itemprop node
  # @return [Array<String>] whitespace-separated property names
  def property_names(node)
    # String#split without arguments already strips and drops empty tokens.
    node['itemprop'].to_s.split
  end

  # Nested items are parsed recursively; other nodes use the first present
  # value attribute and fall back to their text content.
  #
  # @param node [Nokogiri::XML::Element] itemprop node
  # @return [Object, nil] parsed property value, nil when blank
  def property_value(node)
    value = if node.attribute('itemscope')
              nested_item(node)
            else
              attribute_value(node) || text_value(node)
            end

    value unless blank_value?(value)
  end

  # @param node [Nokogiri::XML::Element] nested itemscope node
  # @return [Hash{Symbol => Object}] nested parsed microdata item, with
  #   +:@type+ / +:@id+ added only when the node provides non-blank values
  def nested_item(node)
    item = call(node)
    # Skip the key entirely for a missing/empty itemtype so that an item
    # without any data stays empty and is treated as blank by the caller.
    type = Microdata.normalized_types(node['itemtype']).first
    item[:@type] = type if type
    itemid = node['itemid']
    item[:@id] = itemid if present?(itemid)
    item
  end

  # @param node [Nokogiri::XML::Element] itemprop node
  # @return [String, nil] first present attribute value, in VALUE_ATTRIBUTES order
  def attribute_value(node)
    VALUE_ATTRIBUTES.each do |attribute|
      value = node[attribute]
      return value if present?(value)
    end

    nil
  end

  # @param node [Nokogiri::XML::Element] itemprop node
  # @return [String, nil] stripped text content, nil when empty
  def text_value(node)
    value = node.text.to_s.strip
    value unless value.empty?
  end

  # Adds a value for a key, switching to an Array once the key repeats.
  #
  # @param properties [Hash{Symbol => Object}] accumulator hash for parsed properties
  # @param key [Symbol] target property key
  # @param value [Object] parsed property value to assign for the key
  # @return [void]
  def append(properties, key, value)
    return if blank_value?(value)

    unless properties.key?(key)
      properties[key] = value
      return
    end

    existing = properties[key]
    # Not Kernel#Array: it would explode a nested-item Hash into [key, value] pairs.
    properties[key] = (existing.is_a?(Array) ? existing : [existing]) << value
  end

  # @param value [Object] candidate value
  # @return [Boolean] whether value is blank for microdata extraction purposes
  def blank_value?(value)
    case value
    when nil then true
    when String then value.strip.empty?
    when Array, Hash then value.empty?
    else false
    end
  end

  # @param value [Object] candidate value
  # @return [Boolean] whether value is present for microdata extraction purposes
  def present?(value)
    !blank_value?(value)
  end
end
|
|
235
|
+
private_constant :ItemParser
|
|
236
|
+
|
|
237
|
+
# Shared value normalization helpers for Microdata property conversion.
#
# Every helper accepts raw property values: a scalar, a nested-item Hash,
# or an Array of such values. Single-value helpers (url, image, first_string)
# look at the first Array element only; multi-value helpers (about,
# string_or_array, array_value) keep every element.
module ValueNormalizer
  module_function

  # @param values [Array<Object>] value candidates, probed in order
  # @return [String, nil] first URL-like value converted to string
  def url_value(*values)
    values.each do |value|
      candidate = extract_nested_value(value, :url, :@id)
      return candidate.to_s if present?(candidate)
    end

    nil
  end

  # @param values [Array<Object>] value candidates, probed in order
  # @return [String, Hash, nil] first normalized image candidate
  def image_value(*values)
    values.each do |value|
      candidate = normalize_image(value)
      return candidate if present?(candidate)
    end

    nil
  end

  # @param value [Object] image candidate value
  # @return [String, Hash, nil] normalized image-like value
  def normalize_image(value)
    candidate = unwrap(value)
    return unless present?(candidate)

    return candidate if candidate.is_a?(String) || candidate.is_a?(Hash)

    candidate.to_s
  end

  # Normalizes one or many +about+ values. All Array elements are kept;
  # unwrapping to the first element here would drop every further topic.
  #
  # @param value [Object] about candidate value (scalar, Hash or Array)
  # @return [Array<String, Hash>, nil] normalized about values
  def normalize_about(value)
    values = [value].flatten.filter_map { normalize_about_item(_1) }
    values unless values.empty?
  end

  # @param item [Object] single about item
  # @return [String, Hash, nil] the String itself, or +{ name: String }+ for
  #   a Hash with a usable (non-blank, scalar) name; nil otherwise
  def normalize_about_item(item)
    case item
    when Hash
      name = first_string(item[:name])
      { name: name } if name
    when String then item
    end
  end

  # Returns a scalar for scalar input and for Arrays that normalize to a
  # single string; Arrays with several usable entries are returned as Array.
  #
  # @param value [Object] scalar or array candidate
  # @return [String, Array<String>, nil] normalized scalar or string array
  def string_or_array(value)
    return stringify(value) unless value.is_a?(Array)

    result = string_values(value)
    return if result.empty?

    result.size == 1 ? result.first : result
  end

  # Flattens all candidates (including every element of Array candidates)
  # into one list of unique strings. Hashes and blanks are ignored.
  #
  # @param values [Array<Object>] value candidates
  # @return [Array<String>, nil] normalized unique string values
  def array_value(*values)
    result = values.flat_map { string_values([_1].flatten) }.uniq
    result unless result.empty?
  end

  # @param values [Array<Object>] candidate scalar values collected from microdata arrays
  # @return [Array<String>] normalized string values
  def string_values(values)
    values.filter_map { stringify(_1) }
  end

  # @param values [Array<Object>] value candidates, probed in order
  # @return [String, nil] first present string-like value
  def first_string(*values)
    values.each do |value|
      candidate = stringify(unwrap(value))
      return candidate if present?(candidate)
    end

    nil
  end

  # Non-Hash candidates are returned unchanged; for a Hash the first present
  # value among +keys+ is returned.
  #
  # @param value [Object] nested container or scalar
  # @param keys [Array<Symbol>] nested keys to probe in order
  # @return [Object, nil] first matching nested value
  def extract_nested_value(value, *keys)
    candidate = unwrap(value)
    return candidate unless candidate.is_a?(Hash)

    keys.each do |key|
      nested_value = candidate[key]
      return nested_value if present?(nested_value)
    end

    nil
  end

  # @param value [Object] scalar or array candidate
  # @return [Object] first array element or the original value
  def unwrap(value)
    value.is_a?(Array) ? value.first : value
  end

  # @param value [Object] scalar candidate normalized to string output
  # @return [String, nil] the String itself, +to_s+ of other scalars, nil for
  #   blank values, Hashes and Arrays
  def stringify(value)
    return unless present?(value)
    return value if value.is_a?(String)
    return if value.is_a?(Hash) || value.is_a?(Array)

    value.to_s
  end

  # @param value [Object] candidate value
  # @return [Boolean] whether value is present
  def present?(value)
    case value
    when nil then false
    when String then !value.strip.empty?
    when Array, Hash then !value.empty?
    else true
    end
  end
end
|
|
372
|
+
private_constant :ValueNormalizer
|
|
373
|
+
|
|
374
|
+
# Converts the raw property hash of a Microdata root into the schema-like
# hash consumed downstream. Properties are removed from the hash as they
# are consumed.
module SchemaObjectBuilder
  module_function

  extend ValueNormalizer

  # @param root [Nokogiri::XML::Element] supported microdata root node
  # @return [Hash{Symbol => Object}, nil] compact schema-like object, nil
  #   when the root has no supported type
  def call(root)
    type = Microdata.supported_type_name(root)
    compact_object(type, root, ItemParser.call(root)) if type
  end

  # Builds the base attributes, adds category-like fields and drops nil values.
  #
  # @param type [String] schema type inferred from itemtype
  # @param root [Nokogiri::XML::Element] supported microdata root node
  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [Hash{Symbol => Object}] normalized schema-like object
  def compact_object(type, root, properties)
    base_attributes(type, root, properties)
      .tap { |object| merge_categories!(object, properties) }
      .compact
  end

  # @param type [String] schema type inferred from itemtype
  # @param root [Nokogiri::XML::Element] supported microdata root node
  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [Hash{Symbol => Object}] base schema attributes before category merging
  def base_attributes(type, root, properties)
    identifier = first_string(root['itemid'], properties.delete(:identifier))
    text = text_attributes(properties)
    links = link_attributes(properties, identifier)
    media = media_attributes(properties)

    { '@type': type, '@id': identifier, **text, **links, **media }
  end

  # Consumes headline, title and name; the first present one wins.
  #
  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [String, nil] normalized title
  def title(properties)
    first_string(*%i[headline title name].map { |key| properties.delete(key) })
  end

  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [Hash{Symbol => Object}] normalized text attributes
  def text_attributes(properties)
    {
      title: title(properties),
      description: first_string(properties.delete(:description)),
      schema_object_body: first_string(properties.delete(:articleBody)),
      abstract: first_string(properties.delete(:abstract)),
      datePublished: published_at(properties)
    }
  end

  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @param identifier [String, nil] identifier candidate for fallback URL handling
  # @return [Hash{Symbol => Object}] normalized link attributes
  def link_attributes(properties, identifier)
    { url: url(properties, identifier) }
  end

  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [Hash{Symbol => Object}] normalized media attributes
  def media_attributes(properties)
    { image: image_value(properties.delete(:image), properties.delete(:thumbnailUrl)) }
  end

  # Candidate order: url, mainEntityOfPage, then the identifier fallback.
  #
  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @param fallback_id [String, nil] identifier candidate for fallback URL handling
  # @return [String, nil] normalized URL candidate
  def url(properties, fallback_id)
    explicit_url = properties.delete(:url)
    main_entity = properties.delete(:mainEntityOfPage)

    url_value(explicit_url, main_entity, url_fallback(fallback_id))
  end

  # @param fallback_id [String, nil] identifier candidate for fallback URL handling
  # @return [String, nil] the identifier when it starts with "/" or with
  #   "http://" / "https://"; nil otherwise
  def url_fallback(fallback_id)
    value = first_string(fallback_id)
    value if value && (value.start_with?('/') || value.match?(%r{\Ahttps?://}))
  end

  # Consumes all date properties; the first present one wins in the order
  # datePublished, dateCreated, dateModified, uploadDate.
  #
  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [String, nil] normalized published-at value
  def published_at(properties)
    first_string(*%i[datePublished dateCreated dateModified uploadDate].map { |key| properties.delete(key) })
  end

  # Adds categories, keywords, tags and about to the object when present.
  #
  # @param object [Hash{Symbol => Object}] schema-like output object
  # @param properties [Hash{Symbol => Object}] parsed microdata properties
  # @return [void]
  def merge_categories!(object, properties)
    sections = array_value(properties.delete(:categories), properties.delete(:articleSection))
    assign_if_present(object, :categories, sections)
    assign_if_present(object, :keywords, string_or_array(properties.delete(:keywords)))
    assign_if_present(object, :tags, string_or_array(properties.delete(:tags)))
    assign_if_present(object, :about, normalize_about(properties.delete(:about)))
  end

  # @param object [Hash{Symbol => Object}] schema-like output object
  # @param key [Symbol] target attribute key
  # @param value [Object] value to assign when present
  # @return [void]
  def assign_if_present(object, key, value)
    return unless value

    object[key] = value
  end
end
|
|
501
|
+
private_constant :SchemaObjectBuilder
|
|
502
|
+
end
|
|
503
|
+
end
|
|
504
|
+
end
|
|
505
|
+
end
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Html2rss
|
|
4
|
+
class AutoSource
|
|
5
|
+
module Scraper
|
|
6
|
+
class Schema
|
|
7
|
+
##
# Extracts categories from Schema.org structured data.
#
# Sources are the +keywords+, +categories+ and +tags+ fields plus +about+.
# Every category is stripped; blank entries are dropped and duplicates
# are collapsed.
module CategoryExtractor
  ##
  # Extracts categories from a schema object.
  #
  # @param schema_object [Hash] The schema object (symbol keys)
  # @return [Array<String>] Array of unique category strings
  def self.call(schema_object)
    (extract_field_categories(schema_object) | extract_about_categories(schema_object)).to_a
  end

  ##
  # Extracts categories from keywords, categories, and tags fields.
  #
  # @param schema_object [Hash] The schema object
  # @return [Set<String>] Set of category strings
  def self.extract_field_categories(schema_object)
    %w[keywords categories tags].each_with_object(Set.new) do |field, categories|
      categories.merge(extract_field_value(schema_object, field))
    end
  end

  ##
  # Extracts categories from the about field.
  #
  # Accepts an Array of items, a single item Hash (treated like a
  # one-element Array) or a delimited String.
  #
  # @param schema_object [Hash] The schema object
  # @return [Set<String>] Set of category strings
  def self.extract_about_categories(schema_object)
    case (about = schema_object[:about])
    when Array then extract_about_array(about)
    when Hash then extract_about_array([about])
    when String then extract_string_categories(about)
    else Set.new
    end
  end

  ##
  # Extracts categories from a single field value.
  #
  # @param schema_object [Hash] The schema object
  # @param field [String] The field name
  # @return [Set<String>] Set of category strings
  def self.extract_field_value(schema_object, field)
    case (value = schema_object[field.to_sym])
    when Array then Set.new(value.filter_map { |entry| normalize_category(entry) })
    when String then extract_string_categories(value)
    else Set.new
    end
  end

  ##
  # Extracts categories from an about array. Hash items contribute their
  # +:name+, String items contribute themselves; anything else is skipped.
  #
  # @param about [Array] The about array
  # @return [Set<String>] Set of category strings
  def self.extract_about_array(about)
    about.each_with_object(Set.new) do |item, categories|
      candidate = case item
                  when Hash then item[:name]
                  when String then item
                  end
      category = normalize_category(candidate)
      categories.add(category) if category
    end
  end

  ##
  # Extracts categories from a string by splitting on separators.
  #
  # @param string [String] source string that may contain category delimiters (, ; |)
  # @return [Set<String>] Set of category strings
  def self.extract_string_categories(string)
    Set.new(string.split(/[,;|]/).filter_map { |part| normalize_category(part) })
  end

  ##
  # @param value [Object] raw category candidate
  # @return [String, nil] stripped string form, nil when it is empty
  def self.normalize_category(value)
    text = value.to_s.strip
    text unless text.empty?
  end
  private_class_method :normalize_category
end
|
|
99
|
+
end
|
|
100
|
+
end
|
|
101
|
+
end
|
|
102
|
+
end
|
|
@@ -11,18 +11,19 @@ module Html2rss
|
|
|
11
11
|
#
|
|
12
12
|
# @see https://schema.org/ItemList
|
|
13
13
|
class ItemList < Thing
|
|
14
|
+
# Schema.org type names handled by the ItemList extractor.
|
|
14
15
|
SUPPORTED_TYPES = Set['ItemList']
|
|
15
16
|
|
|
16
17
|
# @return [Array<Hash>] the scraped article hashes with DEFAULT_ATTRIBUTES
|
|
17
18
|
def call
|
|
18
19
|
hashes = [super]
|
|
19
20
|
|
|
20
|
-
return hashes
|
|
21
|
+
return hashes unless (elements = @schema_object[:itemListElement])
|
|
21
22
|
|
|
22
23
|
elements = [elements] unless elements.is_a?(Array)
|
|
23
24
|
|
|
24
25
|
elements.each do |schema_object|
|
|
25
|
-
hashes << ListItem.new(schema_object, url:
|
|
26
|
+
hashes << ListItem.new(schema_object, url: base_url || '').call
|
|
26
27
|
end
|
|
27
28
|
|
|
28
29
|
hashes
|
|
@@ -5,18 +5,20 @@ module Html2rss
|
|
|
5
5
|
module Scraper
|
|
6
6
|
class Schema
|
|
7
7
|
##
|
|
8
|
-
#
|
|
9
8
|
# @see https://schema.org/ListItem
|
|
10
9
|
class ListItem < Thing
|
|
10
|
+
# @return [String, nil] stable list-item identifier
|
|
11
11
|
def id = (id = (schema_object.dig(:item, :@id) || super).to_s).empty? ? nil : id
|
|
12
|
-
|
|
12
|
+
# @return [String, nil] list-item title
|
|
13
|
+
def title = schema_object.dig(:item, :name) || super || url&.titleized
|
|
14
|
+
# @return [String, nil] list-item description
|
|
13
15
|
def description = schema_object.dig(:item, :description) || super
|
|
14
16
|
|
|
15
|
-
# @return [
|
|
17
|
+
# @return [Html2rss::Url, nil]
|
|
16
18
|
def url
|
|
17
19
|
url = schema_object.dig(:item, :url) || super
|
|
18
20
|
|
|
19
|
-
|
|
21
|
+
Url.from_relative(url, base_url || url) if url
|
|
20
22
|
end
|
|
21
23
|
end
|
|
22
24
|
end
|