html2rss 0.22.0 → 0.22.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/html2rss/auto_source/scraper/html/class_clustering.rb +14 -3
- data/lib/html2rss/auto_source/scraper/microdata.rb +14 -2
- data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +1 -1
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +28 -20
- data/lib/html2rss/auto_source.rb +3 -1
- data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +4 -18
- data/lib/html2rss/html_extractor/semantic_containers.rb +28 -3
- data/lib/html2rss/html_extractor.rb +36 -17
- data/lib/html2rss/html_navigator.rb +17 -0
- data/lib/html2rss/selectors.rb +8 -3
- data/lib/html2rss/version.rb +1 -1
- data/schema/html2rss-config.schema.json +8 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 750b7fb967b328cef2238b66729cafe122d1bae23bee05fd8504bb31e760b8a7
|
|
4
|
+
data.tar.gz: 327406de9c7c97ea13e90c89bec1c2653c962bbcaccfd29ddb78b282477f7578
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5f41e00edfdd19ceb012900db7518f28236b417662dc4f11f45d9c498ede0720bdbbc0fb31443d80495eaf6076df646062fdf7f972b27bd71c50cc4e198b4540
|
|
7
|
+
data.tar.gz: 7a7eff85bd7f98cd872131041aa58faf9d3fba1aff47893a6d286378b6f52904ee11120f053e3ca8beca3060d86d1438cf2037e3e4e7e1586d3d3c4679b026f0
|
|
@@ -10,7 +10,7 @@ module Html2rss
|
|
|
10
10
|
# rubocop:disable Metrics/ClassLength
|
|
11
11
|
class ClassClustering
|
|
12
12
|
# Node tags considered layout containers
|
|
13
|
-
LAYOUT_TAG_NAMES = Set['div', 'section', 'article'].freeze
|
|
13
|
+
LAYOUT_TAG_NAMES = Set['div', 'section', 'article', 'li', 'ul', 'ol'].freeze
|
|
14
14
|
# HTML/layout tags excluded from candidate nodes
|
|
15
15
|
EXCLUDED_TAGS = Set['html', 'body', 'nav', 'footer', 'header', 'svg', 'script', 'style'].freeze
|
|
16
16
|
|
|
@@ -83,13 +83,24 @@ module Html2rss
|
|
|
83
83
|
end
|
|
84
84
|
end
|
|
85
85
|
|
|
86
|
+
# rubocop:disable Metrics/MethodLength
|
|
86
87
|
def container_of?(nodes_a, nodes_b)
|
|
87
88
|
return false unless LAYOUT_TAG_NAMES.include?(nodes_b.first.name)
|
|
88
89
|
|
|
89
90
|
nodes_a.any? do |node_a|
|
|
90
|
-
|
|
91
|
+
count = 0
|
|
92
|
+
nodes_b.each do |node_b|
|
|
93
|
+
next if node_a == node_b
|
|
94
|
+
|
|
95
|
+
if HtmlNavigator.descendant_of?(node_b, node_a)
|
|
96
|
+
count += 1
|
|
97
|
+
break if count > 1
|
|
98
|
+
end
|
|
99
|
+
end
|
|
100
|
+
count > 1
|
|
91
101
|
end
|
|
92
102
|
end
|
|
103
|
+
# rubocop:enable Metrics/MethodLength
|
|
93
104
|
|
|
94
105
|
# If group A contains group B, and they have the same size:
|
|
95
106
|
# - If B (the descendant) contains >= 80% of A's words, AND B's tag is div/section/article,
|
|
@@ -112,7 +123,7 @@ module Html2rss
|
|
|
112
123
|
nodes_a = groups[cls_a]
|
|
113
124
|
nodes_b = groups[cls_b]
|
|
114
125
|
return if nodes_a.size != nodes_b.size
|
|
115
|
-
return unless nodes_a.zip(nodes_b).all? { |a, b| a != b &&
|
|
126
|
+
return unless nodes_a.zip(nodes_b).all? { |a, b| a != b && HtmlNavigator.descendant_of?(b, a) }
|
|
116
127
|
|
|
117
128
|
discarded << (keep_descendant?(nodes_a, nodes_b) ? cls_a : cls_b)
|
|
118
129
|
end
|
|
@@ -55,7 +55,13 @@ module Html2rss
|
|
|
55
55
|
def top_level_item?(node)
|
|
56
56
|
return false if node.attribute('itemprop')
|
|
57
57
|
|
|
58
|
-
|
|
58
|
+
curr = node.parent
|
|
59
|
+
while curr && !curr.document? && curr.name != 'html'
|
|
60
|
+
return false if curr.attribute('itemscope') && curr.attribute('itemprop')
|
|
61
|
+
|
|
62
|
+
curr = curr.parent
|
|
63
|
+
end
|
|
64
|
+
true
|
|
59
65
|
end
|
|
60
66
|
end
|
|
61
67
|
|
|
@@ -147,7 +153,13 @@ module Html2rss
|
|
|
147
153
|
def direct_property?(root, node)
|
|
148
154
|
return false if node == root
|
|
149
155
|
|
|
150
|
-
|
|
156
|
+
curr = node.parent
|
|
157
|
+
while curr && curr != root
|
|
158
|
+
return false if curr.attribute('itemscope')
|
|
159
|
+
|
|
160
|
+
curr = curr.parent
|
|
161
|
+
end
|
|
162
|
+
true
|
|
151
163
|
end
|
|
152
164
|
|
|
153
165
|
# @param node [Nokogiri::XML::Element] itemprop node
|
|
@@ -37,7 +37,7 @@ module Html2rss
|
|
|
37
37
|
|
|
38
38
|
@article_cache.fetch(entry) do
|
|
39
39
|
@article_cache[entry] = @extractor.new(
|
|
40
|
-
entry.container, base_url: @url, selected_anchor: entry.selected_anchor
|
|
40
|
+
entry.container, base_url: @url, selected_anchor: entry.selected_anchor, fallback_anchorless: true
|
|
41
41
|
).call
|
|
42
42
|
end
|
|
43
43
|
end
|
|
@@ -60,12 +60,13 @@ module Html2rss
|
|
|
60
60
|
# @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
|
|
61
61
|
# @param url [String, Html2rss::Url] base url
|
|
62
62
|
# @param extractor [Class] extractor class used for article extraction
|
|
63
|
-
# @param
|
|
64
|
-
# @option
|
|
65
|
-
def initialize(parsed_body, url:, extractor: HtmlExtractor, **
|
|
63
|
+
# @param opts [Hash] scraper-specific options
|
|
64
|
+
# @option opts [Boolean] :fallback_anchorless whether to extract anchorless blocks
|
|
65
|
+
def initialize(parsed_body, url:, extractor: HtmlExtractor, **opts)
|
|
66
66
|
@parsed_body = parsed_body
|
|
67
67
|
@url = url
|
|
68
68
|
@extractor = extractor
|
|
69
|
+
@fallback_anchorless = opts.fetch(:fallback_anchorless, false)
|
|
69
70
|
@link_heuristics = LinkHeuristics.new(url)
|
|
70
71
|
@anchor_selector = AnchorSelector.new(url)
|
|
71
72
|
end
|
|
@@ -107,14 +108,15 @@ module Html2rss
|
|
|
107
108
|
@anchor_selector.primary_anchor_for(container)
|
|
108
109
|
end
|
|
109
110
|
|
|
110
|
-
|
|
111
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
112
|
+
def extractable_entries
|
|
111
113
|
@extractable_entries ||= candidate_containers.filter_map do |container|
|
|
112
114
|
selected_anchor = primary_anchor_for(container)
|
|
113
115
|
|
|
114
|
-
next unless selected_anchor
|
|
116
|
+
next unless selected_anchor || @fallback_anchorless
|
|
115
117
|
|
|
116
|
-
destination_facts = normalized_destination(selected_anchor)
|
|
117
|
-
next
|
|
118
|
+
destination_facts = selected_anchor ? normalized_destination(selected_anchor) : nil
|
|
119
|
+
next if selected_anchor && !destination_facts
|
|
118
120
|
next if hard_junk_entry?(container, selected_anchor, destination_facts)
|
|
119
121
|
|
|
120
122
|
quality = quality_score(container, selected_anchor, destination_facts)
|
|
@@ -132,6 +134,7 @@ module Html2rss
|
|
|
132
134
|
)
|
|
133
135
|
end
|
|
134
136
|
end
|
|
137
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
135
138
|
|
|
136
139
|
# rubocop:disable Metrics/MethodLength
|
|
137
140
|
def ranked_entries
|
|
@@ -177,8 +180,8 @@ module Html2rss
|
|
|
177
180
|
|
|
178
181
|
score += 40 if words >= 3
|
|
179
182
|
score += 15 if words >= 7
|
|
180
|
-
score += 20 if destination_facts
|
|
181
|
-
score += 15 if destination_facts
|
|
183
|
+
score += 20 if destination_facts&.url&.path.to_s.length > 6
|
|
184
|
+
score += 15 if destination_facts&.content_path
|
|
182
185
|
score += 15 if publish_marker?(container)
|
|
183
186
|
score += 10 if descriptive_context?(container_text, title)
|
|
184
187
|
score += 10 if article_container?(container)
|
|
@@ -190,12 +193,12 @@ module Html2rss
|
|
|
190
193
|
title = entry_title(container, selected_anchor)
|
|
191
194
|
utility_text = @link_heuristics.utility_prefix_text?(title)
|
|
192
195
|
recommended_text = @link_heuristics.recommended_text?(title)
|
|
193
|
-
content_signal = destination_facts
|
|
196
|
+
content_signal = destination_facts&.content_path
|
|
194
197
|
no_content_signal = !content_signal
|
|
195
198
|
non_content_utility_path =
|
|
196
|
-
destination_facts
|
|
199
|
+
destination_facts&.utility_path &&
|
|
197
200
|
no_content_signal &&
|
|
198
|
-
!destination_facts
|
|
201
|
+
!destination_facts&.strong_post_suffix
|
|
199
202
|
publish_signal = publish_marker?(container)
|
|
200
203
|
descriptive_signal = descriptive_context?(visible_text(container), title)
|
|
201
204
|
weak_container = !publish_signal && !descriptive_signal
|
|
@@ -203,19 +206,20 @@ module Html2rss
|
|
|
203
206
|
|
|
204
207
|
score += 25 if non_content_utility_path
|
|
205
208
|
score += 15 if utility_text && word_count(title) <= 6
|
|
206
|
-
score += 10 if destination_facts
|
|
209
|
+
score += 10 if destination_facts&.shallow
|
|
207
210
|
score += 10 if weak_container
|
|
208
211
|
score += 10 if recommended_text && no_content_signal
|
|
209
|
-
score += 5 if destination_facts
|
|
212
|
+
score += 5 if destination_facts&.high_confidence_junk_path
|
|
210
213
|
score += 15 if junk_tokens?(container_tokens(container))
|
|
211
214
|
score
|
|
212
215
|
end
|
|
213
216
|
|
|
214
|
-
|
|
217
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
218
|
+
def hard_junk_entry?(container, selected_anchor, destination_facts)
|
|
215
219
|
title = entry_title(container, selected_anchor)
|
|
216
220
|
publish_signal = publish_marker?(container)
|
|
217
221
|
descriptive_signal = descriptive_context?(visible_text(container), title)
|
|
218
|
-
content_signal = destination_facts
|
|
222
|
+
content_signal = destination_facts&.content_path
|
|
219
223
|
weak_article_candidate = article_signal_count(
|
|
220
224
|
container,
|
|
221
225
|
publish_signal:,
|
|
@@ -223,12 +227,16 @@ module Html2rss
|
|
|
223
227
|
content_signal:
|
|
224
228
|
) < 2
|
|
225
229
|
|
|
226
|
-
destination_facts
|
|
227
|
-
(
|
|
228
|
-
|
|
229
|
-
destination_facts
|
|
230
|
+
destination_facts&.high_confidence_junk_path ||
|
|
231
|
+
(selected_anchor &&
|
|
232
|
+
@link_heuristics.recommended_text?(title) &&
|
|
233
|
+
destination_facts&.shallow &&
|
|
234
|
+
weak_article_candidate) ||
|
|
235
|
+
(selected_anchor && @link_heuristics.utility_prefix_text?(title) &&
|
|
236
|
+
destination_facts&.high_confidence_utility_destination &&
|
|
230
237
|
weak_article_candidate)
|
|
231
238
|
end
|
|
239
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
232
240
|
|
|
233
241
|
##
|
|
234
242
|
# @param container [Nokogiri::XML::Node]
|
data/lib/html2rss/auto_source.rb
CHANGED
|
@@ -32,7 +32,8 @@ module Html2rss
|
|
|
32
32
|
enabled: true
|
|
33
33
|
},
|
|
34
34
|
semantic_html: {
|
|
35
|
-
enabled: true
|
|
35
|
+
enabled: true,
|
|
36
|
+
fallback_anchorless: true
|
|
36
37
|
},
|
|
37
38
|
html: {
|
|
38
39
|
enabled: true,
|
|
@@ -59,6 +60,7 @@ module Html2rss
|
|
|
59
60
|
end
|
|
60
61
|
optional(:semantic_html).hash do
|
|
61
62
|
optional(:enabled).filled(:bool)
|
|
63
|
+
optional(:fallback_anchorless).filled(:bool)
|
|
62
64
|
end
|
|
63
65
|
optional(:html).hash do
|
|
64
66
|
optional(:enabled).filled(:bool)
|
|
@@ -138,17 +138,7 @@ module Html2rss
|
|
|
138
138
|
# @return [Boolean] true when the anchor is inside the selected heading
|
|
139
139
|
def heading_anchor?
|
|
140
140
|
heading = @context.heading
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
curr = @anchor
|
|
144
|
-
container = @context.container
|
|
145
|
-
while curr.respond_to?(:parent)
|
|
146
|
-
return true if curr == heading
|
|
147
|
-
break if curr == container
|
|
148
|
-
|
|
149
|
-
curr = curr.parent
|
|
150
|
-
end
|
|
151
|
-
false
|
|
141
|
+
heading && (@anchor == heading || HtmlNavigator.descendant_of?(@anchor, heading))
|
|
152
142
|
end
|
|
153
143
|
|
|
154
144
|
# @return [Boolean] true when anchor text exactly matches heading text
|
|
@@ -183,15 +173,11 @@ module Html2rss
|
|
|
183
173
|
end
|
|
184
174
|
|
|
185
175
|
def utility_landmark_ancestor?
|
|
186
|
-
curr = @anchor.parent
|
|
187
176
|
container = @context.container
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
break if curr == container
|
|
177
|
+
condition = proc { |node| node == container || Context::UTILITY_LANDMARK_TAGS.include?(node.name) }
|
|
178
|
+
landmark = HtmlNavigator.parent_until_condition(@anchor.parent, condition)
|
|
191
179
|
|
|
192
|
-
|
|
193
|
-
end
|
|
194
|
-
false
|
|
180
|
+
landmark && landmark != container
|
|
195
181
|
end
|
|
196
182
|
|
|
197
183
|
def icon_only_anchor?
|
|
@@ -32,9 +32,34 @@ module Html2rss
|
|
|
32
32
|
HtmlExtractor.ignored_container_path?(node, cache)
|
|
33
33
|
end
|
|
34
34
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
35
|
+
candidates = filter_nested_containers(candidates)
|
|
36
|
+
sort_by_depth(candidates)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
private
|
|
40
|
+
|
|
41
|
+
def filter_nested_containers(candidates)
|
|
42
|
+
candidate_set = Set.new(candidates)
|
|
43
|
+
rejected = Set.new
|
|
44
|
+
|
|
45
|
+
candidates.each do |candidate_b|
|
|
46
|
+
next if candidate_b.name == 'div'
|
|
47
|
+
|
|
48
|
+
find_and_reject_ancestors(candidate_b, candidate_set, rejected)
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
candidates.reject { |c| rejected.include?(c) }
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def find_and_reject_ancestors(node, candidate_set, rejected)
|
|
55
|
+
curr = node.parent
|
|
56
|
+
while curr && !curr.document? && curr.name != 'html'
|
|
57
|
+
rejected << curr if candidate_set.include?(curr)
|
|
58
|
+
curr = curr.parent
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
|
|
62
|
+
def sort_by_depth(candidates)
|
|
38
63
|
candidates.each_with_index
|
|
39
64
|
.sort_by { |node, index| [-node.ancestors.size, index] }
|
|
40
65
|
.map!(&:first)
|
|
@@ -47,25 +47,33 @@ module Html2rss
|
|
|
47
47
|
# @param node [Nokogiri::XML::Node]
|
|
48
48
|
# @param cache [Hash, nil] identity cache used to store results (must use compare_by_identity)
|
|
49
49
|
# @return [Boolean] true when the node belongs to ignored DOM chrome
|
|
50
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
50
51
|
def ignored_container_path?(node, cache = nil)
|
|
51
52
|
return cache[node] if cache&.key?(node)
|
|
52
53
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
end
|
|
54
|
+
curr = node
|
|
55
|
+
visited = []
|
|
56
|
+
is_ignored = false
|
|
57
57
|
|
|
58
|
-
|
|
58
|
+
while curr.respond_to?(:parent) && curr
|
|
59
|
+
if cache&.key?(curr)
|
|
60
|
+
is_ignored = cache[curr]
|
|
61
|
+
break
|
|
62
|
+
end
|
|
59
63
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
+
if IGNORED_CONTAINER_TAGS.include?(curr.name)
|
|
65
|
+
is_ignored = true
|
|
66
|
+
break
|
|
67
|
+
end
|
|
64
68
|
|
|
69
|
+
visited << curr
|
|
65
70
|
curr = curr.parent
|
|
66
71
|
end
|
|
67
|
-
|
|
72
|
+
visited.each { |n| cache[n] = is_ignored } if cache
|
|
73
|
+
|
|
74
|
+
is_ignored
|
|
68
75
|
end
|
|
76
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
|
|
69
77
|
end
|
|
70
78
|
|
|
71
79
|
##
|
|
@@ -119,14 +127,16 @@ module Html2rss
|
|
|
119
127
|
Url.from_relative("##{id}", base_url) if id
|
|
120
128
|
end
|
|
121
129
|
|
|
130
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
122
131
|
def extract_title
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
132
|
+
source = heading || selected_anchor
|
|
133
|
+
title_text = source ? self.class.extract_visible_text(source) : fallback_anchorless_title
|
|
134
|
+
return unless title_text
|
|
135
|
+
|
|
136
|
+
kicker = kicker_node ? self.class.extract_visible_text(kicker_node).to_s.strip : nil
|
|
137
|
+
kicker && !kicker.empty? && !title_text.include?(kicker) ? "#{kicker}: #{title_text}" : title_text
|
|
129
138
|
end
|
|
139
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
130
140
|
|
|
131
141
|
def fallback_anchorless_title
|
|
132
142
|
return unless @fallback_anchorless && selected_anchor.nil?
|
|
@@ -143,8 +153,17 @@ module Html2rss
|
|
|
143
153
|
)
|
|
144
154
|
end
|
|
145
155
|
|
|
156
|
+
def kicker_node
|
|
157
|
+
@kicker_node ||= begin
|
|
158
|
+
selector = '[data-tb-kicker], [class*="kicker"], [class*="eyebrow"], ' \
|
|
159
|
+
'[class*="pre-title"], [class*="pretitle"], [class*="overline"]'
|
|
160
|
+
node = article_tag.at_css(selector)
|
|
161
|
+
node && heading && (node == heading || HtmlNavigator.descendant_of?(node, heading)) ? nil : node
|
|
162
|
+
end
|
|
163
|
+
end
|
|
164
|
+
|
|
146
165
|
def extract_description
|
|
147
|
-
exclude = [heading, selected_anchor].compact.to_set
|
|
166
|
+
exclude = [heading, selected_anchor, kicker_node].compact.to_set
|
|
148
167
|
description = self.class.extract_visible_text(article_tag, exclude_nodes: exclude)
|
|
149
168
|
return if description.nil?
|
|
150
169
|
|
|
@@ -49,6 +49,23 @@ module Html2rss
|
|
|
49
49
|
|
|
50
50
|
current_tag.ancestors(tag_name).first
|
|
51
51
|
end
|
|
52
|
+
|
|
53
|
+
##
|
|
54
|
+
# Returns true if child_node is a descendant of parent_node.
|
|
55
|
+
# Walks up using parent pointers to avoid NodeSet allocations.
|
|
56
|
+
#
|
|
57
|
+
# @param child_node [Nokogiri::XML::Node] potential descendant
|
|
58
|
+
# @param parent_node [Nokogiri::XML::Node] potential ancestor
|
|
59
|
+
# @return [Boolean] true when child_node is a descendant of parent_node
|
|
60
|
+
def descendant_of?(child_node, parent_node)
|
|
61
|
+
curr = child_node.respond_to?(:parent) ? child_node.parent : nil
|
|
62
|
+
while curr
|
|
63
|
+
return true if curr == parent_node
|
|
64
|
+
|
|
65
|
+
curr = curr.respond_to?(:parent) ? curr.parent : nil
|
|
66
|
+
end
|
|
67
|
+
false
|
|
68
|
+
end
|
|
52
69
|
end
|
|
53
70
|
end
|
|
54
71
|
end
|
data/lib/html2rss/selectors.rb
CHANGED
|
@@ -103,11 +103,15 @@ module Html2rss
|
|
|
103
103
|
# @param article_tag [Nokogiri::XML::Element] HTML element to extract additional info from.
|
|
104
104
|
# @param base_url [String, Html2rss::Url] base URL for normalization during enhancement
|
|
105
105
|
# @return [Hash] The enhanced article hash.
|
|
106
|
+
# rubocop:disable Metrics/MethodLength
|
|
106
107
|
def enhance_article_hash(article_hash, article_tag, base_url = @url)
|
|
107
108
|
selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
109
|
+
extracted = HtmlExtractor.new(
|
|
110
|
+
article_tag,
|
|
111
|
+
base_url:,
|
|
112
|
+
selected_anchor:,
|
|
113
|
+
fallback_anchorless: true
|
|
114
|
+
).call
|
|
111
115
|
return article_hash unless extracted
|
|
112
116
|
|
|
113
117
|
extracted.each_with_object(article_hash) do |(key, value), hash|
|
|
@@ -116,6 +120,7 @@ module Html2rss
|
|
|
116
120
|
hash[key] = value
|
|
117
121
|
end
|
|
118
122
|
end
|
|
123
|
+
# rubocop:enable Metrics/MethodLength
|
|
119
124
|
|
|
120
125
|
##
|
|
121
126
|
# Selects the value for a given attribute from an HTML element.
|
data/lib/html2rss/version.rb
CHANGED
|
@@ -153,6 +153,12 @@
|
|
|
153
153
|
"not": {
|
|
154
154
|
"type": "null"
|
|
155
155
|
}
|
|
156
|
+
},
|
|
157
|
+
"fallback_anchorless": {
|
|
158
|
+
"type": "boolean",
|
|
159
|
+
"not": {
|
|
160
|
+
"type": "null"
|
|
161
|
+
}
|
|
156
162
|
}
|
|
157
163
|
},
|
|
158
164
|
"required": []
|
|
@@ -228,7 +234,8 @@
|
|
|
228
234
|
"enabled": true
|
|
229
235
|
},
|
|
230
236
|
"semantic_html": {
|
|
231
|
-
"enabled": true
|
|
237
|
+
"enabled": true,
|
|
238
|
+
"fallback_anchorless": true
|
|
232
239
|
},
|
|
233
240
|
"html": {
|
|
234
241
|
"enabled": true,
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html2rss
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.22.0
|
|
4
|
+
version: 0.22.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gil Desmarais
|
|
@@ -381,7 +381,7 @@ licenses:
|
|
|
381
381
|
- MIT
|
|
382
382
|
metadata:
|
|
383
383
|
allowed_push_host: https://rubygems.org
|
|
384
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.22.0
|
|
384
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.22.1
|
|
385
385
|
rubygems_mfa_required: 'true'
|
|
386
386
|
rdoc_options: []
|
|
387
387
|
require_paths:
|