html2rss 0.22.0 → 0.22.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9ff7cdc4e25f3abc6da000e4f672d6832fb6027e49885aecc0cad38329c5e6ae
4
- data.tar.gz: e42c216e328bb2c56971dd58871f023f15e672563e27741c56c6cf7fe4cb322a
3
+ metadata.gz: 750b7fb967b328cef2238b66729cafe122d1bae23bee05fd8504bb31e760b8a7
4
+ data.tar.gz: 327406de9c7c97ea13e90c89bec1c2653c962bbcaccfd29ddb78b282477f7578
5
5
  SHA512:
6
- metadata.gz: e52812f947561b9a52537f1b28c530f63e116194642d13ff526fac1ad32f02d7ea6ff8ca9b5ee16e2d7f686e1babec1908ca51399ba42ef9461ed3dbe0d02117
7
- data.tar.gz: 4754495a5947aca6de71846d1c88128d9fc1826e1014fd00806f58e7cd1e1575dc3bd2380ced3aeecd0cd6d51e0366ddee7a1f7db0220750e13f66645de2edde
6
+ metadata.gz: 5f41e00edfdd19ceb012900db7518f28236b417662dc4f11f45d9c498ede0720bdbbc0fb31443d80495eaf6076df646062fdf7f972b27bd71c50cc4e198b4540
7
+ data.tar.gz: 7a7eff85bd7f98cd872131041aa58faf9d3fba1aff47893a6d286378b6f52904ee11120f053e3ca8beca3060d86d1438cf2037e3e4e7e1586d3d3c4679b026f0
@@ -10,7 +10,7 @@ module Html2rss
10
10
  # rubocop:disable Metrics/ClassLength
11
11
  class ClassClustering
12
12
  # Node tags considered layout containers
13
- LAYOUT_TAG_NAMES = Set['div', 'section', 'article'].freeze
13
+ LAYOUT_TAG_NAMES = Set['div', 'section', 'article', 'li', 'ul', 'ol'].freeze
14
14
  # HTML/layout tags excluded from candidate nodes
15
15
  EXCLUDED_TAGS = Set['html', 'body', 'nav', 'footer', 'header', 'svg', 'script', 'style'].freeze
16
16
 
@@ -83,13 +83,24 @@ module Html2rss
83
83
  end
84
84
  end
85
85
 
86
+ # rubocop:disable Metrics/MethodLength
86
87
  def container_of?(nodes_a, nodes_b)
87
88
  return false unless LAYOUT_TAG_NAMES.include?(nodes_b.first.name)
88
89
 
89
90
  nodes_a.any? do |node_a|
90
- nodes_b.count { |node_b| node_a != node_b && node_b.ancestors.include?(node_a) } > 1
91
+ count = 0
92
+ nodes_b.each do |node_b|
93
+ next if node_a == node_b
94
+
95
+ if HtmlNavigator.descendant_of?(node_b, node_a)
96
+ count += 1
97
+ break if count > 1
98
+ end
99
+ end
100
+ count > 1
91
101
  end
92
102
  end
103
+ # rubocop:enable Metrics/MethodLength
93
104
 
94
105
  # If group A contains group B, and they have the same size:
95
106
  # - If B (the descendant) contains >= 80% of A's words, AND B's tag is div/section/article,
@@ -112,7 +123,7 @@ module Html2rss
112
123
  nodes_a = groups[cls_a]
113
124
  nodes_b = groups[cls_b]
114
125
  return if nodes_a.size != nodes_b.size
115
- return unless nodes_a.zip(nodes_b).all? { |a, b| a != b && b.ancestors.include?(a) }
126
+ return unless nodes_a.zip(nodes_b).all? { |a, b| a != b && HtmlNavigator.descendant_of?(b, a) }
116
127
 
117
128
  discarded << (keep_descendant?(nodes_a, nodes_b) ? cls_a : cls_b)
118
129
  end
@@ -55,7 +55,13 @@ module Html2rss
55
55
  def top_level_item?(node)
56
56
  return false if node.attribute('itemprop')
57
57
 
58
- node.ancestors.none? { |ancestor| ancestor.attribute('itemscope') && ancestor.attribute('itemprop') }
58
+ curr = node.parent
59
+ while curr && !curr.document? && curr.name != 'html'
60
+ return false if curr.attribute('itemscope') && curr.attribute('itemprop')
61
+
62
+ curr = curr.parent
63
+ end
64
+ true
59
65
  end
60
66
  end
61
67
 
@@ -147,7 +153,13 @@ module Html2rss
147
153
  def direct_property?(root, node)
148
154
  return false if node == root
149
155
 
150
- node.ancestors.take_while { _1 != root }.none? { |ancestor| ancestor.attribute('itemscope') }
156
+ curr = node.parent
157
+ while curr && curr != root
158
+ return false if curr.attribute('itemscope')
159
+
160
+ curr = curr.parent
161
+ end
162
+ true
151
163
  end
152
164
 
153
165
  # @param node [Nokogiri::XML::Element] itemprop node
@@ -37,7 +37,7 @@ module Html2rss
37
37
 
38
38
  @article_cache.fetch(entry) do
39
39
  @article_cache[entry] = @extractor.new(
40
- entry.container, base_url: @url, selected_anchor: entry.selected_anchor
40
+ entry.container, base_url: @url, selected_anchor: entry.selected_anchor, fallback_anchorless: true
41
41
  ).call
42
42
  end
43
43
  end
@@ -60,12 +60,13 @@ module Html2rss
60
60
  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
61
61
  # @param url [String, Html2rss::Url] base url
62
62
  # @param extractor [Class] extractor class used for article extraction
63
- # @param _opts [Hash] scraper-specific options
64
- # @option _opts [Object] :_reserved reserved for future scraper-specific options
65
- def initialize(parsed_body, url:, extractor: HtmlExtractor, **_opts)
63
+ # @param opts [Hash] scraper-specific options
64
+ # @option opts [Boolean] :fallback_anchorless whether to extract anchorless blocks
65
+ def initialize(parsed_body, url:, extractor: HtmlExtractor, **opts)
66
66
  @parsed_body = parsed_body
67
67
  @url = url
68
68
  @extractor = extractor
69
+ @fallback_anchorless = opts.fetch(:fallback_anchorless, false)
69
70
  @link_heuristics = LinkHeuristics.new(url)
70
71
  @anchor_selector = AnchorSelector.new(url)
71
72
  end
@@ -107,14 +108,15 @@ module Html2rss
107
108
  @anchor_selector.primary_anchor_for(container)
108
109
  end
109
110
 
110
- def extractable_entries # rubocop:disable Metrics/MethodLength
111
+ # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
112
+ def extractable_entries
111
113
  @extractable_entries ||= candidate_containers.filter_map do |container|
112
114
  selected_anchor = primary_anchor_for(container)
113
115
 
114
- next unless selected_anchor
116
+ next unless selected_anchor || @fallback_anchorless
115
117
 
116
- destination_facts = normalized_destination(selected_anchor)
117
- next unless destination_facts
118
+ destination_facts = selected_anchor ? normalized_destination(selected_anchor) : nil
119
+ next if selected_anchor && !destination_facts
118
120
  next if hard_junk_entry?(container, selected_anchor, destination_facts)
119
121
 
120
122
  quality = quality_score(container, selected_anchor, destination_facts)
@@ -132,6 +134,7 @@ module Html2rss
132
134
  )
133
135
  end
134
136
  end
137
+ # rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
135
138
 
136
139
  # rubocop:disable Metrics/MethodLength
137
140
  def ranked_entries
@@ -177,8 +180,8 @@ module Html2rss
177
180
 
178
181
  score += 40 if words >= 3
179
182
  score += 15 if words >= 7
180
- score += 20 if destination_facts.url.path.to_s.length > 6
181
- score += 15 if destination_facts.content_path
183
+ score += 20 if destination_facts&.url&.path.to_s.length > 6
184
+ score += 15 if destination_facts&.content_path
182
185
  score += 15 if publish_marker?(container)
183
186
  score += 10 if descriptive_context?(container_text, title)
184
187
  score += 10 if article_container?(container)
@@ -190,12 +193,12 @@ module Html2rss
190
193
  title = entry_title(container, selected_anchor)
191
194
  utility_text = @link_heuristics.utility_prefix_text?(title)
192
195
  recommended_text = @link_heuristics.recommended_text?(title)
193
- content_signal = destination_facts.content_path
196
+ content_signal = destination_facts&.content_path
194
197
  no_content_signal = !content_signal
195
198
  non_content_utility_path =
196
- destination_facts.utility_path &&
199
+ destination_facts&.utility_path &&
197
200
  no_content_signal &&
198
- !destination_facts.strong_post_suffix
201
+ !destination_facts&.strong_post_suffix
199
202
  publish_signal = publish_marker?(container)
200
203
  descriptive_signal = descriptive_context?(visible_text(container), title)
201
204
  weak_container = !publish_signal && !descriptive_signal
@@ -203,19 +206,20 @@ module Html2rss
203
206
 
204
207
  score += 25 if non_content_utility_path
205
208
  score += 15 if utility_text && word_count(title) <= 6
206
- score += 10 if destination_facts.shallow
209
+ score += 10 if destination_facts&.shallow
207
210
  score += 10 if weak_container
208
211
  score += 10 if recommended_text && no_content_signal
209
- score += 5 if destination_facts.high_confidence_junk_path
212
+ score += 5 if destination_facts&.high_confidence_junk_path
210
213
  score += 15 if junk_tokens?(container_tokens(container))
211
214
  score
212
215
  end
213
216
 
214
- def hard_junk_entry?(container, selected_anchor, destination_facts) # rubocop:disable Metrics/MethodLength
217
+ # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
218
+ def hard_junk_entry?(container, selected_anchor, destination_facts)
215
219
  title = entry_title(container, selected_anchor)
216
220
  publish_signal = publish_marker?(container)
217
221
  descriptive_signal = descriptive_context?(visible_text(container), title)
218
- content_signal = destination_facts.content_path
222
+ content_signal = destination_facts&.content_path
219
223
  weak_article_candidate = article_signal_count(
220
224
  container,
221
225
  publish_signal:,
@@ -223,12 +227,16 @@ module Html2rss
223
227
  content_signal:
224
228
  ) < 2
225
229
 
226
- destination_facts.high_confidence_junk_path ||
227
- (@link_heuristics.recommended_text?(title) && destination_facts.shallow && weak_article_candidate) ||
228
- (@link_heuristics.utility_prefix_text?(title) &&
229
- destination_facts.high_confidence_utility_destination &&
230
+ destination_facts&.high_confidence_junk_path ||
231
+ (selected_anchor &&
232
+ @link_heuristics.recommended_text?(title) &&
233
+ destination_facts&.shallow &&
234
+ weak_article_candidate) ||
235
+ (selected_anchor && @link_heuristics.utility_prefix_text?(title) &&
236
+ destination_facts&.high_confidence_utility_destination &&
230
237
  weak_article_candidate)
231
238
  end
239
+ # rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
232
240
 
233
241
  ##
234
242
  # @param container [Nokogiri::XML::Node]
@@ -32,7 +32,8 @@ module Html2rss
32
32
  enabled: true
33
33
  },
34
34
  semantic_html: {
35
- enabled: true
35
+ enabled: true,
36
+ fallback_anchorless: true
36
37
  },
37
38
  html: {
38
39
  enabled: true,
@@ -59,6 +60,7 @@ module Html2rss
59
60
  end
60
61
  optional(:semantic_html).hash do
61
62
  optional(:enabled).filled(:bool)
63
+ optional(:fallback_anchorless).filled(:bool)
62
64
  end
63
65
  optional(:html).hash do
64
66
  optional(:enabled).filled(:bool)
@@ -138,17 +138,7 @@ module Html2rss
138
138
  # @return [Boolean] true when the anchor is inside the selected heading
139
139
  def heading_anchor?
140
140
  heading = @context.heading
141
- return false unless heading
142
-
143
- curr = @anchor
144
- container = @context.container
145
- while curr.respond_to?(:parent)
146
- return true if curr == heading
147
- break if curr == container
148
-
149
- curr = curr.parent
150
- end
151
- false
141
+ heading && (@anchor == heading || HtmlNavigator.descendant_of?(@anchor, heading))
152
142
  end
153
143
 
154
144
  # @return [Boolean] true when anchor text exactly matches heading text
@@ -183,15 +173,11 @@ module Html2rss
183
173
  end
184
174
 
185
175
  def utility_landmark_ancestor?
186
- curr = @anchor.parent
187
176
  container = @context.container
188
- while curr.respond_to?(:parent)
189
- return true if Context::UTILITY_LANDMARK_TAGS.include?(curr.name)
190
- break if curr == container
177
+ condition = proc { |node| node == container || Context::UTILITY_LANDMARK_TAGS.include?(node.name) }
178
+ landmark = HtmlNavigator.parent_until_condition(@anchor.parent, condition)
191
179
 
192
- curr = curr.parent
193
- end
194
- false
180
+ landmark && landmark != container
195
181
  end
196
182
 
197
183
  def icon_only_anchor?
@@ -32,9 +32,34 @@ module Html2rss
32
32
  HtmlExtractor.ignored_container_path?(node, cache)
33
33
  end
34
34
 
35
- # Preserve the original post-order traversal intent (specific-first)
36
- # by sorting candidates by depth (descending) while keeping original document
37
- # order for nodes at the same depth.
35
+ candidates = filter_nested_containers(candidates)
36
+ sort_by_depth(candidates)
37
+ end
38
+
39
+ private
40
+
41
+ def filter_nested_containers(candidates)
42
+ candidate_set = Set.new(candidates)
43
+ rejected = Set.new
44
+
45
+ candidates.each do |candidate_b|
46
+ next if candidate_b.name == 'div'
47
+
48
+ find_and_reject_ancestors(candidate_b, candidate_set, rejected)
49
+ end
50
+
51
+ candidates.reject { |c| rejected.include?(c) }
52
+ end
53
+
54
+ def find_and_reject_ancestors(node, candidate_set, rejected)
55
+ curr = node.parent
56
+ while curr && !curr.document? && curr.name != 'html'
57
+ rejected << curr if candidate_set.include?(curr)
58
+ curr = curr.parent
59
+ end
60
+ end
61
+
62
+ def sort_by_depth(candidates)
38
63
  candidates.each_with_index
39
64
  .sort_by { |node, index| [-node.ancestors.size, index] }
40
65
  .map!(&:first)
@@ -47,25 +47,33 @@ module Html2rss
47
47
  # @param node [Nokogiri::XML::Node]
48
48
  # @param cache [Hash, nil] identity cache used to store results (must use compare_by_identity)
49
49
  # @return [Boolean] true when the node belongs to ignored DOM chrome
50
+ # rubocop:disable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
50
51
  def ignored_container_path?(node, cache = nil)
51
52
  return cache[node] if cache&.key?(node)
52
53
 
53
- res = walk_ignored_container_path?(node)
54
- cache[node] = res if cache
55
- res
56
- end
54
+ curr = node
55
+ visited = []
56
+ is_ignored = false
57
57
 
58
- private
58
+ while curr.respond_to?(:parent) && curr
59
+ if cache&.key?(curr)
60
+ is_ignored = cache[curr]
61
+ break
62
+ end
59
63
 
60
- def walk_ignored_container_path?(node)
61
- curr = node
62
- while curr.respond_to?(:parent)
63
- return true if IGNORED_CONTAINER_TAGS.include?(curr.name)
64
+ if IGNORED_CONTAINER_TAGS.include?(curr.name)
65
+ is_ignored = true
66
+ break
67
+ end
64
68
 
69
+ visited << curr
65
70
  curr = curr.parent
66
71
  end
67
- false
72
+ visited.each { |n| cache[n] = is_ignored } if cache
73
+
74
+ is_ignored
68
75
  end
76
+ # rubocop:enable Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
69
77
  end
70
78
 
71
79
  ##
@@ -119,14 +127,16 @@ module Html2rss
119
127
  Url.from_relative("##{id}", base_url) if id
120
128
  end
121
129
 
130
+ # rubocop:disable Metrics/CyclomaticComplexity
122
131
  def extract_title
123
- title_source = heading || selected_anchor
124
- if title_source
125
- self.class.extract_visible_text(title_source)
126
- else
127
- fallback_anchorless_title
128
- end
132
+ source = heading || selected_anchor
133
+ title_text = source ? self.class.extract_visible_text(source) : fallback_anchorless_title
134
+ return unless title_text
135
+
136
+ kicker = kicker_node ? self.class.extract_visible_text(kicker_node).to_s.strip : nil
137
+ kicker && !kicker.empty? && !title_text.include?(kicker) ? "#{kicker}: #{title_text}" : title_text
129
138
  end
139
+ # rubocop:enable Metrics/CyclomaticComplexity
130
140
 
131
141
  def fallback_anchorless_title
132
142
  return unless @fallback_anchorless && selected_anchor.nil?
@@ -143,8 +153,17 @@ module Html2rss
143
153
  )
144
154
  end
145
155
 
156
+ def kicker_node
157
+ @kicker_node ||= begin
158
+ selector = '[data-tb-kicker], [class*="kicker"], [class*="eyebrow"], ' \
159
+ '[class*="pre-title"], [class*="pretitle"], [class*="overline"]'
160
+ node = article_tag.at_css(selector)
161
+ node && heading && (node == heading || HtmlNavigator.descendant_of?(node, heading)) ? nil : node
162
+ end
163
+ end
164
+
146
165
  def extract_description
147
- exclude = [heading, selected_anchor].compact.to_set
166
+ exclude = [heading, selected_anchor, kicker_node].compact.to_set
148
167
  description = self.class.extract_visible_text(article_tag, exclude_nodes: exclude)
149
168
  return if description.nil?
150
169
 
@@ -49,6 +49,23 @@ module Html2rss
49
49
 
50
50
  current_tag.ancestors(tag_name).first
51
51
  end
52
+
53
+ ##
54
+ # Returns true if child_node is a descendant of parent_node.
55
+ # Walks up using parent pointers to avoid NodeSet allocations.
56
+ #
57
+ # @param child_node [Nokogiri::XML::Node] potential descendant
58
+ # @param parent_node [Nokogiri::XML::Node] potential ancestor
59
+ # @return [Boolean] true when child_node is a descendant of parent_node
60
+ def descendant_of?(child_node, parent_node)
61
+ curr = child_node.respond_to?(:parent) ? child_node.parent : nil
62
+ while curr
63
+ return true if curr == parent_node
64
+
65
+ curr = curr.respond_to?(:parent) ? curr.parent : nil
66
+ end
67
+ false
68
+ end
52
69
  end
53
70
  end
54
71
  end
@@ -103,11 +103,15 @@ module Html2rss
103
103
  # @param article_tag [Nokogiri::XML::Element] HTML element to extract additional info from.
104
104
  # @param base_url [String, Html2rss::Url] base URL for normalization during enhancement
105
105
  # @return [Hash] The enhanced article hash.
106
+ # rubocop:disable Metrics/MethodLength
106
107
  def enhance_article_hash(article_hash, article_tag, base_url = @url)
107
108
  selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
108
- return article_hash unless selected_anchor
109
-
110
- extracted = HtmlExtractor.new(article_tag, base_url:, selected_anchor:).call
109
+ extracted = HtmlExtractor.new(
110
+ article_tag,
111
+ base_url:,
112
+ selected_anchor:,
113
+ fallback_anchorless: true
114
+ ).call
111
115
  return article_hash unless extracted
112
116
 
113
117
  extracted.each_with_object(article_hash) do |(key, value), hash|
@@ -116,6 +120,7 @@ module Html2rss
116
120
  hash[key] = value
117
121
  end
118
122
  end
123
+ # rubocop:enable Metrics/MethodLength
119
124
 
120
125
  ##
121
126
  # Selects the value for a given attribute from an HTML element.
@@ -4,6 +4,6 @@
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
6
  # Current application version.
7
- VERSION = '0.22.0'
7
+ VERSION = '0.22.1'
8
8
  public_constant :VERSION
9
9
  end
@@ -153,6 +153,12 @@
153
153
  "not": {
154
154
  "type": "null"
155
155
  }
156
+ },
157
+ "fallback_anchorless": {
158
+ "type": "boolean",
159
+ "not": {
160
+ "type": "null"
161
+ }
156
162
  }
157
163
  },
158
164
  "required": []
@@ -228,7 +234,8 @@
228
234
  "enabled": true
229
235
  },
230
236
  "semantic_html": {
231
- "enabled": true
237
+ "enabled": true,
238
+ "fallback_anchorless": true
232
239
  },
233
240
  "html": {
234
241
  "enabled": true,
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.22.0
4
+ version: 0.22.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
@@ -381,7 +381,7 @@ licenses:
381
381
  - MIT
382
382
  metadata:
383
383
  allowed_push_host: https://rubygems.org
384
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.22.0
384
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.22.1
385
385
  rubygems_mfa_required: 'true'
386
386
  rdoc_options: []
387
387
  require_paths: