html2rss 0.19.1 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,47 +11,13 @@ module Html2rss
11
11
  # scraping can reason about link intent instead of DOM order. It favors
12
12
  # heading-aligned article links and suppresses utility links, duplicate
13
13
  # destinations, and weak textless affordances.
14
- class AnchorSelector # rubocop:disable Metrics/ClassLength
15
- AnchorFacts = Data.define(
16
- :anchor,
17
- :text,
18
- :url,
19
- :destination,
20
- :segments,
21
- :meaningful_text,
22
- :content_like_destination,
23
- :heading_anchor,
24
- :heading_text_match,
25
- :score
26
- )
27
-
14
+ class AnchorSelector
28
15
  # Comma-separated heading selector used for heading/anchor matching.
29
16
  HEADING_SELECTOR = HtmlExtractor::HEADING_TAGS.join(',').freeze
30
- # Path segments that usually represent utility navigation rather than article content.
31
- UTILITY_PATH_SEGMENTS = %w[
32
- about account author category comment comments contact feedback help
33
- login newsletter profile register search settings share signup subscribe
34
- topic topics view-all archive archives
35
- feed feeds
36
- recommended
37
- for-you
38
- preference preferences
39
- notification notifications
40
- privacy terms
41
- cookie cookies
42
- logout
43
- user users
44
- ].to_set.freeze
45
- # Path segments that signal content-like destinations.
46
- CONTENT_PATH_SEGMENTS = %w[
47
- article articles news post posts story stories update updates
48
- ].to_set.freeze
49
- # Ancestor tags that usually indicate navigation/utility regions.
50
- UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze
51
17
 
52
18
  # @param base_url [String, Html2rss::Url] page URL used to normalize href destinations
53
19
  def initialize(base_url)
54
- @base_url = base_url
20
+ @link_heuristics = LinkHeuristics.new(base_url)
55
21
  end
56
22
 
57
23
  ##
@@ -70,132 +36,11 @@ module Html2rss
70
36
 
71
37
  private
72
38
 
73
- attr_reader :base_url
74
-
75
39
  def facts_for(container)
76
- heading = heading_for(container)
77
- heading_text = visible_text(heading)
78
-
79
- container.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).each_with_object({}) do |anchor, best_by_destination|
80
- next if anchor.path.match?(Html::TAGS_TO_IGNORE)
81
-
82
- facts = build_facts(anchor, heading, heading_text)
83
- next unless facts
84
-
85
- keep_stronger_fact(best_by_destination, facts)
86
- end.values
87
- end
88
-
89
- def build_facts(anchor, heading, heading_text) # rubocop:disable Metrics/MethodLength
90
- text = visible_text(anchor)
91
- meaningful_text = meaningful_text?(text)
92
- ancestors = anchor.ancestors.to_a
93
- url = normalized_destination(anchor)
94
- return unless url
95
-
96
- segments = url.path_segments
97
- content_like_destination = content_like_destination?(segments)
98
- return if ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
99
-
100
- heading_anchor = heading_anchor?(ancestors, heading)
101
- heading_text_match = heading_text_match?(heading_text, text, meaningful_text)
102
- return unless heading_anchor || content_like_anchor?(meaningful_text, content_like_destination)
103
-
104
- AnchorFacts.new(
105
- anchor:,
106
- text:,
107
- url:,
108
- destination: url.to_s,
109
- segments:,
110
- meaningful_text:,
111
- content_like_destination:,
112
- heading_anchor:,
113
- heading_text_match:,
114
- score: score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
115
- )
116
- end
117
-
118
- def ineligible_anchor?(anchor, ancestors, text, meaningful_text, segments)
119
- utility_destination?(segments) ||
120
- utility_text?(text) ||
121
- icon_only_anchor?(anchor, meaningful_text) ||
122
- utility_landmark_anchor?(ancestors)
123
- end
124
-
125
- def keep_stronger_fact(best_by_destination, facts)
126
- current = best_by_destination[facts.destination]
127
- return best_by_destination[facts.destination] = facts unless current
128
- return if current.score >= facts.score
129
-
130
- best_by_destination[facts.destination] = facts
131
- end
132
-
133
- def content_like_anchor?(meaningful_text, content_like_destination)
134
- meaningful_text || content_like_destination
135
- end
136
-
137
- def score_anchor(meaningful_text, content_like_destination, heading_anchor, heading_text_match)
138
- score = 0
139
- score += 100 if heading_anchor
140
- score += 20 if heading_text_match
141
- score += 10 if meaningful_text
142
- score += 10 if content_like_destination
143
- score
144
- end
145
-
146
- def heading_anchor?(ancestors, heading)
147
- heading && ancestors.include?(heading)
148
- end
149
-
150
- def heading_text_match?(heading_text, text, meaningful_text)
151
- meaningful_text && meaningful_text?(heading_text) && heading_text == text
152
- end
153
-
154
- def heading_for(container)
155
- container.at_css(HEADING_SELECTOR)
156
- end
157
-
158
- def icon_only_anchor?(anchor, meaningful_text)
159
- !meaningful_text && anchor.at_css('img, svg')
160
- end
161
-
162
- def utility_destination?(segments)
163
- segments.empty? || segments.any? { |segment| UTILITY_PATH_SEGMENTS.include?(segment) }
164
- end
165
-
166
- def content_like_destination?(segments)
167
- segments.any? do |segment|
168
- CONTENT_PATH_SEGMENTS.include?(segment) || segment.match?(/\A\d[\w-]*\z/)
169
- end
170
- end
171
-
172
- def normalized_destination(anchor)
173
- href = anchor['href'].to_s.split('#').first.to_s.strip
174
- return if href.empty?
175
-
176
- Html2rss::Url.from_relative(href, base_url)
177
- rescue ArgumentError
178
- nil
179
- end
180
-
181
- def meaningful_text?(text)
182
- text.scan(/\p{Alnum}+/).any?
183
- end
184
-
185
- def utility_text?(text)
186
- text.match?(
187
- /\A(about|contact|log in|login|sign up|signup|share|comments?|view all|recommended for you|subscribe)\b/i
188
- )
189
- end
190
-
191
- def utility_landmark_anchor?(ancestors)
192
- ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
193
- end
194
-
195
- def visible_text(node)
196
- return '' unless node
197
-
198
- HtmlExtractor.extract_visible_text(node).to_s.strip
40
+ HtmlExtractor::SemanticAnchorCandidates.new(
41
+ container,
42
+ link_heuristics: @link_heuristics
43
+ ).to_a
199
44
  end
200
45
  end
201
46
  end
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class AutoSource
5
+ module Scraper
6
+ class SemanticHtml
7
+ ##
8
+ # Collapses nested containers and deduplicates entries pointing to the same destination.
9
+ # It resolves ties using scoring precedence and payload richness comparison.
10
+ class Deduplicator
11
+ # @param url [String, Html2rss::Url] base url used to resolve relative hrefs
12
+ # @param extractor [Class] extractor class used to materialize articles
13
+ def initialize(url, extractor)
14
+ @url = url
15
+ @extractor = extractor
16
+ @article_cache = {}.compare_by_identity
17
+ end
18
+
19
+ # Collapses and deduplicates the given entries.
20
+ #
21
+ # @param entries [Array<Entry>] list of scraper entries
22
+ # @return [Array<Entry>] deduplicated list of scraper entries
23
+ def call(entries)
24
+ destination_groups(entries).filter_map do |group|
25
+ collapsed_group = collapse_nested_destination_group(group)
26
+ collapsed_group.reduce do |best, entry|
27
+ stronger_entry?(entry, best) ? entry : best
28
+ end
29
+ end
30
+ end
31
+
32
+ # Returns the materialized article hash for the entry, using the cache.
33
+ #
34
+ # @param entry [Entry] scraper entry
35
+ # @return [Hash, nil] article payload
36
+ def article_for(entry)
37
+ return entry.article if entry.article
38
+
39
+ @article_cache.fetch(entry) do
40
+ @article_cache[entry] = @extractor.new(
41
+ entry.container, base_url: @url, selected_anchor: entry.selected_anchor
42
+ ).call
43
+ end
44
+ end
45
+
46
+ # Compares two entries to determine which is stronger.
47
+ #
48
+ # @param left [Entry] left entry
49
+ # @param right [Entry] right entry
50
+ # @return [Boolean] true if left is stronger than right
51
+ def stronger_entry?(left, right) # rubocop:disable Metrics/AbcSize
52
+ final_delta = left.final_score <=> right.final_score
53
+ return final_delta.positive? unless final_delta.zero?
54
+
55
+ quality_delta = left.quality_score <=> right.quality_score
56
+ return quality_delta.positive? unless quality_delta.zero?
57
+
58
+ left_article = article_for(left)
59
+ right_article = article_for(right)
60
+ return !right_article if left_article.nil? || right_article.nil?
61
+
62
+ richness_delta = payload_richness_signature(left_article) <=> payload_richness_signature(right_article)
63
+ richness_delta.zero? ? left.position < right.position : richness_delta.positive?
64
+ end
65
+
66
+ private
67
+
68
+ def destination_groups(entries) = entries.group_by { entry_destination(_1) }.values
69
+
70
+ def collapse_nested_destination_group(entries)
71
+ return entries if entries.size <= 1
72
+
73
+ entries.reject do |entry|
74
+ entries.any? do |other|
75
+ next if entry.equal?(other)
76
+ next unless nested_container_pair?(entry.container, other.container)
77
+
78
+ stronger_entry?(other, entry)
79
+ end
80
+ end
81
+ end
82
+
83
+ def nested_container_pair?(left, right) = left.ancestors.include?(right) || right.ancestors.include?(left)
84
+
85
+ def entry_destination(entry) = entry.destination_facts&.destination || article_for(entry)&.[](:url)&.to_s
86
+
87
+ def payload_richness_signature(article)
88
+ [
89
+ article[:published_at] ? 1 : 0,
90
+ word_count(article[:description]),
91
+ article[:image] ? 1 : 0,
92
+ Array(article[:categories]).length,
93
+ Array(article[:enclosures]).length
94
+ ]
95
+ end
96
+
97
+ def word_count(text) = text.to_s.scan(/\p{Alnum}+/).size
98
+ end
99
+ end
100
+ end
101
+ end
102
+ end
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require_relative 'semantic_html/anchor_selector'
4
+ require_relative 'semantic_html/deduplicator'
4
5
 
5
6
  module Html2rss
6
7
  class AutoSource
@@ -17,20 +18,20 @@ module Html2rss
17
18
  # The result is lower recall on weak-signal blocks, but much better link
18
19
  # quality on modern teaser cards that mix headlines, utility links, and
19
20
  # duplicate image overlays.
20
- class SemanticHtml
21
+ class SemanticHtml # rubocop:disable Metrics/ClassLength
21
22
  include Enumerable
22
23
 
23
- # Container plus selected anchor chosen for extraction.
24
- Entry = Data.define(:container, :selected_anchor)
25
-
26
- # Candidate semantic container selectors used to locate extractable blocks.
27
- CONTAINER_SELECTORS = [
28
- 'article:not(:has(article))',
29
- 'section:not(:has(section))',
30
- 'li:not(:has(li))',
31
- 'tr:not(:has(tr))',
32
- 'div:not(:has(div))'
33
- ].freeze
24
+ # Container plus selected anchor, scoring metadata, and extracted article.
25
+ Entry = Data.define(
26
+ :container,
27
+ :selected_anchor,
28
+ :destination_facts,
29
+ :quality_score,
30
+ :junk_score,
31
+ :final_score,
32
+ :position,
33
+ :article
34
+ )
34
35
 
35
36
  ##
36
37
  # @return [Symbol] config key used to enable or configure this scraper
@@ -53,6 +54,7 @@ module Html2rss
53
54
  @parsed_body = parsed_body
54
55
  @url = url
55
56
  @extractor = extractor
57
+ @link_heuristics = LinkHeuristics.new(url)
56
58
  @anchor_selector = AnchorSelector.new(url)
57
59
  end
58
60
 
@@ -71,14 +73,7 @@ module Html2rss
71
73
  def each
72
74
  return enum_for(:each) unless block_given?
73
75
 
74
- extractable_entries.each do |entry|
75
- article_hash = @extractor.new(
76
- entry.container,
77
- base_url: @url,
78
- selected_anchor: entry.selected_anchor
79
- ).call
80
- yield article_hash if article_hash
81
- end
76
+ ranked_entries.each { yield _1.article }
82
77
  end
83
78
 
84
79
  ##
@@ -100,28 +95,175 @@ module Html2rss
100
95
  @anchor_selector.primary_anchor_for(container)
101
96
  end
102
97
 
103
- def extractable_entries
98
+ def extractable_entries # rubocop:disable Metrics/MethodLength
104
99
  @extractable_entries ||= candidate_containers.filter_map do |container|
105
100
  selected_anchor = primary_anchor_for(container)
101
+
106
102
  next unless selected_anchor
107
103
 
108
- Entry.new(container:, selected_anchor:)
104
+ destination_facts = normalized_destination(selected_anchor)
105
+ next unless destination_facts
106
+ next if hard_junk_entry?(container, selected_anchor, destination_facts)
107
+
108
+ quality = quality_score(container, selected_anchor, destination_facts)
109
+ junk = junk_score(container, selected_anchor, destination_facts)
110
+
111
+ Entry.new(
112
+ container:,
113
+ selected_anchor:,
114
+ destination_facts:,
115
+ quality_score: quality,
116
+ junk_score: junk,
117
+ final_score: quality - junk,
118
+ position: document_position(container),
119
+ article: nil
120
+ )
109
121
  end
110
122
  end
111
123
 
112
- def collect_candidate_containers
113
- seen = {}.compare_by_identity
124
+ # rubocop:disable Metrics/MethodLength
125
+ def ranked_entries
126
+ @ranked_entries ||= begin
127
+ deduplicator = Deduplicator.new(@url, @extractor)
128
+ entries = deduplicator.call(extractable_entries)
129
+ entries = stable_rank(entries)
114
130
 
115
- CONTAINER_SELECTORS.each_with_object([]) do |selector, containers|
116
- parsed_body.css(selector).each do |container|
117
- next if container.path.match?(Html::TAGS_TO_IGNORE)
118
- next if seen[container]
131
+ entries.filter_map do |entry|
132
+ article = deduplicator.article_for(entry)
133
+ next unless article
119
134
 
120
- seen[container] = true
121
- containers << container
135
+ Entry.new(
136
+ container: entry.container,
137
+ selected_anchor: entry.selected_anchor,
138
+ destination_facts: entry.destination_facts,
139
+ quality_score: entry.quality_score,
140
+ junk_score: entry.junk_score,
141
+ final_score: entry.final_score,
142
+ position: entry.position,
143
+ article:
144
+ )
122
145
  end
123
146
  end
124
147
  end
148
+ # rubocop:enable Metrics/MethodLength
149
+
150
+ def collect_candidate_containers
151
+ HtmlExtractor::SemanticContainers.call(parsed_body)
152
+ end
153
+
154
+ private
155
+
156
+ def document_position(container)
157
+ (@document_positions ||= candidate_containers.each_with_index.to_h).fetch(container)
158
+ end
159
+
160
+ def quality_score(container, selected_anchor, destination_facts) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
161
+ title = entry_title(container, selected_anchor)
162
+ words = word_count(title)
163
+ container_text = visible_text(container)
164
+ score = 0
165
+
166
+ score += 40 if words >= 3
167
+ score += 15 if words >= 7
168
+ score += 20 if destination_facts.url.path.to_s.length > 6
169
+ score += 15 if destination_facts.content_path
170
+ score += 15 if publish_marker?(container)
171
+ score += 10 if descriptive_context?(container_text, title)
172
+ score += 10 if article_container?(container)
173
+ score += 10 if content_tokens?(container_tokens(container))
174
+ score
175
+ end
176
+
177
+ def junk_score(container, selected_anchor, destination_facts) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
178
+ title = entry_title(container, selected_anchor)
179
+ utility_text = @link_heuristics.utility_prefix_text?(title)
180
+ recommended_text = @link_heuristics.recommended_text?(title)
181
+ content_signal = destination_facts.content_path
182
+ no_content_signal = !content_signal
183
+ non_content_utility_path =
184
+ destination_facts.utility_path &&
185
+ no_content_signal &&
186
+ !destination_facts.strong_post_suffix
187
+ publish_signal = publish_marker?(container)
188
+ descriptive_signal = descriptive_context?(visible_text(container), title)
189
+ weak_container = !publish_signal && !descriptive_signal
190
+ score = 0
191
+
192
+ score += 25 if non_content_utility_path
193
+ score += 15 if utility_text && word_count(title) <= 6
194
+ score += 10 if destination_facts.shallow
195
+ score += 10 if weak_container
196
+ score += 10 if recommended_text && no_content_signal
197
+ score += 5 if destination_facts.high_confidence_junk_path
198
+ score += 15 if junk_tokens?(container_tokens(container))
199
+ score
200
+ end
201
+
202
+ def hard_junk_entry?(container, selected_anchor, destination_facts) # rubocop:disable Metrics/MethodLength
203
+ title = entry_title(container, selected_anchor)
204
+ publish_signal = publish_marker?(container)
205
+ descriptive_signal = descriptive_context?(visible_text(container), title)
206
+ content_signal = destination_facts.content_path
207
+ weak_article_candidate = article_signal_count(
208
+ container,
209
+ publish_signal:,
210
+ descriptive_signal:,
211
+ content_signal:
212
+ ) < 2
213
+
214
+ destination_facts.high_confidence_junk_path ||
215
+ (@link_heuristics.recommended_text?(title) && destination_facts.shallow && weak_article_candidate) ||
216
+ (@link_heuristics.utility_prefix_text?(title) &&
217
+ destination_facts.high_confidence_utility_destination &&
218
+ weak_article_candidate)
219
+ end
220
+
221
+ def publish_marker?(container)
222
+ container.at_css('time, [datetime], [itemprop="datePublished"], [itemprop="dateModified"]')
223
+ end
224
+
225
+ def article_signal_count(container, publish_signal:, descriptive_signal:, content_signal:)
226
+ [article_container?(container), publish_signal, descriptive_signal, content_signal].count(&:itself)
227
+ end
228
+
229
+ def article_container?(container) = container.name == 'article'
230
+
231
+ def descriptive_context?(container_text, title)
232
+ snippet = container_text.to_s.sub(/\A#{Regexp.escape(title.to_s)}/i, '')
233
+ word_count(snippet) >= 8
234
+ end
235
+
236
+ def heading_for(container) = container.at_css(AnchorSelector::HEADING_SELECTOR)
237
+
238
+ def normalized_destination(anchor) = @link_heuristics.destination_facts(anchor)
239
+
240
+ def visible_text(node)
241
+ return '' unless node
242
+
243
+ HtmlExtractor.extract_visible_text(node).to_s.strip
244
+ end
245
+
246
+ def entry_title(container, selected_anchor) = visible_text(heading_for(container) || selected_anchor)
247
+
248
+ def word_count(text) = text.to_s.scan(/\p{Alnum}+/).size
249
+
250
+ def container_tokens(container)
251
+ classes = container['class'].to_s.split
252
+ id = container['id'].to_s
253
+ (classes << id).flat_map { |str| str.downcase.split(/[-_]+/) }.reject(&:empty?)
254
+ end
255
+
256
+ def content_tokens?(tokens)
257
+ (@content_segments ||= LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:content)).intersect?(tokens.to_set)
258
+ end
259
+
260
+ def junk_tokens?(tokens)
261
+ (@junk_segments ||= LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:utility)).intersect?(tokens.to_set)
262
+ end
263
+
264
+ def stable_rank(entries)
265
+ entries.sort_by { |entry| [-entry.final_score, entry.position] }
266
+ end
125
267
  end
126
268
  end
127
269
  end
@@ -54,7 +54,7 @@ module Html2rss
54
54
  return log_missing_api_root if href.empty?
55
55
 
56
56
  Html2rss::Url.from_relative(href, page_url)
57
- rescue Addressable::URI::InvalidURIError, ArgumentError => error
57
+ rescue ArgumentError => error
58
58
  logger.warn("#{WordpressApi}: invalid WordPress API endpoint #{href.inspect} (#{error.message})")
59
59
  nil
60
60
  end
@@ -138,13 +138,13 @@ module Html2rss
138
138
  },
139
139
  channel: { time_zone: 'UTC' },
140
140
  headers: RequestHeaders.browser_defaults,
141
- stylesheets: []
141
+ stylesheets: Html2rss.configuration.stylesheets || []
142
142
  }
143
143
  end
144
144
 
145
145
  # @return [Symbol] the default strategy for feed orchestration
146
146
  def default_strategy_name
147
- :auto
147
+ Html2rss.configuration.default_strategy || :auto
148
148
  end
149
149
 
150
150
  private
@@ -17,13 +17,8 @@ module Html2rss
17
17
  */*;q=0.8
18
18
  ].join(',')
19
19
 
20
- # Browser-like default `User-Agent` header value.
21
- DEFAULT_USER_AGENT = [
22
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
23
- 'AppleWebKit/537.36 (KHTML, like Gecko)',
24
- 'Chrome/123.0.0.0',
25
- 'Safari/537.36'
26
- ].join(' ')
20
+ # Default `User-Agent` header value.
21
+ DEFAULT_USER_AGENT = "html2rss/#{Html2rss::VERSION}".freeze
27
22
 
28
23
  # Baseline browser-like header set used for outbound requests.
29
24
  DEFAULT_HEADERS = {
@@ -40,9 +35,23 @@ module Html2rss
40
35
 
41
36
  class << self
42
37
  ##
43
- # @return [Hash{String => String}] the unmodified default header set
38
+ # :reek:ManualDispatch
39
+ # :reek:TooManyStatements
40
+ #
41
+ # @return [Hash{String => String}] the default header set merged with global defaults
44
42
  def browser_defaults
45
- DEFAULT_HEADERS.dup
43
+ defaults = DEFAULT_HEADERS.dup
44
+ global_headers = Html2rss.configuration.headers
45
+ global_headers = global_headers.call if global_headers.respond_to?(:call)
46
+
47
+ if global_headers.is_a?(Hash)
48
+ global_headers.each do |key, value|
49
+ canonical_key = key.to_s.split('-').map(&:capitalize).join('-')
50
+ defaults[canonical_key] = value.to_s
51
+ end
52
+ end
53
+
54
+ defaults
46
55
  end
47
56
 
48
57
  ##