html2rss 0.19.1 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 69268fde80ddaa21f5ca3588de51f63182909714956af3ed8b1ee11a47075dc8
4
- data.tar.gz: 045dfb3fec6cebfa8c7d066acd12c056dbf01766bbe1c292642d6d4d9db72055
3
+ metadata.gz: 1e4867b7a4906d0e4bb9d6cb9facfe96da516175a82fe10824e9ed579cf4aa3d
4
+ data.tar.gz: 53a8f699b87817b2b62cbe5d5d1761f33004f0b3be1ddb6c7e2428d449923a7c
5
5
  SHA512:
6
- metadata.gz: de88861fd21375da62549cbed418f5f1550e7adf8e9c6ea98cfce9331944067bc8aa43eacdbdfe3ab6380764e485031f1d6bb3b456e0bf486340864328a5abc8
7
- data.tar.gz: 6f65cd2e7dc555c35cb456bff595184331f3df07dafe10c50d6d334102237b0f98a5971c6033d8e56c4504254726ca0eca9757c22b1a697c2a47b8b76681945a
6
+ metadata.gz: a38e85afebf7bd17739915cf9f59846a76212690dc5309a02f37b24f4910247a1e34e3c3eeede165f52ea8c77dff39de472a65b5b5d3b6d8b99c02a19f6dfb0a
7
+ data.tar.gz: e3a0ad5868a070adf65a0cd78b560d196df7ed932fa424372397f906e0e2afc362eeb0afa864ddcb9bef10cf344efc2f6d9505c8515be502df342fc2a0975252
@@ -19,9 +19,8 @@ module Html2rss
19
19
  class Html
20
20
  include Enumerable
21
21
 
22
- # Elements ignored when traversing potential article containers.
23
- TAGS_TO_IGNORE = /(nav|footer|header|svg|script|style)/i
24
-
22
+ # Absolute base URL used when probe-time detection needs to normalize relative hrefs.
23
+ DETECTION_BASE_URL = 'https://example.com'
25
24
  # Minimum selector frequency required to treat a path as a stable list signal.
26
25
  DEFAULT_MINIMUM_SELECTOR_FREQUENCY = 2
27
26
  # Number of most frequent selectors kept for container extraction.
@@ -39,7 +38,7 @@ module Html2rss
39
38
  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
40
39
  # @return [Boolean] true when the scraper can likely extract articles
41
40
  def self.articles?(parsed_body)
42
- new(parsed_body, url: '').any?
41
+ new(parsed_body, url: DETECTION_BASE_URL).any?
43
42
  end
44
43
 
45
44
  ##
@@ -49,7 +48,7 @@ module Html2rss
49
48
  # @param xpath [String] original XPath
50
49
  # @return [String] XPath without positional indexes
51
50
  def self.simplify_xpath(xpath)
52
- xpath.gsub(/\[\d+\]/, '')
51
+ HtmlExtractor::ListCandidates.simplify_xpath(xpath)
53
52
  end
54
53
 
55
54
  # @param parsed_body [Nokogiri::HTML::Document] The parsed HTML document.
@@ -63,6 +62,7 @@ module Html2rss
63
62
  @url = url
64
63
  @extractor = extractor
65
64
  @opts = opts
65
+ @link_heuristics = LinkHeuristics.new(url)
66
66
  end
67
67
 
68
68
  attr_reader :parsed_body
@@ -73,8 +73,8 @@ module Html2rss
73
73
  def each
74
74
  return enum_for(:each) unless block_given?
75
75
 
76
- each_article_tag do |article_tag|
77
- article_hash = extract_article(article_tag)
76
+ each_article_tag do |article_tag, selected_anchor|
77
+ article_hash = extract_article(article_tag, selected_anchor:)
78
78
  yield article_hash if article_hash
79
79
  end
80
80
  end
@@ -90,8 +90,8 @@ module Html2rss
90
90
  # @param node [Nokogiri::XML::Node] candidate boundary node
91
91
  # @return [Boolean] true when the node is a good extraction boundary
92
92
  def article_tag_condition?(node)
93
- # Ignore tags that are below a tag which is in TAGS_TO_IGNORE.
94
- return false if node.path.match?(TAGS_TO_IGNORE)
93
+ # Ignore tags that are below ignored DOM chrome.
94
+ return false if HtmlExtractor.ignored_container_path?(node)
95
95
  return true if %w[body html].include?(node.name)
96
96
  return false unless (parent = node.parent)
97
97
 
@@ -100,24 +100,6 @@ module Html2rss
100
100
 
101
101
  private
102
102
 
103
- ##
104
- # Find relevant anchors in root.
105
- # @return [Set<String>] The set of XPath selectors
106
- def selectors
107
- @selectors ||= Hash.new(0).tap do |selectors|
108
- each_relevant_anchor { |node| increment_selector_count(selectors, node) }
109
- end
110
- end
111
-
112
- ##
113
- # Filter the frequent selectors by the minimum_selector_frequency and use_top_selectors.
114
- # @return [Array<String>] The filtered selectors
115
- def filtered_selectors
116
- selectors.select { |_selector, count| count >= minimum_selector_frequency }
117
- .max_by(use_top_selectors, &:last)
118
- .map(&:first)
119
- end
120
-
121
103
  def minimum_selector_frequency = @opts[:minimum_selector_frequency] || DEFAULT_MINIMUM_SELECTOR_FREQUENCY
122
104
  def use_top_selectors = @opts[:use_top_selectors] || DEFAULT_USE_TOP_SELECTORS
123
105
 
@@ -126,49 +108,59 @@ module Html2rss
126
108
  @anchor_counts[node.path] ||= node.name == 'a' ? 1 : node.css('a').size
127
109
  end
128
110
 
129
- def each_relevant_anchor
130
- return enum_for(:each_relevant_anchor) unless block_given?
111
+ def relevant_anchor?(node)
112
+ destination_facts = @link_heuristics.destination_facts(node)
113
+ return false unless destination_facts
131
114
 
132
- traversal_root&.traverse do |node|
133
- yield node if relevant_anchor?(node)
134
- end
115
+ !noise_anchor?(node, destination_facts)
135
116
  end
136
117
 
137
- def relevant_anchor?(node)
138
- node.element? && node.name == 'a' && !String(node['href']).empty?
139
- end
118
+ def each_article_tag(&block)
119
+ return enum_for(:each_article_tag) unless block
140
120
 
141
- def increment_selector_count(selectors, node)
142
- path = self.class.simplify_xpath(node.path)
143
- selectors[path] += 1 unless path.match?(TAGS_TO_IGNORE)
121
+ list_candidates.each_article_tag(anchor_filter: method(:relevant_anchor?),
122
+ boundary_condition: method(:article_tag_condition?),
123
+ &block)
144
124
  end
145
125
 
146
- def traversal_root
147
- parsed_body.at_css('body, html') || parsed_body.root
# Turns one article container into an article via the configured extractor.
#
# @param article_tag [Nokogiri::XML::Node] container node for one candidate article
# @param selected_anchor [Nokogiri::XML::Node, nil] anchor already chosen for the container;
#   when omitted, one is looked up with {#preferred_anchor_for}
# @return [Object, nil] whatever the extractor's #call returns, or nil when no anchor
#   is available or the anchor is classified as noise
def extract_article(article_tag, selected_anchor: nil)
  anchor = selected_anchor || preferred_anchor_for(article_tag)
  return unless anchor

  # Destination facts are resolved once and handed to the noise check.
  facts = @link_heuristics.destination_facts(anchor)
  return if noise_anchor?(anchor, facts)

  extractor = @extractor.new(article_tag, base_url: @url, selected_anchor: anchor)
  extractor.call
end
149
133
 
150
- def each_article_tag
151
- return enum_for(:each_article_tag) unless block_given?
134
+ def noise_anchor?(anchor, destination_facts) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
135
+ return true unless destination_facts
152
136
 
153
- filtered_selectors.each do |selector|
154
- parsed_body.xpath(selector).each do |selected_tag|
155
- article_tag = article_tag_for(selected_tag)
156
- yield article_tag if article_tag
157
- end
158
- end
159
- end
137
+ text = HtmlExtractor.extract_visible_text(anchor).to_s.strip
160
138
 
161
- def article_tag_for(selected_tag)
162
- return if selected_tag.path.match?(Html::TAGS_TO_IGNORE)
139
+ destination_facts.taxonomy_path ||
140
+ short_utility_label?(text, destination_facts) ||
141
+ (@link_heuristics.recommended_text?(text) && destination_facts.shallow) ||
142
+ (@link_heuristics.utility_prefix_text?(text) && destination_facts.high_confidence_utility_destination) ||
143
+ (@link_heuristics.utility_text?(text) && destination_facts.vanity_path)
144
+ end
163
145
 
164
- HtmlNavigator.parent_until_condition(selected_tag, method(:article_tag_condition?))
# Detects short anchor labels that point at utility routes.
#
# @param text [String] visible anchor text
# @param destination_facts [#utility_path, #content_path, #strong_post_suffix] route facts for the anchor
# @return [Boolean, nil] truthy when the destination has utility evidence, has neither
#   content evidence nor a strong post suffix, and the text holds at most three
#   alphanumeric runs
def short_utility_label?(text, destination_facts)
  utility = destination_facts.utility_path
  return utility unless utility
  return false if destination_facts.content_path || destination_facts.strong_post_suffix

  # Count runs of letters/digits; punctuation and whitespace only separate them.
  text.scan(/\p{Alnum}+/).length <= 3
end
166
152
 
167
- def extract_article(article_tag)
168
- selected_anchor = HtmlExtractor.main_anchor_for(article_tag)
169
- return unless selected_anchor
# Picks the anchor that should represent an article container.
#
# @param article_tag [Nokogiri::XML::Node] container node for one candidate article
# @return [Nokogiri::XML::Node, nil] the first anchor matched by
#   HtmlExtractor::MAIN_ANCHOR_SELECTOR that passes {#relevant_anchor?}; otherwise
#   whatever HtmlExtractor.main_anchor_for returns for the container
def preferred_anchor_for(article_tag)
  candidates = article_tag.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR)
  relevant = candidates.find { |anchor| relevant_anchor?(anchor) }

  relevant || HtmlExtractor.main_anchor_for(article_tag)
end
170
157
 
171
- @extractor.new(article_tag, base_url: @url, selected_anchor:).call
# Builds the list-candidate finder for the parsed document, configured with this
# scraper's selector-frequency options.
#
# @return [HtmlExtractor::ListCandidates] a fresh instance on every call
def list_candidates
  document = parsed_body
  options = { minimum_selector_frequency:, use_top_selectors: }

  HtmlExtractor::ListCandidates.new(document, **options)
end
173
165
  end
174
166
  end
@@ -0,0 +1,447 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class AutoSource
5
+ module Scraper
6
+ ##
7
+ # Shared link-level heuristics used by scraper-local selection and
8
+ # scoring. This keeps normalization and route/text classification
9
+ # consistent without moving scraper policy into higher orchestration.
10
+ class LinkHeuristics
# Immutable value object pairing a normalized URL with the route-classification
# facts derived from its path segments.
DestinationFacts = Data.define(
  *%i[
    url
    destination
    segments
    content_path
    utility_path
    taxonomy_path
    vanity_path
    shallow
    strong_post_suffix
    high_confidence_junk_path
    high_confidence_utility_destination
  ]
) do
  # Classifies the URL's path segments and bundles the result with the URL.
  #
  # @param url [Html2rss::Url] normalized destination URL
  # @return [DestinationFacts] route facts for downstream link scoring
  def self.build(url)
    classifier = PathClassifier.new(url.path_segments)
    identity = { url:, destination: url.to_s }

    # Classifier attributes are merged last so they keep precedence on key clashes.
    new(**identity.merge(classifier.destination_attributes))
  end
end
37
+
# Pulls a fragment-free, whitespace-trimmed href out of an anchor node or a raw
# href-like value.
class HrefExtractor
  # Convenience entry point: builds an extractor and runs it.
  #
  # @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
  # @return [String, nil] href without fragment, or nil when blank
  def self.call(anchor_or_href)
    new(anchor_or_href).call
  end

  # @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
  def initialize(anchor_or_href)
    @anchor_or_href = anchor_or_href
  end

  # @return [String, nil] href without fragment, or nil when blank
  def call
    # Everything from the first '#' onwards is the fragment and is dropped.
    before_fragment = raw_href.to_s.split('#', 2).first.to_s
    href = before_fragment.strip

    href.empty? ? nil : href
  end

  private

  # Reads the 'href' attribute from nodes; any other value is used as given.
  def raw_href
    # Module#=== is the same check a `case/when` on the class performs.
    return @anchor_or_href['href'] if Nokogiri::XML::Node === @anchor_or_href

    @anchor_or_href
  end
end
67
+
# Classifies visible anchor text for utility and recommendation chrome.
#
# All patterns are anchored at the start of the text, ignore case and leading
# whitespace, and require a word boundary after the matched label. Labels with
# diacritics accept both the accented and the ASCII spelling.
class TextClassifier
  # Prefix labels that usually identify navigation or subscription links.
  UTILITY_PREFIX_PATTERN = /
    \A\s*(
    # English
    view\s+all|see\s+all|all\s+news|subscribe|newsletter|comment\s+feed|comments\s+feed|join|premium|plus|
    # German
    alle\s+anzeigen|alle\s+news|abonnieren|newsletter|kommentar\s+feed|mitmachen|
    # Spanish
    ver\s+todos|ver\s+todo|todas\s+las\s+noticias|suscribirse|bolet(i|í)n|comentarios\s+feed|unirse|
    # French
    voir\s+tout|voir\s+tous|toutes\s+les\s+nouvelles|s['’]abonner|flux\s+de\s+commentaires|rejoindre
    )\b
  /ix
  # Short labels that usually identify non-article navigation links.
  # "iniciar sesión" is accepted with and without the accent, like the other
  # accented labels in this pattern.
  UTILITY_PATTERN = /
    \A\s*(
    # English
    about|contact|comments?|join|log\s+in|login|member(ship)?|
    plus|premium|pricing|recommended(\s+for\s+you)?|
    see\s+all|share|sign\s+up|signup|subscribe|view\s+all|
    # German
    (ue|ü)ber(\s+uns)?|kontakt|kommentare?|mitmachen|anmelden|login|
    mitglied(schaft)?|empfohlen(\s+f(ue|ü)r\s+dich)?|alle\s+anzeigen|
    teilen|registrieren|abonnieren|newsletter|
    # Spanish
    sobre(\s+nosotros)?|contacto|comentarios?|unirse|iniciar\s+sesi(o|ó)n|
    login|miembro|membres(i|í)a|recomendado(\s+para\s+ti)?|ver\s+todo|
    compartir|registrarse|suscribirse|bolet(i|í)n|
    # French
    (a|à)\s+propos|(a|à)propos|contact|commentaires?|rejoindre|
    se\s+connecter|login|membre|abonnement|recommand(e|é)(\s+pour\s+vous)?|
    voir\s+tout|partager|s['’]inscrire|s['’]abonner|newsletter
    )\b
  /ix
  # Labels for recommendation chrome rather than source articles.
  RECOMMENDED_PATTERN = /
    \A\s*(
    recommended(\s+for\s+you)?|
    empfohlen(\s+f(ue|ü)r\s+dich)?|
    recomendado(\s+para\s+ti)?|
    recommand(e|é)(\s+pour\s+vous)?
    )\b
  /ix

  # @param text [String, #to_s] visible anchor text
  # @return [Boolean] true when text matches a utility label
  def utility?(text) = text.to_s.match?(UTILITY_PATTERN)

  # @param text [String, #to_s] visible anchor text
  # @return [Boolean] true when text begins with a utility label
  def utility_prefix?(text) = text.to_s.match?(UTILITY_PREFIX_PATTERN)

  # @param text [String, #to_s] visible anchor text
  # @return [Boolean] true when text identifies recommendation chrome
  def recommended?(text) = text.to_s.match?(RECOMMENDED_PATTERN)
end
126
+
127
+ # Classifies normalized destination path segments for scoring.
128
+ # rubocop:disable Metrics/ClassLength
129
+ class PathClassifier
130
+ attr_reader :segments
131
+
132
+ # Segment groups used to classify article, taxonomy, utility, and vanity routes.
133
+ SEGMENT_SETS = {
134
+ content: %w[
135
+ article articles blog blogs changelog changelogs insight insights
136
+ launch launches news post posts release releases story stories update updates
137
+ artikel beitrag beitraege nachrichten neuigkeiten aktuelles
138
+ articulo articulos noticia noticias entrada entradas publicacion publicaciones
139
+ actualite actualites nouvelle nouvelles
140
+ teaser teasers card cards
141
+ ].to_set.freeze,
142
+ utility: %w[
143
+ about account archive archives author authors category categories comment comments
144
+ contact feedback help login logout newsletter newsletters notification notifications
145
+ preference preferences profile register search settings share signup subscribe
146
+ tag tags topic topics
147
+ feed feeds comment-feed comments-feed
148
+ recommended
149
+ for-you
150
+ privacy terms cookie cookies
151
+ join member members membership plus premium plans pricing user users
152
+ kategorie kategorien schlagwort schlagworte thema themen autor autoren archiv
153
+ ueber-uns ueber ueberuns profil kontakt impressum suche hilfe anmelden registrieren
154
+ konto registrierung anmeldung abonnieren abo datenschutz nutzungsbedingungen agb
155
+ categoria categorias etiqueta etiquetas tema temas autores archivos
156
+ sobre-nosotros sobre quienes-somos buscar busqueda ayuda entrar ingresar
157
+ registrarse registro cuenta suscribirse boletin privacidad condiciones
158
+ categorie etiquette etiquettes sujet sujets theme themes auteur auteurs
159
+ a-propos apropos recherche rechercher aide connexion s-inscrire
160
+ sinscrire inscription compte s-abonner saboner lettre-information confidentialite mentions-legales cgu
161
+ menu sidebar widget social modal popup banner promo ad ads
162
+ related recommendation recommendations pagination pager
163
+ ].to_set.freeze,
164
+ high_confidence_junk: %w[
165
+ about account archive archives author authors category categories comment comments
166
+ contact cookie cookies feedback feed feeds help login logout notification notifications
167
+ preference preferences privacy profile register search settings share signup subscribe
168
+ tag tags terms topic topics comment-feed comments-feed user users
169
+ kategorie kategorien schlagwort schlagworte thema themen autor autoren archiv
170
+ ueber-uns ueber ueberuns profil kontakt impressum suche hilfe anmelden registrieren
171
+ konto registrierung anmeldung abonnieren abo datenschutz nutzungsbedingungen agb
172
+ categoria categorias etiqueta etiquetas tema temas autores archivos
173
+ sobre-nosotros sobre quienes-somos buscar busqueda ayuda entrar ingresar
174
+ registrarse registro cuenta suscribirse boletin privacidad condiciones
175
+ categorie etiquette etiquettes sujet sujets theme themes auteur auteurs
176
+ a-propos apropos recherche rechercher aide connexion s-inscrire
177
+ sinscrire inscription compte s-abonner saboner lettre-information confidentialite mentions-legales cgu
178
+ menu sidebar widget social modal popup banner promo ad ads
179
+ related recommendation recommendations pagination pager
180
+ ].to_set.freeze,
181
+ taxonomy: %w[
182
+ category categories tag tags topic topics
183
+ kategorie kategorien schlagwort schlagworte thema themen
184
+ categoria categorias etiqueta etiquetas tema temas
185
+ categorie etiquette etiquettes sujet sujets theme themes
186
+ ].to_set.freeze,
187
+ vanity: %w[
188
+ join membership plus premium pricing plans subscribe signup
189
+ abonnieren abo
190
+ suscribirse boletin
191
+ s-abonner saboner
192
+ ].to_set.freeze,
193
+ deep_post_context: %w[
194
+ press newsroom
195
+ presse pressemitteilungen
196
+ prensa
197
+ ].to_set.freeze
198
+ }.freeze
199
+ # Path segment that begins with a year-like publishing marker.
200
+ YEARISH_SEGMENT = /\A\d{4,}[\w-]*\z/
201
+ # Hyphenated slug shape common to article permalinks.
202
+ POST_SLUG_SEGMENT = /\A[a-z0-9]+(?:-[a-z0-9]+){2,}\z/i
203
+
204
+ # @param segments [Array<String>] normalized URL path segments
205
+ def initialize(segments)
206
+ @segments = segments
207
+ end
208
+
209
+ # @return [Hash] destination attributes consumed by DestinationFacts
210
+ def destination_attributes
211
+ route_attributes.merge(confidence_attributes)
212
+ end
213
+
214
+ # @return [Hash] baseline path classification attributes
215
+ def route_attributes
216
+ {
217
+ segments:,
218
+ content_path: content_path?,
219
+ utility_path: utility_path?,
220
+ taxonomy_path: taxonomy_path?,
221
+ vanity_path: vanity_path?,
222
+ shallow: shallow?,
223
+ strong_post_suffix: strong_post_suffix?
224
+ }
225
+ end
226
+
227
+ # @return [Hash] high-confidence noise classification attributes
228
+ def confidence_attributes
229
+ ConfidenceClassifier.new(self).attributes
230
+ end
231
+
232
+ # @return [Boolean] true when the route has article-like path evidence
233
+ def content_path?
234
+ @content_path ||= SEGMENT_SETS.fetch(:content).intersect?(segments.to_set) ||
235
+ yearish_content_context?
236
+ end
237
+
238
+ # @return [Boolean] true when the route includes utility/navigation evidence
239
+ def utility_path?
240
+ @utility_path ||= SEGMENT_SETS.fetch(:utility).intersect?(segments.to_set)
241
+ end
242
+
243
+ # @return [Boolean] true when the route points at conversion or account chrome
244
+ def vanity_path?
245
+ @vanity_path ||= SEGMENT_SETS.fetch(:vanity).intersect?(segments.to_set)
246
+ end
247
+
248
+ # @return [Boolean] true when the route points at taxonomy/listing chrome
249
+ def taxonomy_path?
250
+ @taxonomy_path ||= SEGMENT_SETS.fetch(:taxonomy).intersect?(segments.to_set)
251
+ end
252
+
253
+ # @return [Boolean] true when the route is too shallow to strongly indicate an article
254
+ def shallow?
255
+ segment_count = segments.size
256
+ junk_segments = SEGMENT_SETS.fetch(:high_confidence_junk)
257
+
258
+ segment_count <= 1 || (segment_count == 2 && junk_segments.include?(segments.last))
259
+ end
260
+
261
+ # @return [Boolean] true when the final path segment looks like a post slug
262
+ def strong_post_suffix?
263
+ PostSuffixClassifier.new(segments).strong?
264
+ end
265
+
266
+ # @return [Boolean] true when every path segment is utility chrome
267
+ def utility_only_route?
268
+ junk_segments = SEGMENT_SETS.fetch(:high_confidence_junk)
269
+
270
+ segments.all? { |segment| junk_segments.include?(segment) }
271
+ end
272
+
273
+ # @return [Boolean] true when the route is shallow and contains high-confidence noise
274
+ def shallow_high_confidence_route?
275
+ junk_segments = SEGMENT_SETS.fetch(:high_confidence_junk)
276
+ vanity_segments = SEGMENT_SETS.fetch(:vanity)
277
+
278
+ shallow? && segments.any? do |segment|
279
+ junk_segments.include?(segment) || vanity_segments.include?(segment)
280
+ end
281
+ end
282
+
283
+ # @return [Boolean] true when the leading segments are all utility chrome
284
+ def deep_utility_context_route?
285
+ LeadingSegments.new(segments).all_junk?
286
+ end
287
+
288
+ private
289
+
290
+ def yearish_content_context?
291
+ segments.any? { |segment| segment.match?(YEARISH_SEGMENT) } &&
292
+ (strong_post_suffix? || LeadingSegments.new(segments).trusted_post_context?)
293
+ end
294
+ end
295
+ # rubocop:enable Metrics/ClassLength
296
+
# Derives the two high-confidence noise flags from an already classified path.
class ConfidenceClassifier
  # @param path [PathClassifier] classified destination path
  def initialize(path)
    @path = path
  end

  # @return [Hash] high-confidence route classification attributes
  def attributes
    junk = junk_path?
    utility = utility_destination?

    { high_confidence_junk_path: junk, high_confidence_utility_destination: utility }
  end

  private

  # Junk: noise context, or a shallow route containing high-confidence noise.
  def junk_path?
    return false if content_like_route?

    noise_context_route? || @path.shallow_high_confidence_route?
  end

  # Utility destination: vanity route, noise context, or a shallow utility route.
  def utility_destination?
    return false if content_like_route?
    return true if @path.vanity_path? == true

    @path.vanity_path? || noise_context_route? || (@path.shallow? && @path.utility_path?)
  end

  # Routes that must never be flagged: the root path, content routes, and
  # routes ending in a strong post suffix.
  def content_like_route?
    @path.segments.empty? || @path.content_path? || @path.strong_post_suffix?
  end

  # Checks shared by both flags: taxonomy, all-utility, or utility-led routes.
  def noise_context_route?
    @path.taxonomy_path? ||
      @path.utility_only_route? ||
      @path.deep_utility_context_route?
  end
end
344
+
# Inspects the path segments that come before the final one.
class LeadingSegments
  # @param segments [Array<String>] normalized URL path segments
  def initialize(segments)
    # Keep everything except the last segment.
    @segments = segments[0...-1]
  end

  # @return [Boolean] true when every leading segment is utility chrome
  def all_junk?
    return false unless @segments.any?

    junk = PathClassifier::SEGMENT_SETS.fetch(:high_confidence_junk)
    @segments.none? { |segment| !junk.include?(segment) }
  end

  # @return [Boolean] true when leading segments provide article context
  def trusted_post_context?
    content = PathClassifier::SEGMENT_SETS.fetch(:content)
    context = PathClassifier::SEGMENT_SETS.fetch(:deep_post_context)

    @segments.any? do |segment|
      next true if content.include?(segment)
      next true if segment.match?(PathClassifier::YEARISH_SEGMENT)

      context.include?(segment)
    end
  end
end
371
+
# Decides whether a route ends in a post-like segment backed by trusted leading context.
class PostSuffixClassifier
  # @param segments [Array<String>] normalized URL path segments
  def initialize(segments)
    @segments = segments
  end

  # @return [Boolean] true when the final path segment looks like a post slug
  def strong?
    return false unless @segments.any?
    return false unless post_like_final_segment?

    LeadingSegments.new(@segments).trusted_post_context?
  end

  private

  # The final segment must look like a slug and must not be a known junk or vanity word.
  def post_like_final_segment?
    return false if junk_or_vanity_final_segment?

    slug_shaped_final_segment?
  end

  def junk_or_vanity_final_segment?
    PathClassifier::SEGMENT_SETS.fetch(:high_confidence_junk).include?(final_segment) ||
      PathClassifier::SEGMENT_SETS.fetch(:vanity).include?(final_segment)
  end

  def slug_shaped_final_segment?
    [PathClassifier::YEARISH_SEGMENT, PathClassifier::POST_SLUG_SEGMENT].any? do |pattern|
      final_segment.match?(pattern)
    end
  end

  def final_segment
    @segments.last
  end
end
412
+
# Stores the page URL and prepares the anchor-text classifier.
#
# @param base_url [String, Html2rss::Url] page URL used to resolve relative hrefs
def initialize(base_url)
  @text_classifier = TextClassifier.new
  @base_url = base_url
end
418
+
# Resolves an anchor (or raw href) against the base URL and classifies its route.
#
# @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
# @return [DestinationFacts, nil] normalized destination facts; nil when the href is
#   blank or an ArgumentError is raised while resolving or classifying it
def destination_facts(anchor_or_href)
  href = HrefExtractor.call(anchor_or_href)

  href ? DestinationFacts.build(Html2rss::Url.from_relative(href, @base_url)) : nil
rescue ArgumentError
  nil
end
432
+
# Delegates to the text classifier's utility-label check.
#
# @param text [String, #to_s] visible anchor text
# @return [Boolean] true when text matches a utility label
def utility_text?(text)
  @text_classifier.utility?(text)
end
436
+
# Delegates to the text classifier's utility-prefix check.
#
# @param text [String, #to_s] visible anchor text
# @return [Boolean] true when text begins with a utility label
def utility_prefix_text?(text)
  @text_classifier.utility_prefix?(text)
end
440
+
# Delegates to the text classifier's recommendation-label check.
#
# @param text [String, #to_s] visible anchor text
# @return [Boolean] true when text identifies recommendation chrome
def recommended_text?(text)
  @text_classifier.recommended?(text)
end
444
+ end
445
+ end
446
+ end
447
+ end