html2rss 0.20.1 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. checksums.yaml +4 -4
  2. data/html2rss.gemspec +1 -2
  3. data/lib/html2rss/auto_source/scraper/html.rb +61 -16
  4. data/lib/html2rss/auto_source/scraper/json_state.rb +40 -27
  5. data/lib/html2rss/auto_source/scraper/link_heuristics.rb +85 -131
  6. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +74 -28
  7. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -2
  8. data/lib/html2rss/auto_source/scraper/schema/thing.rb +31 -60
  9. data/lib/html2rss/auto_source/scraper/schema.rb +8 -2
  10. data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +4 -18
  11. data/lib/html2rss/auto_source/scraper/semantic_html.rb +55 -11
  12. data/lib/html2rss/auto_source/scraper.rb +0 -3
  13. data/lib/html2rss/auto_source.rb +2 -11
  14. data/lib/html2rss/category_extractor.rb +54 -20
  15. data/lib/html2rss/html_extractor/enclosure_extractor.rb +60 -89
  16. data/lib/html2rss/html_extractor/list_candidates.rb +2 -8
  17. data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +29 -12
  18. data/lib/html2rss/html_extractor/semantic_containers.rb +9 -35
  19. data/lib/html2rss/html_extractor.rb +51 -30
  20. data/lib/html2rss/rendering/description_builder.rb +3 -3
  21. data/lib/html2rss/rss_builder/article.rb +44 -23
  22. data/lib/html2rss/rss_builder/enclosure.rb +4 -2
  23. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +25 -36
  24. data/lib/html2rss/selectors/post_processors/substring.rb +11 -18
  25. data/lib/html2rss/selectors/post_processors/template.rb +3 -2
  26. data/lib/html2rss/selectors.rb +18 -4
  27. data/lib/html2rss/url.rb +4 -3
  28. data/lib/html2rss/version.rb +1 -1
  29. metadata +3 -17
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3eb20836fb55a5e33d114634c7c20bf2a65afd1a923af1bba82896797fdb099d
4
- data.tar.gz: 97c00923e0ca5744cf82f22d54aa428cc784cba3e259bbcdcd9252928f9bc3c0
3
+ metadata.gz: 8168109d2cc60920d8a18b6b99970a5558e43163ad5cd11cb3d3f0d944d46943
4
+ data.tar.gz: 833a936f89f9ce31c0b4fb0036020c7962a4ac77e0dfa72f1134a0bae8bea4c4
5
5
  SHA512:
6
- metadata.gz: 36431edafddcca32a53f562a75fbcd77fae969ca8c8fa7c2b6de77f121add2ee3c34c2b6e2e7b57109742780d62eeddde823b1a9bf8f4c1aaeed08a0ae4e5c90
7
- data.tar.gz: d3eafa9cbbecc5ccaded7b21508e0af1c43337999cb5651f2d879df5217c1aa1f3e5484bcfd6354037d709715db1cc106f2946dd592745c0ac58252bcdd26ac8
6
+ metadata.gz: 734f286a486d49c86ab7baf48d157cdee9d988fdc8b693ac7d79bf3c64c661fcd54538d5e94dc19bdc8a6f3021168c1ecac2d8e34417f56879392d71600c7340
7
+ data.tar.gz: f008a767b452557cff1b45b1abb0eccb26f38d839417e95d21b8cf74f4546f9143b067e2d11f9fc00f5955c3f145f8933a1b1d4912ad25756c890280d4bb1a37
data/html2rss.gemspec CHANGED
@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
14
14
  spec.description = 'Supports JSON content, custom HTTP headers, and post-processing of extracted content.'
15
15
  spec.homepage = 'https://github.com/html2rss/html2rss'
16
16
  spec.license = 'MIT'
17
- spec.required_ruby_version = '>= 3.2'
17
+ spec.required_ruby_version = '>= 3.3'
18
18
 
19
19
  if spec.respond_to?(:metadata)
20
20
  spec.metadata['allowed_push_host'] = 'https://rubygems.org'
@@ -41,7 +41,6 @@ Gem::Specification.new do |spec|
41
41
  spec.add_dependency 'kramdown'
42
42
  spec.add_dependency 'mime-types', '> 3.0'
43
43
  spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
44
- spec.add_dependency 'parallel'
45
44
  spec.add_dependency 'puppeteer-ruby'
46
45
  spec.add_dependency 'regexp_parser'
47
46
  spec.add_dependency 'reverse_markdown', '~> 3.0'
@@ -63,6 +63,7 @@ module Html2rss
63
63
  @extractor = extractor
64
64
  @opts = opts
65
65
  @link_heuristics = LinkHeuristics.new(url)
66
+ @ignored_cache = {}.compare_by_identity
66
67
  end
67
68
 
68
69
  attr_reader :parsed_body
@@ -73,10 +74,13 @@ module Html2rss
73
74
  def each
74
75
  return enum_for(:each) unless block_given?
75
76
 
76
- each_article_tag do |article_tag, selected_anchor|
77
- article_hash = extract_article(article_tag, selected_anchor:)
78
- yield article_hash if article_hash
79
- end
77
+ articles.each { yield _1 }
78
+ end
79
+
80
+ ##
81
+ # @return [Boolean] true when the scraper can likely extract articles
82
+ def extractable?
83
+ articles.any?
80
84
  end
81
85
 
82
86
  ##
@@ -91,7 +95,7 @@ module Html2rss
91
95
  # @return [Boolean] true when the node is a good extraction boundary
92
96
  def article_tag_condition?(node)
93
97
  # Ignore tags that are below ignored DOM chrome.
94
- return false if HtmlExtractor.ignored_container_path?(node)
98
+ return false if HtmlExtractor.ignored_container_path?(node, @ignored_cache)
95
99
  return true if %w[body html].include?(node.name)
96
100
  return false unless (parent = node.parent)
97
101
 
@@ -100,14 +104,30 @@ module Html2rss
100
104
 
101
105
  private
102
106
 
107
+ def articles
108
+ @articles ||= each_article_tag.filter_map do |article_tag, selected_anchor|
109
+ extract_article(article_tag, selected_anchor:)
110
+ end
111
+ end
112
+
113
+ ##
114
+ # @return [Integer]
103
115
  def minimum_selector_frequency = @opts[:minimum_selector_frequency] || DEFAULT_MINIMUM_SELECTOR_FREQUENCY
116
+
117
+ ##
118
+ # @return [Boolean]
104
119
  def use_top_selectors = @opts[:use_top_selectors] || DEFAULT_USE_TOP_SELECTORS
105
120
 
121
+ ##
122
+ # @param node [Nokogiri::XML::Node]
123
+ # @return [Integer]
106
124
  def anchor_count(node)
107
- @anchor_counts ||= {}
108
- @anchor_counts[node.path] ||= node.name == 'a' ? 1 : node.css('a').size
125
+ (@anchor_counts ||= {}.compare_by_identity)[node] ||= node.name == 'a' ? 1 : node.css('a').size
109
126
  end
110
127
 
128
+ ##
129
+ # @param node [Nokogiri::XML::Node]
130
+ # @return [Boolean]
111
131
  def relevant_anchor?(node)
112
132
  destination_facts = @link_heuristics.destination_facts(node)
113
133
  return false unless destination_facts
@@ -115,14 +135,24 @@ module Html2rss
115
135
  !noise_anchor?(node, destination_facts)
116
136
  end
117
137
 
138
+ ##
139
+ # @yield [article_tag, selected_anchor]
140
+ # @yieldparam article_tag [Nokogiri::XML::Node]
141
+ # @yieldparam selected_anchor [Nokogiri::XML::Node]
142
+ # @return [Enumerator, nil]
118
143
  def each_article_tag(&block)
119
144
  return enum_for(:each_article_tag) unless block
120
145
 
121
- list_candidates.each_article_tag(anchor_filter: method(:relevant_anchor?),
122
- boundary_condition: method(:article_tag_condition?),
123
- &block)
146
+ anchor_filter = ->(node) { relevant_anchor?(node) }
147
+ boundary_condition = ->(node) { article_tag_condition?(node) }
148
+
149
+ list_candidates.each_article_tag(anchor_filter:, boundary_condition:, &block)
124
150
  end
125
151
 
152
+ ##
153
+ # @param article_tag [Nokogiri::XML::Node]
154
+ # @param selected_anchor [Nokogiri::XML::Node, nil]
155
+ # @return [Hash, nil]
126
156
  def extract_article(article_tag, selected_anchor: nil)
127
157
  selected_anchor ||= preferred_anchor_for(article_tag)
128
158
  return unless selected_anchor
@@ -131,18 +161,28 @@ module Html2rss
131
161
  @extractor.new(article_tag, base_url: @url, selected_anchor:).call
132
162
  end
133
163
 
164
+ ##
165
+ # @param anchor [Nokogiri::XML::Node]
166
+ # @param destination_facts [DestinationFacts]
167
+ # @return [Boolean]
134
168
  def noise_anchor?(anchor, destination_facts) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
135
169
  return true unless destination_facts
136
170
 
137
- text = HtmlExtractor.extract_visible_text(anchor).to_s.strip
171
+ (@noise_anchors ||= {}.compare_by_identity)[anchor] ||= begin
172
+ text = HtmlExtractor.extract_visible_text(anchor).to_s.strip
138
173
 
139
- destination_facts.taxonomy_path ||
140
- short_utility_label?(text, destination_facts) ||
141
- (@link_heuristics.recommended_text?(text) && destination_facts.shallow) ||
142
- (@link_heuristics.utility_prefix_text?(text) && destination_facts.high_confidence_utility_destination) ||
143
- (@link_heuristics.utility_text?(text) && destination_facts.vanity_path)
174
+ destination_facts.taxonomy_path ||
175
+ short_utility_label?(text, destination_facts) ||
176
+ (@link_heuristics.recommended_text?(text) && destination_facts.shallow) ||
177
+ (@link_heuristics.utility_prefix_text?(text) && destination_facts.high_confidence_utility_destination) ||
178
+ (@link_heuristics.utility_text?(text) && destination_facts.vanity_path)
179
+ end
144
180
  end
145
181
 
182
+ ##
183
+ # @param text [String]
184
+ # @param destination_facts [DestinationFacts]
185
+ # @return [Boolean]
146
186
  def short_utility_label?(text, destination_facts)
147
187
  destination_facts.utility_path &&
148
188
  !destination_facts.content_path &&
@@ -150,11 +190,16 @@ module Html2rss
150
190
  text.scan(/\p{Alnum}+/).size <= 3
151
191
  end
152
192
 
193
+ ##
194
+ # @param article_tag [Nokogiri::XML::Node]
195
+ # @return [Nokogiri::XML::Node, nil]
153
196
  def preferred_anchor_for(article_tag)
154
197
  article_tag.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR).find { relevant_anchor?(_1) } ||
155
198
  HtmlExtractor.main_anchor_for(article_tag)
156
199
  end
157
200
 
201
+ ##
202
+ # @return [HtmlExtractor::ListCandidates]
158
203
  def list_candidates
159
204
  HtmlExtractor::ListCandidates.new(
160
205
  parsed_body,
@@ -30,6 +30,9 @@ module Html2rss
30
30
  /(?:window|self|globalThis)\.angular\s*=\s*/m
31
31
  ].freeze
32
32
 
33
+ # Combined regex for faster matching of global assignments.
34
+ GLOBAL_ASSIGNMENT_REGEXP = Regexp.union(GLOBAL_ASSIGNMENT_PATTERNS).freeze
35
+
33
36
  # Preferred keys when extracting title-like values from state payloads.
34
37
  TITLE_KEYS = %i[title headline name text].freeze
35
38
  # Preferred keys when extracting URL-like values from state payloads.
@@ -53,7 +56,12 @@ module Html2rss
53
56
  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
54
57
  # @return [Array<Hash, Array>] parsed JSON documents discovered in scripts
55
58
  def json_documents(parsed_body)
56
- script_documents(parsed_body) + assignment_documents(parsed_body)
59
+ # Use identity-based cache to avoid double-parsing of the same document.
60
+ # WeakMap allows the Nokogiri Document (key) to be garbage collected.
61
+ # rubocop:disable ThreadSafety/ClassInstanceVariable
62
+ (@cache ||= ObjectSpace::WeakMap.new)[parsed_body] ||=
63
+ script_documents(parsed_body) + assignment_documents(parsed_body)
64
+ # rubocop:enable ThreadSafety/ClassInstanceVariable
57
65
  end
58
66
 
59
67
  # @param parsed_body [Nokogiri::HTML::Document] parsed HTML document
@@ -80,15 +88,10 @@ module Html2rss
80
88
  def assignment_payload(text)
81
89
  trimmed = text.to_s.strip
82
90
  return if trimmed.empty?
91
+ return unless trimmed.match?(GLOBAL_ASSIGNMENT_REGEXP)
83
92
 
84
- GLOBAL_ASSIGNMENT_PATTERNS.each do |pattern|
85
- next unless trimmed.match?(pattern)
86
-
87
- payload = trimmed.sub(pattern, '')
88
- return extract_assignment_payload(payload)
89
- end
90
-
91
- nil
93
+ payload = trimmed.sub(GLOBAL_ASSIGNMENT_REGEXP, '')
94
+ extract_assignment_payload(payload)
92
95
  end
93
96
 
94
97
  # @param text [String] text potentially containing JSON-like payloads
@@ -116,8 +119,10 @@ module Html2rss
116
119
  in_string = false
117
120
  escape = false
118
121
 
119
- text.each_char.with_index do |char, index|
120
- next if index < start_index
122
+ i = start_index
123
+ len = text.length
124
+ while i < len
125
+ char = text[i]
121
126
 
122
127
  if in_string
123
128
  if escape
@@ -127,24 +132,22 @@ module Html2rss
127
132
  elsif char == '"'
128
133
  in_string = false
129
134
  end
130
- next
131
- end
132
-
133
- case char
134
- when '"'
135
- in_string = true
136
- when '{'
137
- stack << '}'
138
- when '['
139
- stack << ']'
140
- when '}', ']'
141
- expected = stack.pop
142
- return index if expected == char && stack.empty?
135
+ else
136
+ case char
137
+ when '"' then in_string = true
138
+ when '{' then stack << '}'
139
+ when '[' then stack << ']'
140
+ when '}', ']'
141
+ expected = stack.pop
142
+ return i if expected == char && stack.empty?
143
+ end
143
144
  end
145
+ i += 1
144
146
  end
145
147
 
146
148
  nil
147
149
  end
150
+
148
151
  # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength, Metrics/PerceivedComplexity
149
152
 
150
153
  # @param payload [String, nil] JSON payload to parse
@@ -184,8 +187,9 @@ module Html2rss
184
187
  # @param jsonish [String] JSON-like string with potentially unquoted keys
185
188
  # @return [String] payload with unquoted object keys quoted
186
189
  def quote_unquoted_keys(jsonish)
187
- jsonish.gsub(/(\A\s*|[{,\[]\s*)([A-Za-z_]\w*)(\s*:)/) do
188
- "#{Regexp.last_match(1)}\"#{Regexp.last_match(2)}\"#{Regexp.last_match(3)}"
190
+ jsonish.gsub(/(?<prefix>\A\s*|[{,\[]\s*)(?<key>[A-Za-z_]\w*)(?<suffix>\s*:)/) do
191
+ captures = Regexp.last_match.named_captures(symbolize_names: true)
192
+ "#{captures[:prefix]}\"#{captures[:key]}\"#{captures[:suffix]}"
189
193
  end
190
194
  end
191
195
 
@@ -415,12 +419,17 @@ module Html2rss
415
419
 
416
420
  attr_reader :parsed_body
417
421
 
422
+ # @return [Boolean] true when the page contains article-like arrays in JSON state
423
+ def extractable?
424
+ json_documents.any? { CandidateDetector.candidate_array?(_1) }
425
+ end
426
+
418
427
  # @yield [Hash{Symbol => Object}] normalized article hash
419
428
  # @return [Enumerator, void] article enumerator when no block is given
420
429
  def each
421
430
  return enum_for(:each) unless block_given?
422
431
 
423
- DocumentScanner.json_documents(parsed_body).each do |document|
432
+ json_documents.each do |document|
424
433
  discover_articles(document) do |article|
425
434
  yield article if article
426
435
  end
@@ -431,6 +440,10 @@ module Html2rss
431
440
 
432
441
  attr_reader :url
433
442
 
443
+ def json_documents
444
+ self.class.json_documents(parsed_body)
445
+ end
446
+
434
447
  def discover_articles(document, &block)
435
448
  case document
436
449
  when Array then handle_array(document, &block)
@@ -24,19 +24,30 @@ module Html2rss
24
24
  ) do
25
25
  # @param url [Html2rss::Url] normalized destination URL
26
26
  # @return [DestinationFacts] route facts for downstream link scoring
27
- def self.build(url)
27
+ def self.build(url) # rubocop:disable Metrics/MethodLength
28
28
  classifier = PathClassifier.new(url.path_segments)
29
29
 
30
30
  new(
31
31
  url:,
32
32
  destination: url.to_s,
33
- **classifier.destination_attributes
33
+ segments: classifier.segments,
34
+ strong_post_suffix: classifier.strong_post_suffix?,
35
+ content_path: classifier.content_path?,
36
+ utility_path: classifier.utility_path?,
37
+ taxonomy_path: classifier.taxonomy_path?,
38
+ vanity_path: classifier.vanity_path?,
39
+ shallow: classifier.shallow?,
40
+ high_confidence_junk_path: classifier.junk_path?,
41
+ high_confidence_utility_destination: classifier.utility_destination?
34
42
  )
35
43
  end
36
44
  end
37
45
 
38
46
  # Extracts a normalized href from a Nokogiri anchor or raw href value.
39
47
  class HrefExtractor
48
+ # Regexp to capture everything before the first '#'
49
+ HREF_BASE_PATTERN = /\A([^#]*)/
50
+
40
51
  # @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
41
52
  # @return [String, nil] href without fragment, or nil when blank
42
53
  def self.call(anchor_or_href) = new(anchor_or_href).call
@@ -48,20 +59,18 @@ module Html2rss
48
59
 
49
60
  # @return [String, nil] href without fragment, or nil when blank
50
61
  def call
51
- raw_href.to_s.split('#', 2).first.to_s.strip.then do |href|
52
- href unless href.empty?
53
- end
54
- end
62
+ href = case @anchor_or_href
63
+ when Nokogiri::XML::Node
64
+ @anchor_or_href['href']
65
+ else
66
+ @anchor_or_href
67
+ end
55
68
 
56
- private
69
+ return unless href
57
70
 
58
- def raw_href
59
- case @anchor_or_href
60
- when Nokogiri::XML::Node
61
- @anchor_or_href['href']
62
- else
63
- @anchor_or_href
64
- end
71
+ # Extract base part before # and strip whitespace
72
+ base = href.to_s[HREF_BASE_PATTERN, 1].strip
73
+ base unless base.empty?
65
74
  end
66
75
  end
67
76
 
@@ -125,8 +134,7 @@ module Html2rss
125
134
  end
126
135
 
127
136
  # Classifies normalized destination path segments for scoring.
128
- # rubocop:disable Metrics/ClassLength
129
- class PathClassifier
137
+ class PathClassifier # rubocop:disable Metrics/ClassLength
130
138
  attr_reader :segments
131
139
 
132
140
  # Segment groups used to classify article, taxonomy, utility, and vanity routes.
@@ -206,48 +214,25 @@ module Html2rss
206
214
  @segments = segments
207
215
  end
208
216
 
209
- # @return [Hash] destination attributes consumed by DestinationFacts
210
- def destination_attributes
211
- route_attributes.merge(confidence_attributes)
212
- end
213
-
214
- # @return [Hash] baseline path classification attributes
215
- def route_attributes
216
- {
217
- segments:,
218
- content_path: content_path?,
219
- utility_path: utility_path?,
220
- taxonomy_path: taxonomy_path?,
221
- vanity_path: vanity_path?,
222
- shallow: shallow?,
223
- strong_post_suffix: strong_post_suffix?
224
- }
225
- end
226
-
227
- # @return [Hash] high-confidence noise classification attributes
228
- def confidence_attributes
229
- ConfidenceClassifier.new(self).attributes
230
- end
231
-
232
217
  # @return [Boolean] true when the route has article-like path evidence
233
218
  def content_path?
234
- @content_path ||= SEGMENT_SETS.fetch(:content).intersect?(segments.to_set) ||
219
+ @content_path ||= segments.any? { |s| SEGMENT_SETS[:content].include?(s) } ||
235
220
  yearish_content_context?
236
221
  end
237
222
 
238
223
  # @return [Boolean] true when the route includes utility/navigation evidence
239
224
  def utility_path?
240
- @utility_path ||= SEGMENT_SETS.fetch(:utility).intersect?(segments.to_set)
225
+ @utility_path ||= segments.any? { |s| SEGMENT_SETS[:utility].include?(s) }
241
226
  end
242
227
 
243
228
  # @return [Boolean] true when the route points at conversion or account chrome
244
229
  def vanity_path?
245
- @vanity_path ||= SEGMENT_SETS.fetch(:vanity).intersect?(segments.to_set)
230
+ @vanity_path ||= segments.any? { |s| SEGMENT_SETS[:vanity].include?(s) }
246
231
  end
247
232
 
248
233
  # @return [Boolean] true when the route points at taxonomy/listing chrome
249
234
  def taxonomy_path?
250
- @taxonomy_path ||= SEGMENT_SETS.fetch(:taxonomy).intersect?(segments.to_set)
235
+ @taxonomy_path ||= segments.any? { |s| SEGMENT_SETS[:taxonomy].include?(s) }
251
236
  end
252
237
 
253
238
  # @return [Boolean] true when the route is too shallow to strongly indicate an article
@@ -260,7 +245,9 @@ module Html2rss
260
245
 
261
246
  # @return [Boolean] true when the final path segment looks like a post slug
262
247
  def strong_post_suffix?
263
- PostSuffixClassifier.new(segments).strong?
248
+ @strong_post_suffix ||= segments.any? &&
249
+ included_last_segment? &&
250
+ trusted_post_context?(segments.size - 1)
264
251
  end
265
252
 
266
253
  # @return [Boolean] true when every path segment is utility chrome
@@ -282,131 +269,81 @@ module Html2rss
282
269
 
283
270
  # @return [Boolean] true when the leading segments are all utility chrome
284
271
  def deep_utility_context_route?
285
- LeadingSegments.new(segments).all_junk?
272
+ all_junk?(segments.size - 1)
286
273
  end
287
274
 
288
- private
289
-
290
- def yearish_content_context?
291
- segments.any? { |segment| segment.match?(YEARISH_SEGMENT) } &&
292
- (strong_post_suffix? || LeadingSegments.new(segments).trusted_post_context?)
293
- end
294
- end
295
- # rubocop:enable Metrics/ClassLength
296
-
297
- # Classifies high-confidence junk and utility routes from path facts.
298
- class ConfidenceClassifier
299
- # @param path [PathClassifier] classified destination path
300
- def initialize(path)
301
- @path = path
302
- end
303
-
304
- # @return [Hash] high-confidence route classification attributes
305
- def attributes
306
- {
307
- high_confidence_junk_path: junk_path?,
308
- high_confidence_utility_destination: utility_destination?
309
- }
310
- end
311
-
312
- private
313
-
275
+ # @return [Boolean] true when the route is shallow and contains high-confidence noise
314
276
  def junk_path?
315
277
  return false if excluded_content_route?
316
278
 
317
- @path.taxonomy_path? ||
318
- @path.utility_only_route? ||
319
- @path.deep_utility_context_route? ||
320
- @path.shallow_high_confidence_route?
279
+ taxonomy_path? ||
280
+ utility_only_route? ||
281
+ deep_utility_context_route? ||
282
+ shallow_high_confidence_route?
321
283
  end
322
284
 
285
+ # @return [Boolean] true when the route points at conversion or account chrome
323
286
  def utility_destination?
324
287
  return false if excluded_content_route?
325
288
 
326
- @path.vanity_path? || utility_route?
289
+ vanity_path? || utility_route?
290
+ end
291
+
292
+ private
293
+
294
+ def yearish_content_context?
295
+ segments.any? { |segment| segment.match?(YEARISH_SEGMENT) } &&
296
+ (strong_post_suffix? || trusted_post_context?(segments.size - 1))
327
297
  end
328
298
 
329
299
  def excluded_content_route?
330
- @path.segments.empty? || @path.content_path? || @path.strong_post_suffix?
300
+ segments.empty? || content_path? || strong_post_suffix?
331
301
  end
332
302
 
333
303
  def utility_route?
334
- @path.taxonomy_path? ||
335
- @path.utility_only_route? ||
336
- @path.deep_utility_context_route? ||
304
+ taxonomy_path? ||
305
+ utility_only_route? ||
306
+ deep_utility_context_route? ||
337
307
  shallow_utility_route?
338
308
  end
339
309
 
340
310
  def shallow_utility_route?
341
- @path.shallow? && @path.utility_path?
342
- end
343
- end
344
-
345
- # Classifies route context before the final segment.
346
- class LeadingSegments
347
- # @param segments [Array<String>] normalized URL path segments
348
- def initialize(segments)
349
- @segments = segments[0...-1]
311
+ shallow? && utility_path?
350
312
  end
351
313
 
352
- # @return [Boolean] true when every leading segment is utility chrome
353
- def all_junk?
354
- junk_segments = PathClassifier::SEGMENT_SETS.fetch(:high_confidence_junk)
314
+ def all_junk?(limit)
315
+ return false if limit <= 0
355
316
 
356
- @segments.any? && @segments.all? { |segment| junk_segments.include?(segment) }
317
+ junk_segments = SEGMENT_SETS.fetch(:high_confidence_junk)
318
+ (0...limit).all? { |i| junk_segments.include?(segments[i]) }
357
319
  end
358
320
 
359
- # @return [Boolean] true when leading segments provide article context
360
- def trusted_post_context?
361
- content_segments = PathClassifier::SEGMENT_SETS.fetch(:content)
362
- context_segments = PathClassifier::SEGMENT_SETS.fetch(:deep_post_context)
321
+ def trusted_post_context?(limit)
322
+ return false if limit <= 0
323
+
324
+ content_segments = SEGMENT_SETS.fetch(:content)
325
+ context_segments = SEGMENT_SETS.fetch(:deep_post_context)
363
326
 
364
- @segments.any? do |segment|
327
+ (0...limit).any? do |i|
328
+ segment = segments[i]
365
329
  content_segments.include?(segment) ||
366
330
  segment.match?(PathClassifier::YEARISH_SEGMENT) ||
367
331
  context_segments.include?(segment)
368
332
  end
369
333
  end
370
- end
371
-
372
- # Classifies whether the final segment is a strong post-like suffix.
373
- class PostSuffixClassifier
374
- # @param segments [Array<String>] normalized URL path segments
375
- def initialize(segments)
376
- @segments = segments
377
- end
378
-
379
- # @return [Boolean] true when the final path segment looks like a post slug
380
- def strong?
381
- @segments.any? &&
382
- included_last_segment? &&
383
- LeadingSegments.new(@segments).trusted_post_context?
384
- end
385
-
386
- private
387
334
 
388
335
  def included_last_segment?
389
336
  !excluded_last_segment? && slug_last_segment?
390
337
  end
391
338
 
392
339
  def excluded_last_segment?
393
- excluded_segments.any? { |segment| segment.include?(last_segment) }
394
- end
395
-
396
- def excluded_segments
397
- [
398
- PathClassifier::SEGMENT_SETS.fetch(:high_confidence_junk),
399
- PathClassifier::SEGMENT_SETS.fetch(:vanity)
400
- ]
340
+ last = segments.last
341
+ [SEGMENT_SETS[:high_confidence_junk], SEGMENT_SETS[:vanity]].any? { |set| set.include?(last) }
401
342
  end
402
343
 
403
344
  def slug_last_segment?
404
- last_segment.match?(PathClassifier::YEARISH_SEGMENT) ||
405
- last_segment.match?(PathClassifier::POST_SLUG_SEGMENT)
406
- end
407
-
408
- def last_segment
409
- @segments.last
345
+ last = segments.last
346
+ last.match?(YEARISH_SEGMENT) || last.match?(POST_SLUG_SEGMENT)
410
347
  end
411
348
  end
412
349
 
@@ -421,11 +358,15 @@ module Html2rss
421
358
  # @param anchor_or_href [Nokogiri::XML::Element, String, #to_s] anchor element or href-like value
422
359
  # @return [DestinationFacts, nil] normalized destination facts, or nil for blank/invalid URLs
423
360
  def destination_facts(anchor_or_href)
361
+ return node_facts[anchor_or_href] if node_facts.key?(anchor_or_href)
362
+
424
363
  href = HrefExtractor.call(anchor_or_href)
425
364
  return unless href
426
365
 
427
- url = Html2rss::Url.from_relative(href, @base_url)
428
- DestinationFacts.build(url)
366
+ res = memoized_destination_facts(href)
367
+
368
+ node_facts[anchor_or_href] = res if anchor_or_href.is_a?(Nokogiri::XML::Node)
369
+ res
429
370
  rescue ArgumentError
430
371
  nil
431
372
  end
@@ -441,6 +382,19 @@ module Html2rss
441
382
  # @param text [String, #to_s] visible anchor text
442
383
  # @return [Boolean] true when text identifies recommendation chrome
443
384
  def recommended_text?(text) = @text_classifier.recommended?(text)
385
+
386
+ private
387
+
388
+ def node_facts
389
+ @node_facts ||= {}.compare_by_identity
390
+ end
391
+
392
+ def memoized_destination_facts(href)
393
+ (@destination_facts ||= {})[href] ||= begin
394
+ url = Html2rss::Url.from_relative(href, @base_url)
395
+ DestinationFacts.build(url)
396
+ end
397
+ end
444
398
  end
445
399
  end
446
400
  end