html2rss 0.20.1 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. checksums.yaml +4 -4
  2. data/html2rss.gemspec +1 -2
  3. data/lib/html2rss/auto_source/scraper/html.rb +61 -16
  4. data/lib/html2rss/auto_source/scraper/json_state.rb +40 -27
  5. data/lib/html2rss/auto_source/scraper/link_heuristics.rb +85 -131
  6. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +74 -28
  7. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -2
  8. data/lib/html2rss/auto_source/scraper/schema/thing.rb +31 -60
  9. data/lib/html2rss/auto_source/scraper/schema.rb +8 -2
  10. data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +4 -18
  11. data/lib/html2rss/auto_source/scraper/semantic_html.rb +55 -11
  12. data/lib/html2rss/auto_source/scraper.rb +0 -3
  13. data/lib/html2rss/auto_source.rb +2 -11
  14. data/lib/html2rss/category_extractor.rb +54 -20
  15. data/lib/html2rss/html_extractor/enclosure_extractor.rb +60 -89
  16. data/lib/html2rss/html_extractor/list_candidates.rb +2 -8
  17. data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +29 -12
  18. data/lib/html2rss/html_extractor/semantic_containers.rb +9 -35
  19. data/lib/html2rss/html_extractor.rb +51 -30
  20. data/lib/html2rss/rendering/description_builder.rb +3 -3
  21. data/lib/html2rss/rss_builder/article.rb +44 -23
  22. data/lib/html2rss/rss_builder/enclosure.rb +4 -2
  23. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +25 -36
  24. data/lib/html2rss/selectors/post_processors/substring.rb +11 -18
  25. data/lib/html2rss/selectors/post_processors/template.rb +3 -2
  26. data/lib/html2rss/selectors.rb +18 -4
  27. data/lib/html2rss/url.rb +4 -3
  28. data/lib/html2rss/version.rb +1 -1
  29. metadata +3 -17
@@ -13,11 +13,10 @@ module Html2rss
13
13
  # @param schema_object [Hash] The schema object
14
14
  # @return [Array<String>] Array of category strings
15
15
  def self.call(schema_object)
16
- # Build union of all category sources
17
- field_categories = extract_field_categories(schema_object)
18
- about_categories = extract_about_categories(schema_object)
19
-
20
- (field_categories | about_categories).to_a
16
+ Set.new.tap do |categories|
17
+ extract_field_categories!(categories, schema_object)
18
+ extract_about_categories!(categories, schema_object)
19
+ end.to_a
21
20
  end
22
21
 
23
22
  ##
@@ -26,10 +25,18 @@ module Html2rss
26
25
  # @param schema_object [Hash] The schema object
27
26
  # @return [Set<String>] Set of category strings
28
27
  def self.extract_field_categories(schema_object)
29
- Set.new.tap do |categories|
30
- %w[keywords categories tags].each do |field|
31
- categories.merge(extract_field_value(schema_object, field))
32
- end
28
+ Set.new.tap { |categories| extract_field_categories!(categories, schema_object) }
29
+ end
30
+
31
+ ##
32
+ # Extracts categories from keywords, categories, and tags fields.
33
+ #
34
+ # @param categories [Set<String>] Accumulator set
35
+ # @param schema_object [Hash] The schema object
36
+ # @return [void]
37
+ def self.extract_field_categories!(categories, schema_object)
38
+ %i[keywords categories tags].each do |field|
39
+ extract_field_value!(categories, schema_object[field])
33
40
  end
34
41
  end
35
42
 
@@ -39,15 +46,23 @@ module Html2rss
39
46
  # @param schema_object [Hash] The schema object
40
47
  # @return [Set<String>] Set of category strings
41
48
  def self.extract_about_categories(schema_object)
49
+ Set.new.tap { |categories| extract_about_categories!(categories, schema_object) }
50
+ end
51
+
52
+ ##
53
+ # Extracts categories from the about field.
54
+ #
55
+ # @param categories [Set<String>] Accumulator set
56
+ # @param schema_object [Hash] The schema object
57
+ # @return [void]
58
+ def self.extract_about_categories!(categories, schema_object)
42
59
  about = schema_object[:about]
43
- return Set.new unless about
60
+ return unless about
44
61
 
45
62
  if about.is_a?(Array)
46
- extract_about_array(about)
63
+ extract_about_array!(categories, about)
47
64
  elsif about.is_a?(String)
48
- extract_string_categories(about)
49
- else
50
- Set.new
65
+ extract_string_categories!(categories, about)
51
66
  end
52
67
  end
53
68
 
@@ -58,15 +73,25 @@ module Html2rss
58
73
  # @param field [String] The field name
59
74
  # @return [Set<String>] Set of category strings
60
75
  def self.extract_field_value(schema_object, field)
61
- value = schema_object[field.to_sym]
62
- return Set.new unless value
76
+ Set.new.tap { |categories| extract_field_value!(categories, schema_object[field.to_sym]) }
77
+ end
78
+
79
+ ##
80
+ # Extracts categories from a single field value.
81
+ #
82
+ # @param categories [Set<String>] Accumulator set
83
+ # @param value [Object] The field value
84
+ # @return [void]
85
+ def self.extract_field_value!(categories, value)
86
+ return unless value
63
87
 
64
88
  if value.is_a?(Array)
65
- Set.new(value.map(&:to_s).reject(&:empty?))
89
+ value.each do |item|
90
+ s = item.to_s
91
+ categories.add(s) unless s.empty?
92
+ end
66
93
  elsif value.is_a?(String)
67
- extract_string_categories(value)
68
- else
69
- Set.new
94
+ extract_string_categories!(categories, value)
70
95
  end
71
96
  end
72
97
 
@@ -76,13 +101,21 @@ module Html2rss
76
101
  # @param about [Array] The about array
77
102
  # @return [Set<String>] Set of category strings
78
103
  def self.extract_about_array(about)
79
- Set.new.tap do |categories|
80
- about.each do |item|
81
- if item.is_a?(Hash) && item[:name]
82
- categories.add(item[:name].to_s)
83
- elsif item.is_a?(String)
84
- categories.add(item)
85
- end
104
+ Set.new.tap { |categories| extract_about_array!(categories, about) }
105
+ end
106
+
107
+ ##
108
+ # Extracts categories from an about array.
109
+ #
110
+ # @param categories [Set<String>] Accumulator set
111
+ # @param about [Array] The about array
112
+ # @return [void]
113
+ def self.extract_about_array!(categories, about)
114
+ about.each do |item|
115
+ if item.is_a?(Hash) && item[:name]
116
+ categories.add(item[:name].to_s)
117
+ elsif item.is_a?(String)
118
+ categories.add(item)
86
119
  end
87
120
  end
88
121
  end
@@ -93,7 +126,20 @@ module Html2rss
93
126
  # @param string [String] source string that may contain category delimiters
94
127
  # @return [Set<String>] Set of category strings
95
128
  def self.extract_string_categories(string)
96
- Set.new(string.split(/[,;|]/).map(&:strip).reject(&:empty?))
129
+ Set.new.tap { |categories| extract_string_categories!(categories, string) }
130
+ end
131
+
132
+ ##
133
+ # Extracts categories from a string by splitting on separators.
134
+ #
135
+ # @param categories [Set<String>] Accumulator set
136
+ # @param string [String] source string that may contain category delimiters
137
+ # @return [void]
138
+ def self.extract_string_categories!(categories, string)
139
+ string.split(/[,;|]/).each do |part|
140
+ s = part.strip
141
+ categories.add(s) unless s.empty?
142
+ end
97
143
  end
98
144
  end
99
145
  end
@@ -16,9 +16,10 @@ module Html2rss
16
16
 
17
17
  # @return [Html2rss::Url, nil]
18
18
  def url
19
- url = schema_object.dig(:item, :url) || super
19
+ return @url if defined?(@url)
20
20
 
21
- Url.from_relative(url, base_url || url) if url
21
+ item_url = schema_object.dig(:item, :url)
22
+ @url = item_url ? Url.from_relative(item_url, base_url || item_url) : super
22
23
  end
23
24
  end
24
25
  end
@@ -13,24 +13,10 @@ module Html2rss
13
13
  class Thing
14
14
  # Supported Schema.org `@type` values mapped to article extraction.
15
15
  SUPPORTED_TYPES = %w[
16
- AdvertiserContentArticle
17
- AnalysisNewsArticle
18
- APIReference
19
- Article
20
- AskPublicNewsArticle
21
- BackgroundNewsArticle
22
- BlogPosting
23
- DiscussionForumPosting
24
- LiveBlogPosting
25
- NewsArticle
26
- OpinionNewsArticle
27
- Report
28
- ReportageNewsArticle
29
- ReviewNewsArticle
30
- SatiricalArticle
31
- ScholarlyArticle
32
- SocialMediaPosting
33
- TechArticle
16
+ AdvertiserContentArticle AnalysisNewsArticle APIReference Article
17
+ AskPublicNewsArticle BackgroundNewsArticle BlogPosting DiscussionForumPosting
18
+ LiveBlogPosting NewsArticle OpinionNewsArticle Report ReportageNewsArticle
19
+ ReviewNewsArticle SatiricalArticle ScholarlyArticle SocialMediaPosting TechArticle
34
20
  ].to_set.freeze
35
21
 
36
22
  # Attributes exposed by `#call` in generated article hashes.
@@ -44,21 +30,14 @@ module Html2rss
44
30
  end
45
31
 
46
32
  # @return [Hash] the scraped article hash with DEFAULT_ATTRIBUTES
47
- def call
48
- DEFAULT_ATTRIBUTES.to_h do |attribute|
49
- [attribute, public_send(attribute)]
50
- end
51
- end
33
+ def call = DEFAULT_ATTRIBUTES.to_h { [_1, public_send(_1)] }
52
34
 
53
35
  # @return [String, nil] stable schema object identifier
54
36
  def id
55
37
  return @id if defined?(@id)
56
38
 
57
39
  id = normalized_id(schema_object[:@id], reference_url: url || base_url) || url&.path.to_s
58
-
59
- return if id.empty?
60
-
61
- @id = id
40
+ @id = id.to_s.empty? ? nil : id
62
41
  end
63
42
 
64
43
  # @return [String, nil] article title
@@ -66,26 +45,28 @@ module Html2rss
66
45
 
67
46
  # @return [String, nil] longest available description field
68
47
  def description
69
- schema_object.values_at(:description, :schema_object_body, :abstract)
70
- .max_by { |string| string.to_s.size }
48
+ schema_object.values_at(:description, :schema_object_body, :abstract).max_by { _1.to_s.size }
71
49
  end
72
50
 
73
51
  # @return [Html2rss::Url, nil] the URL of the schema object
74
52
  def url
53
+ return @url if defined?(@url)
54
+
75
55
  url = schema_object[:url]
76
56
  if url.to_s.empty?
77
57
  Log.debug("Schema#Thing.url: no url in schema_object: #{schema_object.inspect}")
78
- return
58
+ return @url = nil
79
59
  end
80
60
 
81
- Url.from_relative(url, base_url || url)
61
+ @url = Url.from_relative(url, base_url || url)
82
62
  end
83
63
 
84
64
  # @return [Html2rss::Url, nil] normalized article image URL
85
65
  def image
86
- if (image_url = image_urls.first)
87
- Url.from_relative(image_url, base_url || image_url)
88
- end
66
+ return @image if defined?(@image)
67
+
68
+ img_url = image_urls.first
69
+ @image = img_url ? Url.from_relative(img_url, base_url || img_url) : nil
89
70
  end
90
71
 
91
72
  # @return [String, nil] published-at timestamp string
@@ -93,24 +74,23 @@ module Html2rss
93
74
 
94
75
  # @return [Array<String>, nil] extracted category labels
95
76
  def categories
96
- return @categories if defined?(@categories)
97
-
98
- @categories = CategoryExtractor.call(schema_object)
77
+ @categories ||= CategoryExtractor.call(schema_object)
99
78
  end
100
79
 
101
80
  attr_reader :schema_object, :base_url
102
81
 
103
82
  # @return [Array<String>] normalized image URL candidates
104
83
  def image_urls
105
- schema_object.values_at(:image, :thumbnailUrl).filter_map do |object|
106
- next unless object
107
-
108
- if object.is_a?(String)
109
- object
110
- elsif object.is_a?(Hash) && object[:@type] == 'ImageObject'
111
- object[:url] || object[:contentUrl]
112
- end
113
- end
84
+ @image_urls ||= schema_object.values_at(:image, :thumbnailUrl).filter_map { image_url_from(_1) }
85
+ end
86
+
87
+ private
88
+
89
+ def image_url_from(obj)
90
+ return obj if obj.is_a?(String)
91
+ return unless obj.is_a?(Hash) && obj[:@type] == 'ImageObject'
92
+
93
+ obj[:url] || obj[:contentUrl]
114
94
  end
115
95
 
116
96
  # @param value [String, Symbol, nil] candidate schema identifier
@@ -120,10 +100,8 @@ module Html2rss
120
100
  text = value.to_s
121
101
  return if text.empty?
122
102
 
123
- normalized_url = normalized_id_url(text, reference_url:)
124
- return text unless reference_url && normalized_url.host == reference_url.host
125
-
126
- normalized_id_value(normalized_url)
103
+ norm_url = normalized_id_url(text, reference_url:)
104
+ reference_url && norm_url.host == reference_url.host ? normalized_id_value(norm_url) : text
127
105
  rescue ArgumentError
128
106
  text
129
107
  end
@@ -132,11 +110,7 @@ module Html2rss
132
110
  # @param reference_url [Html2rss::Url, nil] URL used to resolve relative IDs
133
111
  # @return [Html2rss::Url] normalized identifier URL
134
112
  def normalized_id_url(text, reference_url:)
135
- if text.start_with?('/')
136
- Url.from_relative(text, reference_url || text)
137
- else
138
- Url.from_absolute(text)
139
- end
113
+ text.start_with?('/') ? Url.from_relative(text, reference_url || text) : Url.from_absolute(text)
140
114
  end
141
115
 
142
116
  # @param url [Html2rss::Url] normalized identifier URL
@@ -144,17 +118,14 @@ module Html2rss
144
118
  def normalized_id_value(url)
145
119
  path = url.path.to_s
146
120
  return "#{path}?#{url.query}" if (path.empty? || path == '/') && !url.query.to_s.empty?
147
- return path unless path.empty?
148
121
 
149
- url.query
122
+ path.empty? ? url.query : path
150
123
  end
151
124
 
152
125
  # @param url [String, Html2rss::Url, nil] candidate page URL
153
126
  # @return [Html2rss::Url, nil] normalized absolute URL for schema resolution
154
127
  def normalized_base_url(url)
155
- return if url.to_s.strip.empty?
156
-
157
- Url.from_absolute(url)
128
+ Url.from_absolute(url) unless url.to_s.strip.empty?
158
129
  rescue ArgumentError
159
130
  nil
160
131
  end
@@ -18,6 +18,13 @@ module Html2rss
18
18
  # Selector for JSON-LD script tags containing Schema.org objects.
19
19
  TAG_SELECTOR = 'script[type="application/ld+json"]'
20
20
 
21
+ # Pre-compiled regex union for supported schema types.
22
+ # Performs a single pass over script tag text instead of multiple regex matches.
23
+ SUPPORTED_TYPES_RE = begin
24
+ types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES
25
+ /"@type"\s*:\s*"(?:#{Regexp.union(types.to_a).source})"/
26
+ end.freeze
27
+
21
28
  # @return [Symbol] scraper config key
22
29
  def self.options_key = :schema
23
30
 
@@ -31,8 +38,7 @@ module Html2rss
31
38
  # @param script [Nokogiri::XML::Element] schema JSON-LD script tag
32
39
  # @return [Boolean] whether the tag references a supported schema type
33
40
  def supported_schema_type?(script)
34
- supported_types = Thing::SUPPORTED_TYPES | ItemList::SUPPORTED_TYPES
35
- supported_types.any? { |type| script.text.match?(/"@type"\s*:\s*"#{Regexp.escape(type)}"/) }
41
+ script.text.match?(SUPPORTED_TYPES_RE)
36
42
  end
37
43
 
38
44
  ##
@@ -22,8 +22,7 @@ module Html2rss
22
22
  # @return [Array<Entry>] deduplicated list of scraper entries
23
23
  def call(entries)
24
24
  destination_groups(entries).filter_map do |group|
25
- collapsed_group = collapse_nested_destination_group(group)
26
- collapsed_group.reduce do |best, entry|
25
+ group.reduce do |best, entry|
27
26
  stronger_entry?(entry, best) ? entry : best
28
27
  end
29
28
  end
@@ -67,21 +66,6 @@ module Html2rss
67
66
 
68
67
  def destination_groups(entries) = entries.group_by { entry_destination(_1) }.values
69
68
 
70
- def collapse_nested_destination_group(entries)
71
- return entries if entries.size <= 1
72
-
73
- entries.reject do |entry|
74
- entries.any? do |other|
75
- next if entry.equal?(other)
76
- next unless nested_container_pair?(entry.container, other.container)
77
-
78
- stronger_entry?(other, entry)
79
- end
80
- end
81
- end
82
-
83
- def nested_container_pair?(left, right) = left.ancestors.include?(right) || right.ancestors.include?(left)
84
-
85
69
  def entry_destination(entry) = entry.destination_facts&.destination || article_for(entry)&.[](:url)&.to_s
86
70
 
87
71
  def payload_richness_signature(article)
@@ -94,7 +78,9 @@ module Html2rss
94
78
  ]
95
79
  end
96
80
 
97
- def word_count(text) = text.to_s.scan(/\p{Alnum}+/).size
81
+ def word_count(text)
82
+ (@word_counts ||= {})[text] ||= text.to_s.scan(/\p{Alnum}+/).size
83
+ end
98
84
  end
99
85
  end
100
86
  end
@@ -21,6 +21,18 @@ module Html2rss
21
21
  class SemanticHtml # rubocop:disable Metrics/ClassLength
22
22
  include Enumerable
23
23
 
24
+ # Regexp to match content-related tokens.
25
+ CONTENT_REGEXP = begin
26
+ words = LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:content)
27
+ /(?:^|\s|[-_])(#{Regexp.union(words.to_a).source})(?:\s|[-_]|$)/i
28
+ end.freeze
29
+
30
+ # Regexp to match junk/utility-related tokens.
31
+ JUNK_REGEXP = begin
32
+ words = LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:utility)
33
+ /(?:^|\s|[-_])(#{Regexp.union(words.to_a).source})(?:\s|[-_]|$)/i
34
+ end.freeze
35
+
24
36
  # Container plus selected anchor, scoring metadata, and extracted article.
25
37
  Entry = Data.define(
26
38
  :container,
@@ -218,47 +230,79 @@ module Html2rss
218
230
  weak_article_candidate)
219
231
  end
220
232
 
233
+ ##
234
+ # @param container [Nokogiri::XML::Node]
235
+ # @return [Boolean]
221
236
  def publish_marker?(container)
222
- container.at_css('time, [datetime], [itemprop="datePublished"], [itemprop="dateModified"]')
237
+ (@publish_markers ||= {}.compare_by_identity)[container] ||=
238
+ !!container.at_css('time, [datetime], [itemprop="datePublished"], [itemprop="dateModified"]')
223
239
  end
224
240
 
241
+ ##
242
+ # @param container [Nokogiri::XML::Node]
243
+ # @param publish_signal [Boolean]
244
+ # @param descriptive_signal [Boolean]
245
+ # @param content_signal [Boolean]
246
+ # @return [Integer]
225
247
  def article_signal_count(container, publish_signal:, descriptive_signal:, content_signal:)
226
248
  [article_container?(container), publish_signal, descriptive_signal, content_signal].count(&:itself)
227
249
  end
228
250
 
251
+ ##
252
+ # @param container [Nokogiri::XML::Node]
253
+ # @return [Boolean]
229
254
  def article_container?(container) = container.name == 'article'
230
255
 
231
256
  def descriptive_context?(container_text, title)
232
257
  snippet = container_text.to_s.sub(/\A#{Regexp.escape(title.to_s)}/i, '')
233
- word_count(snippet) >= 8
258
+ # Only check for existence of enough words if snippet is long enough to have them
259
+ snippet.length > 30 && word_count(snippet) >= 8
234
260
  end
235
261
 
236
- def heading_for(container) = container.at_css(AnchorSelector::HEADING_SELECTOR)
262
+ ##
263
+ # @param container [Nokogiri::XML::Node]
264
+ # @return [Nokogiri::XML::Node, nil]
265
+ def heading_for(container)
266
+ (@headings ||= {}.compare_by_identity)[container] ||= container.at_css(AnchorSelector::HEADING_SELECTOR)
267
+ end
237
268
 
238
- def normalized_destination(anchor) = @link_heuristics.destination_facts(anchor)
269
+ def normalized_destination(anchor)
270
+ (@normalized_destinations ||= {}.compare_by_identity)[anchor] ||= @link_heuristics.destination_facts(anchor)
271
+ end
239
272
 
240
273
  def visible_text(node)
241
274
  return '' unless node
242
275
 
243
- HtmlExtractor.extract_visible_text(node).to_s.strip
276
+ (@visible_texts ||= {}.compare_by_identity)[node] ||= HtmlExtractor.extract_visible_text(node).to_s.strip
244
277
  end
245
278
 
279
+ ##
280
+ # @param container [Nokogiri::XML::Node]
281
+ # @param selected_anchor [Nokogiri::XML::Node]
282
+ # @return [String]
246
283
  def entry_title(container, selected_anchor) = visible_text(heading_for(container) || selected_anchor)
247
284
 
248
- def word_count(text) = text.to_s.scan(/\p{Alnum}+/).size
285
+ ##
286
+ # @param text [String, #to_s]
287
+ # @return [Integer]
288
+ def word_count(text)
289
+ (@word_counts ||= {})[text] ||= begin
290
+ count = 0
291
+ text.to_s.scan(/\p{Alnum}+/) { count += 1 }
292
+ count
293
+ end
294
+ end
249
295
 
250
296
  def container_tokens(container)
251
- classes = container['class'].to_s.split
252
- id = container['id'].to_s
253
- (classes << id).flat_map { |str| str.downcase.split(/[-_]+/) }.reject(&:empty?)
297
+ (@container_tokens ||= {}.compare_by_identity)[container] ||= "#{container['class']} #{container['id']}"
254
298
  end
255
299
 
256
300
  def content_tokens?(tokens)
257
- (@content_segments ||= LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:content)).intersect?(tokens.to_set)
301
+ tokens.match?(CONTENT_REGEXP)
258
302
  end
259
303
 
260
304
  def junk_tokens?(tokens)
261
- (@junk_segments ||= LinkHeuristics::PathClassifier::SEGMENT_SETS.fetch(:utility)).intersect?(tokens.to_set)
305
+ tokens.match?(JUNK_REGEXP)
262
306
  end
263
307
 
264
308
  def stable_rank(entries)
@@ -11,9 +11,6 @@ module Html2rss
11
11
  # Detection is intentionally shallow for most scrapers, but instance-based
12
12
  # matching is available for scrapers that need to carry expensive selection
13
13
  # state forward into extraction.
14
- # Scrapers run in parallel threads, so implementations must avoid shared
15
- # mutable state and degrade by returning no articles when a follow-up would
16
- # be unsafe or unsupported.
17
14
  module Scraper
18
15
  # Root markers indicating likely app-shell/client-rendered surfaces.
19
16
  APP_SHELL_ROOT_SELECTORS = '#app, #root, #__next, [data-reactroot], [ng-app], [id*="app-shell"]'
@@ -1,6 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'parallel'
4
3
  require 'dry-validation'
5
4
 
6
5
  module Html2rss
@@ -121,11 +120,8 @@ module Html2rss
121
120
  scraper_instances = Scraper.instances_for(parsed_body, url:, request_session:, opts: @opts[:scraper])
122
121
  return [] if scraper_instances.empty?
123
122
 
124
- # Scrapers are instantiated and run in parallel threads. Implementations
125
- # must avoid shared mutable state, treat request_session calls as
126
- # concurrency-safe from the scraper side, and return no articles when a
127
- # follow-up would be unsafe or unsupported.
128
- articles = Parallel.flat_map(scraper_instances, in_threads: thread_count_for(scraper_instances)) do |instance|
123
+ # Scrapers are run sequentially.
124
+ articles = scraper_instances.flat_map do |instance|
129
125
  run_scraper(instance)
130
126
  end
131
127
  Cleanup.call(articles, url:, **cleanup_options)
@@ -140,10 +136,5 @@ module Html2rss
140
136
  def cleanup_options
141
137
  @opts.fetch(:cleanup, {})
142
138
  end
143
-
144
- def thread_count_for(scrapers)
145
- count = [scrapers.size, Parallel.processor_count].min
146
- count.zero? ? 1 : count
147
- end
148
139
  end
149
140
  end
@@ -8,8 +8,10 @@ module Html2rss
8
8
  # Common category-related terms to look for in class names
9
9
  CATEGORY_TERMS = %w[category tag topic section label theme subject].freeze
10
10
 
11
- # CSS selectors to find elements with category-related class names
12
- CATEGORY_SELECTORS = CATEGORY_TERMS.map { |term| "[class*=\"#{term}\"]" }.freeze
11
+ # CSS selectors to find elements with category-related class names or data attributes
12
+ CATEGORY_SELECTORS = CATEGORY_TERMS.flat_map do |term|
13
+ ["[class*=\"#{term}\"]", "[data-#{term}]", "[#{term}]"]
14
+ end.freeze
13
15
 
14
16
  # Regex pattern for matching category-related attribute names
15
17
  CATEGORY_ATTR_PATTERN = /#{CATEGORY_TERMS.join('|')}/i
@@ -36,12 +38,12 @@ module Html2rss
36
38
  # @return [Set<String>] Set of category strings
37
39
  def self.extract_all_categories(article_tag)
38
40
  Set.new.tap do |categories|
39
- article_tag.css('*').each do |element|
41
+ article_tag.css(CATEGORY_SELECTORS.join(',')).each do |element|
40
42
  # Extract text categories from elements with category-related class names
41
- categories.merge(extract_text_categories(element)) if element['class']&.match?(CATEGORY_ATTR_PATTERN)
43
+ extract_text_categories!(categories, element) if element['class']&.match?(CATEGORY_ATTR_PATTERN)
42
44
 
43
45
  # Extract data categories from all elements
44
- categories.merge(extract_element_data_categories(element))
46
+ extract_element_data_categories!(categories, element)
45
47
  end
46
48
  end
47
49
  end
@@ -49,34 +51,66 @@ module Html2rss
49
51
  ##
50
52
  # Extracts categories from data attributes of a single element.
51
53
  #
54
+ # @param categories [Set<String>] Accumulator set
52
55
  # @param element [Nokogiri::XML::Element] metadata element that may contain category links
53
- # @return [Set<String>] Set of category strings
54
- def self.extract_element_data_categories(element)
55
- Set.new.tap do |categories|
56
- element.attributes.each_value do |attr|
57
- next unless attr.name.match?(CATEGORY_ATTR_PATTERN)
56
+ # @return [void]
57
+ def self.extract_element_data_categories!(categories, element)
58
+ element.attributes.each_value do |attr|
59
+ next unless attr.name.match?(CATEGORY_ATTR_PATTERN)
58
60
 
59
- value = attr.value&.strip
60
- categories.add(value) if value && !value.empty?
61
- end
61
+ value = attr.value&.strip
62
+ categories.add(value) if value && !value.empty?
62
63
  end
63
64
  end
64
65
 
65
66
  ##
66
67
  # Extracts text-based categories from elements, splitting content into discrete values.
67
68
  #
69
+ # @param categories [Set<String>] Accumulator set
68
70
  # @param element [Nokogiri::XML::Element] metadata element whose text may contain delimiters
69
- # @return [Set<String>] Set of category strings
70
- def self.extract_text_categories(element)
71
- anchor_values = element.css('a').filter_map do |node|
72
- HtmlExtractor.extract_visible_text(node)
71
+ # @return [void]
72
+ def self.extract_text_categories!(categories, element)
73
+ if element.name == 'a'
74
+ add_text_to_categories!(categories, element)
75
+ return
73
76
  end
74
- return Set.new(anchor_values.reject(&:empty?)) if anchor_values.any?
75
77
 
78
+ anchors = element.css('a')
79
+
80
+ if anchors.any?
81
+ anchors.each { |node| add_text_to_categories!(categories, node) }
82
+ else
83
+ extract_split_text_categories!(categories, element)
84
+ end
85
+ end
86
+
87
+ ##
88
+ # Adds the visible text of the given element to the categories set.
89
+ #
90
+ # @param categories [Set<String>] Accumulator set
91
+ # @param element [Nokogiri::XML::Element] The element to extract text from
92
+ # @return [void]
93
+ def self.add_text_to_categories!(categories, element)
94
+ text = HtmlExtractor.extract_visible_text(element)
95
+ categories.add(text) if text && !text.empty?
96
+ end
97
+
98
+ ##
99
+ # Extracts categories from the element's text by splitting on newlines.
100
+ #
101
+ # @param categories [Set<String>] Accumulator set
102
+ # @param element [Nokogiri::XML::Element] The element to extract text from
103
+ # @return [void]
104
+ def self.extract_split_text_categories!(categories, element)
76
105
  text = HtmlExtractor.extract_visible_text(element)
77
- return Set.new unless text
106
+ return unless text
78
107
 
79
- Set.new(text.split(/\n+/).map(&:strip).reject(&:empty?))
108
+ text.split(/\n+/).each do |line|
109
+ line = line.strip
110
+ categories.add(line) unless line.empty?
111
+ end
80
112
  end
113
+
114
+ private_class_method :add_text_to_categories!, :extract_split_text_categories!
81
115
  end
82
116
  end