html2rss 0.20.1 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. checksums.yaml +4 -4
  2. data/html2rss.gemspec +1 -2
  3. data/lib/html2rss/auto_source/scraper/html.rb +61 -16
  4. data/lib/html2rss/auto_source/scraper/json_state.rb +40 -27
  5. data/lib/html2rss/auto_source/scraper/link_heuristics.rb +85 -131
  6. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +74 -28
  7. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -2
  8. data/lib/html2rss/auto_source/scraper/schema/thing.rb +31 -60
  9. data/lib/html2rss/auto_source/scraper/schema.rb +8 -2
  10. data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +4 -18
  11. data/lib/html2rss/auto_source/scraper/semantic_html.rb +55 -11
  12. data/lib/html2rss/auto_source/scraper.rb +0 -3
  13. data/lib/html2rss/auto_source.rb +2 -11
  14. data/lib/html2rss/category_extractor.rb +54 -20
  15. data/lib/html2rss/html_extractor/enclosure_extractor.rb +60 -89
  16. data/lib/html2rss/html_extractor/list_candidates.rb +2 -8
  17. data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +29 -12
  18. data/lib/html2rss/html_extractor/semantic_containers.rb +9 -35
  19. data/lib/html2rss/html_extractor.rb +51 -30
  20. data/lib/html2rss/rendering/description_builder.rb +3 -3
  21. data/lib/html2rss/rss_builder/article.rb +44 -23
  22. data/lib/html2rss/rss_builder/enclosure.rb +4 -2
  23. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +25 -36
  24. data/lib/html2rss/selectors/post_processors/substring.rb +11 -18
  25. data/lib/html2rss/selectors/post_processors/template.rb +3 -2
  26. data/lib/html2rss/selectors.rb +18 -4
  27. data/lib/html2rss/url.rb +4 -3
  28. data/lib/html2rss/version.rb +1 -1
  29. metadata +3 -17
@@ -5,116 +5,87 @@ module Html2rss
5
5
  ##
6
6
  # Extracts enclosures from HTML tags using various strategies.
7
7
  class EnclosureExtractor
8
+ # CSS union query covering images, media, PDFs, iframes, and archives.
9
+ SELECTOR = [
10
+ 'img[src]:not([src^="data"])',
11
+ 'video source[src]',
12
+ 'audio source[src]',
13
+ 'audio[src]',
14
+ 'a[href$=".pdf"]',
15
+ 'iframe[src]',
16
+ 'a[href$=".zip"]',
17
+ 'a[href$=".tar.gz"]',
18
+ 'a[href$=".tgz"]'
19
+ ].join(',').freeze
20
+
8
21
  # @param article_tag [Nokogiri::XML::Element] article container node
9
22
  # @param base_url [String, Html2rss::Url] base URL for relative enclosure links
10
23
  # @return [Array<Hash{Symbol => Object}>] normalized enclosure hashes
11
24
  def self.call(article_tag, base_url)
12
- [
13
- Extractors::Image,
14
- Extractors::Media,
15
- Extractors::Pdf,
16
- Extractors::Iframe,
17
- Extractors::Archive
18
- ].flat_map { |strategy| strategy.call(article_tag, base_url:) }
25
+ article_tag.css(SELECTOR).filter_map do |element|
26
+ extract_from_element(element, base_url)
27
+ end
19
28
  end
20
- end
21
29
 
22
- # Extraction strategies for enclosure-like media/link tags.
23
- module Extractors
24
- # Extracts image enclosures from HTML tags.
25
- # Finds all image sources and returns them in a format suitable for RSS.
26
- class Image
27
- # @param article_tag [Nokogiri::XML::Element] article container node
28
- # @param base_url [String, Html2rss::Url] base URL for relative image sources
29
- # @return [Array<Hash{Symbol => Object}>] image enclosure hashes
30
- def self.call(article_tag, base_url:)
31
- article_tag.css('img[src]:not([src^="data"])').filter_map do |img|
32
- src = img['src'].to_s
33
- next if src.empty?
34
-
35
- abs_url = Url.from_relative(src, base_url)
36
- {
37
- url: abs_url,
38
- type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'image/jpeg')
39
- }
40
- end
30
+ def self.extract_from_element(element, base_url)
31
+ case element.name
32
+ when 'img'
33
+ extract_image(element, base_url)
34
+ when 'video', 'audio', 'source'
35
+ extract_media(element, base_url)
36
+ when 'iframe'
37
+ extract_iframe(element, base_url)
38
+ when 'a'
39
+ extract_a(element, base_url)
41
40
  end
42
41
  end
43
42
 
44
- # Extracts media enclosures (video/audio) from HTML tags.
45
- class Media
46
- # @param article_tag [Nokogiri::XML::Element] article container node
47
- # @param base_url [String, Html2rss::Url] base URL for relative media sources
48
- # @return [Array<Hash{Symbol => Object}>] media enclosure hashes
49
- def self.call(article_tag, base_url:)
50
- article_tag.css('video source[src], audio source[src], audio[src]').filter_map do |element|
51
- src = element['src'].to_s
52
- next if src.empty?
43
+ def self.extract_image(img, base_url)
44
+ src = img['src'].to_s
45
+ return if src.empty?
53
46
 
54
- {
55
- url: Url.from_relative(src, base_url),
56
- type: element['type']
57
- }
58
- end
59
- end
47
+ abs_url = Url.from_relative(src, base_url)
48
+ {
49
+ url: abs_url,
50
+ type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'image/jpeg')
51
+ }
60
52
  end
61
53
 
62
- # Extracts PDF enclosures from HTML tags.
63
- class Pdf
64
- # @param article_tag [Nokogiri::XML::Element] article container node
65
- # @param base_url [String, Html2rss::Url] base URL for relative PDF links
66
- # @return [Array<Hash{Symbol => Object}>] PDF enclosure hashes
67
- def self.call(article_tag, base_url:)
68
- article_tag.css('a[href$=".pdf"]').filter_map do |link|
69
- href = link['href'].to_s
70
- next if href.empty?
54
+ def self.extract_media(element, base_url)
55
+ src = element['src'].to_s
56
+ return if src.empty?
71
57
 
72
- abs_url = Url.from_relative(href, base_url)
73
- {
74
- url: abs_url,
75
- type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url)
76
- }
77
- end
78
- end
58
+ {
59
+ url: Url.from_relative(src, base_url),
60
+ type: element['type']
61
+ }
79
62
  end
80
63
 
81
- # Extracts iframe enclosures from HTML tags.
82
- class Iframe
83
- # @param article_tag [Nokogiri::XML::Element] article container node
84
- # @param base_url [String, Html2rss::Url] base URL for relative iframe links
85
- # @return [Array<Hash{Symbol => Object}>] iframe enclosure hashes
86
- def self.call(article_tag, base_url:)
87
- article_tag.css('iframe[src]').filter_map do |iframe|
88
- src = iframe['src']
89
- next if src.nil? || src.empty?
64
+ def self.extract_iframe(iframe, base_url)
65
+ src = iframe['src'].to_s
66
+ return if src.empty?
90
67
 
91
- abs_url = Url.from_relative(src, base_url)
92
- {
93
- url: abs_url,
94
- type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'text/html')
95
- }
96
- end
97
- end
68
+ abs_url = Url.from_relative(src, base_url)
69
+ {
70
+ url: abs_url,
71
+ type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'text/html')
72
+ }
98
73
  end
99
74
 
100
- # Extracts archive enclosures (zip, tar.gz, tgz) from HTML tags.
101
- class Archive
102
- # @param article_tag [Nokogiri::XML::Element] article container node
103
- # @param base_url [String, Html2rss::Url] base URL for relative archive links
104
- # @return [Array<Hash{Symbol => Object}>] archive enclosure hashes
105
- def self.call(article_tag, base_url:)
106
- article_tag.css('a[href$=".zip"], a[href$=".tar.gz"], a[href$=".tgz"]').filter_map do |link|
107
- href = link['href'].to_s
108
- next if href.empty?
75
+ def self.extract_a(link, base_url)
76
+ href = link['href'].to_s
77
+ return if href.empty?
109
78
 
110
- abs_url = Url.from_relative(href, base_url)
111
- {
112
- url: abs_url,
113
- type: 'application/zip'
114
- }
115
- end
79
+ abs_url = Url.from_relative(href, base_url)
80
+
81
+ if href.end_with?('.pdf')
82
+ { url: abs_url, type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url) }
83
+ else
84
+ { url: abs_url, type: 'application/zip' }
116
85
  end
117
86
  end
87
+
88
+ private_class_method :extract_from_element, :extract_image, :extract_media, :extract_iframe, :extract_a
118
89
  end
119
90
  end
120
91
  end
@@ -75,17 +75,11 @@ module Html2rss
75
75
  def each_anchor(anchor_filter:)
76
76
  return enum_for(:each_anchor, anchor_filter:) unless block_given?
77
77
 
78
- traversal_root&.traverse do |node|
79
- yield node if relevant_anchor?(node, anchor_filter:)
78
+ traversal_root&.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR)&.each do |node|
79
+ yield node if anchor_filter.call(node)
80
80
  end
81
81
  end
82
82
 
83
- def relevant_anchor?(node, anchor_filter:)
84
- node.element? &&
85
- node.matches?(HtmlExtractor::MAIN_ANCHOR_SELECTOR) &&
86
- anchor_filter.call(node)
87
- end
88
-
89
83
  def traversal_root
90
84
  parsed_body.at_css('body, html') || parsed_body.root
91
85
  end
@@ -31,6 +31,8 @@ module Html2rss
31
31
 
32
32
  # Shared context for all anchors in one semantic container.
33
33
  class Context
34
+ attr_reader :container
35
+
34
36
  # Ancestor tags that usually indicate navigation/utility regions.
35
37
  UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze
36
38
 
@@ -56,7 +58,7 @@ module Html2rss
56
58
  def visible_text(node)
57
59
  return '' unless node
58
60
 
59
- HtmlExtractor.extract_visible_text(node).to_s.strip
61
+ (@visible_texts ||= {}.compare_by_identity)[node] ||= HtmlExtractor.extract_visible_text(node).to_s.strip
60
62
  end
61
63
 
62
64
  # @param anchor [Nokogiri::XML::Node] anchor candidate
@@ -70,12 +72,6 @@ module Html2rss
70
72
  def utility_text?(text)
71
73
  @link_heuristics.utility_text?(text)
72
74
  end
73
-
74
- # @param ancestors [Array<Nokogiri::XML::Node>]
75
- # @return [Boolean] true when the anchor lives inside navigation chrome
76
- def utility_landmark?(ancestors)
77
- ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
78
- end
79
75
  end
80
76
 
81
77
  # One anchor plus the facts needed to decide whether it represents content.
@@ -131,7 +127,7 @@ module Html2rss
131
127
 
132
128
  # @return [Boolean] true when visible anchor text has words
133
129
  def meaningful_text?
134
- text.scan(/\p{Alnum}+/).any?
130
+ @meaningful_text ||= text.match?(/\p{Alnum}/)
135
131
  end
136
132
 
137
133
  # @return [Boolean] true when the destination route has content signals
@@ -142,8 +138,17 @@ module Html2rss
142
138
  # @return [Boolean] true when the anchor is inside the selected heading
143
139
  def heading_anchor?
144
140
  heading = @context.heading
141
+ return false unless heading
142
+
143
+ curr = @anchor
144
+ container = @context.container
145
+ while curr.respond_to?(:parent)
146
+ return true if curr == heading
147
+ break if curr == container
145
148
 
146
- heading && @anchor.ancestors.include?(heading)
149
+ curr = curr.parent
150
+ end
151
+ false
147
152
  end
148
153
 
149
154
  # @return [Boolean] true when anchor text exactly matches heading text
@@ -151,14 +156,14 @@ module Html2rss
151
156
  heading_text = @context.heading_text
152
157
 
153
158
  meaningful_text? &&
154
- heading_text.scan(/\p{Alnum}+/).any? &&
159
+ heading_text.match?(/\p{Alnum}/) &&
155
160
  heading_text == text
156
161
  end
157
162
 
158
163
  private
159
164
 
160
165
  def representative_content_anchor?
161
- heading_anchor? || meaningful_text? || content_like_destination?
166
+ meaningful_text? || content_like_destination? || heading_anchor?
162
167
  end
163
168
 
164
169
  def utility_text_suppressed?
@@ -174,7 +179,19 @@ module Html2rss
174
179
  def ineligible_anchor?
175
180
  destination_facts.high_confidence_utility_destination ||
176
181
  icon_only_anchor? ||
177
- @context.utility_landmark?(@anchor.ancestors.to_a)
182
+ utility_landmark_ancestor?
183
+ end
184
+
185
+ def utility_landmark_ancestor?
186
+ curr = @anchor.parent
187
+ container = @context.container
188
+ while curr.respond_to?(:parent)
189
+ return true if Context::UTILITY_LANDMARK_TAGS.include?(curr.name)
190
+ break if curr == container
191
+
192
+ curr = curr.parent
193
+ end
194
+ false
178
195
  end
179
196
 
180
197
  def icon_only_anchor?
@@ -27,43 +27,17 @@ module Html2rss
27
27
 
28
28
  # @return [Array<Nokogiri::XML::Node>] candidate semantic containers
29
29
  def call
30
- containers = SELECTORS.each_with_object([]) do |selector, memo|
31
- collect_selector_containers(selector, memo)
30
+ cache = {}.compare_by_identity
31
+ candidates = @parsed_body.css(SELECTORS.join(',')).reject do |node|
32
+ HtmlExtractor.ignored_container_path?(node, cache)
32
33
  end
33
34
 
34
- containers.sort_by { document_order.fetch(_1) }
35
- end
36
-
37
- private
38
-
39
- def document_order
40
- @document_order ||= begin
41
- order = {}
42
- index = 0
43
-
44
- @parsed_body.traverse do |node|
45
- next unless node.element?
46
-
47
- order[node] = index
48
- index += 1
49
- end
50
-
51
- order.compare_by_identity
52
- end
53
- end
54
-
55
- def collect_selector_containers(selector, containers)
56
- @parsed_body.css(selector).each do |container|
57
- next if HtmlExtractor.ignored_container_path?(container)
58
- next if seen[container]
59
-
60
- seen[container] = true
61
- containers << container
62
- end
63
- end
64
-
65
- def seen
66
- @seen ||= {}.compare_by_identity
35
+ # Preserve the original post-order traversal intent (specific-first)
36
+ # by sorting candidates by depth (descending) while keeping original document
37
+ # order for nodes at the same depth.
38
+ candidates.each_with_index
39
+ .sort_by { |node, index| [-node.ancestors.size, index] }
40
+ .map!(&:first)
67
41
  end
68
42
  end
69
43
  end
@@ -4,15 +4,15 @@ module Html2rss
4
4
  ##
5
5
  # HtmlExtractor is responsible for extracting details (headline, url, images, etc.)
6
6
  # from an article_tag.
7
- class HtmlExtractor
7
+ class HtmlExtractor # rubocop:disable Metrics/ClassLength
8
8
  # Tags ignored when extracting visible text content from article containers.
9
9
  INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
10
- # Element path pattern ignored when traversing candidate article containers.
11
- IGNORED_CONTAINER_PATH = /(nav|footer|header|svg|script|style)/i
12
10
  # Heading tags used to prioritize title extraction.
13
11
  HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
14
12
  # Selector used to derive non-headline description nodes.
15
13
  NON_HEADLINE_SELECTOR = (HEADING_TAGS.map { |tag| ":not(#{tag})" } + INVISIBLE_CONTENT_TAGS.to_a).freeze
14
+ # Element tags that indicate ignored DOM chrome when found in a container path.
15
+ IGNORED_CONTAINER_TAGS = %w[nav footer header svg script style].to_set.freeze
16
16
 
17
17
  # Anchor selector used to identify the canonical article link element.
18
18
  MAIN_ANCHOR_SELECTOR = begin
@@ -42,8 +42,39 @@ module Html2rss
42
42
  parts.join(separator).squeeze(' ').strip unless parts.empty?
43
43
  end
44
44
 
45
+ ##
46
+ # @param article_tag [Nokogiri::XML::Node] article-like container to search within
47
+ # @return [Nokogiri::XML::Node, nil] first eligible descendant anchor
48
+ def main_anchor_for(article_tag)
49
+ return article_tag if article_tag.name == 'a' && article_tag.matches?(MAIN_ANCHOR_SELECTOR)
50
+
51
+ article_tag.at_css(MAIN_ANCHOR_SELECTOR)
52
+ end
53
+
54
+ ##
55
+ # @param node [Nokogiri::XML::Node]
56
+ # @param cache [Hash, nil] identity cache used to store results (must use compare_by_identity)
57
+ # @return [Boolean] true when the node belongs to ignored DOM chrome
58
+ def ignored_container_path?(node, cache = nil)
59
+ return cache[node] if cache&.key?(node)
60
+
61
+ res = walk_ignored_container_path?(node)
62
+ cache[node] = res if cache
63
+ res
64
+ end
65
+
45
66
  private
46
67
 
68
+ def walk_ignored_container_path?(node)
69
+ curr = node
70
+ while curr.respond_to?(:parent)
71
+ return true if IGNORED_CONTAINER_TAGS.include?(curr.name)
72
+
73
+ curr = curr.parent
74
+ end
75
+ false
76
+ end
77
+
47
78
  def visible_child?(node)
48
79
  !INVISIBLE_CONTENT_TAGS.include?(node.name) &&
49
80
  !(node.name == 'a' && node['href']&.start_with?('#'))
@@ -80,26 +111,6 @@ module Html2rss
80
111
 
81
112
  attr_reader :article_tag, :base_url, :selected_anchor
82
113
 
83
- class << self
84
- ##
85
- # @param article_tag [Nokogiri::XML::Node] article-like container to search within
86
- # @return [Nokogiri::XML::Node, nil] first eligible descendant anchor
87
- def main_anchor_for(article_tag)
88
- return article_tag if article_tag.name == 'a' && article_tag.matches?(MAIN_ANCHOR_SELECTOR)
89
-
90
- article_tag.at_css(MAIN_ANCHOR_SELECTOR)
91
- end
92
-
93
- ##
94
- # @param node [Nokogiri::XML::Node, String] node or path to test
95
- # @return [Boolean] true when the node belongs to ignored DOM chrome
96
- def ignored_container_path?(node)
97
- path = node.respond_to?(:path) ? node.path : node.to_s
98
-
99
- path.match?(IGNORED_CONTAINER_PATH)
100
- end
101
- end
102
-
103
114
  def extract_url
104
115
  @extract_url ||= begin
105
116
  href = selected_anchor&.[]('href').to_s
@@ -115,14 +126,24 @@ module Html2rss
115
126
 
116
127
  def heading
117
128
  @heading ||= begin
118
- heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
119
- smallest_heading = heading_tags.keys.min
120
- if smallest_heading
121
- heading_tags[smallest_heading]&.max_by do |tag|
122
- self.class.extract_visible_text(tag)&.size.to_i
123
- end
124
- end
129
+ tags = article_tag.css(HEADING_TAGS.join(','))
130
+ tags.any? ? select_best_heading(tags) : nil
131
+ end
132
+ end
133
+
134
+ def select_best_heading(tags)
135
+ min_tag_name = tags.map(&:name).min
136
+ best_tag = nil
137
+ max_size = -1
138
+
139
+ tags.each do |tag|
140
+ next if tag.name != min_tag_name
141
+
142
+ size = self.class.extract_visible_text(tag)&.size.to_i
143
+ (best_tag = tag) && (max_size = size) if size > max_size
125
144
  end
145
+
146
+ best_tag
126
147
  end
127
148
 
128
149
  def extract_description
@@ -25,12 +25,12 @@ module Html2rss
25
25
  # @param end_of_range [Integer] Optional, defaults to half the text length
26
26
  # @return [String]
27
27
  def self.remove_pattern_from_start(text, pattern, end_of_range: (text.size * 0.5).to_i)
28
- return text unless text.is_a?(String) && pattern.is_a?(String)
28
+ return text unless text.is_a?(String) && pattern.is_a?(String) && !pattern.empty?
29
29
 
30
30
  index = text.index(pattern)
31
- return text if index.nil? || index >= end_of_range
31
+ return text if index.nil? || index > end_of_range
32
32
 
33
- text.gsub(/^(.{0,#{end_of_range}})#{Regexp.escape(pattern)}/, '\1')
33
+ "#{text[0, index]}#{text[(index + pattern.size)..]}"
34
34
  end
35
35
 
36
36
  # @param base [String] The base text content for the description
@@ -9,6 +9,7 @@ module Html2rss
9
9
  ##
10
10
  # Article is a simple data object representing an article extracted from a page.
11
11
  # It is enumerable and responds to all keys specified in PROVIDED_KEYS.
12
+ # rubocop:disable Metrics/ClassLength
12
13
  class Article
13
14
  include Enumerable
14
15
  include Comparable
@@ -17,6 +18,11 @@ module Html2rss
17
18
  PROVIDED_KEYS = %i[id title description url image author guid published_at enclosures categories scraper].freeze
18
19
  # Separator used to build deterministic deduplication fingerprints.
19
20
  DEDUP_FINGERPRINT_SEPARATOR = '#!/'
21
+ # Sentinel object used to pre-initialize instance variables in the constructor.
22
+ # This ensures all Article instances share the exact same object shape (Ruby 3.3+ optimization),
23
+ # preventing performance warnings and slower instance variable access due to shape transitions
24
+ # when attributes are lazily/conditionally accessed in different sequences.
25
+ NOT_SET = Object.new.freeze
20
26
 
21
27
  # @param options [Hash{Symbol => String}]
22
28
  # @option options [String] :id stable article identifier
@@ -31,9 +37,9 @@ module Html2rss
31
37
  # @option options [Array<String>] :categories category labels
32
38
  # @option options [Class] :scraper scraper class that produced the article
33
39
  def initialize(**options)
34
- @to_h = {}
35
- options.each_pair { |key, value| @to_h[key] = value.freeze if value }
36
- @to_h.freeze
40
+ @to_h = options.each_with_object({}) { |(k, v), h| h[k] = v.freeze if v }.freeze
41
+
42
+ @description = @url = @image = @guid = @enclosures = @enclosure = @categories = @published_at = NOT_SET
37
43
 
38
44
  return unless (unknown_keys = options.keys - PROVIDED_KEYS).any?
39
45
 
@@ -62,7 +68,9 @@ module Html2rss
62
68
 
63
69
  # @return [String] rendered article description
64
70
  def description
65
- @description ||= Rendering::DescriptionBuilder.new(
71
+ return @description unless @description == NOT_SET
72
+
73
+ @description = Rendering::DescriptionBuilder.new(
66
74
  base: @to_h[:description],
67
75
  title:,
68
76
  url:,
@@ -73,12 +81,16 @@ module Html2rss
73
81
 
74
82
  # @return [Url, nil]
75
83
  def url
76
- @url ||= Url.sanitize(@to_h[:url])
84
+ return @url unless @url == NOT_SET
85
+
86
+ @url = Url.sanitize(@to_h[:url])
77
87
  end
78
88
 
79
89
  # @return [Url, nil]
80
90
  def image
81
- @image ||= Url.sanitize(@to_h[:image])
91
+ return @image unless @image == NOT_SET
92
+
93
+ @image = Url.sanitize(@to_h[:image])
82
94
  end
83
95
 
84
96
  # @return [String, nil]
@@ -87,7 +99,9 @@ module Html2rss
87
99
  # Generates a unique identifier based on the URL and ID using CRC32.
88
100
  # @return [String]
89
101
  def guid
90
- @guid ||= Zlib.crc32(fetch_guid).to_s(36).encode('utf-8')
102
+ return @guid unless @guid == NOT_SET
103
+
104
+ @guid = Zlib.crc32(fetch_guid).to_s(36).encode('utf-8')
91
105
  end
92
106
 
93
107
  ##
@@ -100,27 +114,32 @@ module Html2rss
100
114
 
101
115
  # @return [Array<Html2rss::RssBuilder::Enclosure>] normalized enclosure objects
102
116
  def enclosures
103
- @enclosures ||= Array(@to_h[:enclosures])
104
- .map { |enclosure| Html2rss::RssBuilder::Enclosure.new(**enclosure) }
117
+ return @enclosures unless @enclosures == NOT_SET
118
+
119
+ @enclosures = Array(@to_h[:enclosures])
120
+ .map { |enclosure| Html2rss::RssBuilder::Enclosure.new(**enclosure) }
105
121
  end
106
122
 
107
123
  # @return [Html2rss::RssBuilder::Enclosure, nil]
108
124
  def enclosure
109
- return @enclosure if defined?(@enclosure)
110
-
111
- case (object = @to_h[:enclosures]&.first)
112
- when Hash
113
- @enclosure = Html2rss::RssBuilder::Enclosure.new(**object)
114
- when nil
115
- @enclosure = Html2rss::RssBuilder::Enclosure.new(url: image) if image
116
- else
117
- Log.warn "Article: unknown enclosure type: #{object.class}"
118
- end
125
+ return @enclosure unless @enclosure == NOT_SET
126
+
127
+ @enclosure = case (object = @to_h[:enclosures]&.first)
128
+ when Hash
129
+ Html2rss::RssBuilder::Enclosure.new(**object)
130
+ when nil
131
+ Html2rss::RssBuilder::Enclosure.new(url: image) if image
132
+ else
133
+ Log.warn "Article: unknown enclosure type: #{object.class}"
134
+ nil
135
+ end
119
136
  end
120
137
 
121
138
  # @return [Array<String>] normalized, unique category names
122
139
  def categories
123
- @categories ||= @to_h[:categories].dup.to_a.tap do |categories|
140
+ return @categories unless @categories == NOT_SET
141
+
142
+ @categories = @to_h[:categories].dup.to_a.tap do |categories|
124
143
  categories.map! { |category| category.to_s.strip }
125
144
  categories.reject!(&:empty?)
126
145
  categories.uniq!
@@ -130,11 +149,12 @@ module Html2rss
130
149
  # Parses and returns the published_at time.
131
150
  # @return [DateTime, nil]
132
151
  def published_at
133
- return if (string = @to_h[:published_at].to_s.strip).empty?
152
+ return @published_at unless @published_at == NOT_SET
134
153
 
135
- @published_at ||= DateTime.parse(string)
154
+ string = @to_h[:published_at].to_s.strip
155
+ @published_at = string.empty? ? nil : DateTime.parse(string)
136
156
  rescue ArgumentError
137
- nil
157
+ @published_at = nil
138
158
  end
139
159
 
140
160
  # @return [Class, nil] scraper class that produced this article
@@ -183,5 +203,6 @@ module Html2rss
183
203
  value
184
204
  end
185
205
  end
206
+ # rubocop:enable Metrics/ClassLength
186
207
  end
187
208
  end
@@ -16,9 +16,11 @@ module Html2rss
16
16
  def self.guess_content_type_from_url(url, default: 'application/octet-stream')
17
17
  return default unless url
18
18
 
19
- url = url.path.split('?').first
19
+ path = url.path
20
+ ext = File.extname(path)
21
+ ext = ext[1..] if ext.start_with?('.')
20
22
 
21
- content_type = MIME::Types.type_for(File.extname(url).delete('.'))
23
+ content_type = MIME::Types.type_for(ext)
22
24
  content_type.first&.to_s || 'application/octet-stream'
23
25
  end
24
26