html2rss 0.20.1 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/html2rss.gemspec +1 -2
- data/lib/html2rss/auto_source/scraper/html.rb +61 -16
- data/lib/html2rss/auto_source/scraper/json_state.rb +40 -27
- data/lib/html2rss/auto_source/scraper/link_heuristics.rb +85 -131
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +74 -28
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +31 -60
- data/lib/html2rss/auto_source/scraper/schema.rb +8 -2
- data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +4 -18
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +55 -11
- data/lib/html2rss/auto_source/scraper.rb +0 -3
- data/lib/html2rss/auto_source.rb +2 -11
- data/lib/html2rss/category_extractor.rb +54 -20
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +60 -89
- data/lib/html2rss/html_extractor/list_candidates.rb +2 -8
- data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +29 -12
- data/lib/html2rss/html_extractor/semantic_containers.rb +9 -35
- data/lib/html2rss/html_extractor.rb +51 -30
- data/lib/html2rss/rendering/description_builder.rb +3 -3
- data/lib/html2rss/rss_builder/article.rb +44 -23
- data/lib/html2rss/rss_builder/enclosure.rb +4 -2
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +25 -36
- data/lib/html2rss/selectors/post_processors/substring.rb +11 -18
- data/lib/html2rss/selectors/post_processors/template.rb +3 -2
- data/lib/html2rss/selectors.rb +18 -4
- data/lib/html2rss/url.rb +4 -3
- data/lib/html2rss/version.rb +1 -1
- metadata +3 -17
|
@@ -5,116 +5,87 @@ module Html2rss
|
|
|
5
5
|
##
|
|
6
6
|
# Extracts enclosures from HTML tags using various strategies.
|
|
7
7
|
class EnclosureExtractor
|
|
8
|
+
# CSS union query covering images, media, PDFs, iframes, and archives.
|
|
9
|
+
SELECTOR = [
|
|
10
|
+
'img[src]:not([src^="data"])',
|
|
11
|
+
'video source[src]',
|
|
12
|
+
'audio source[src]',
|
|
13
|
+
'audio[src]',
|
|
14
|
+
'a[href$=".pdf"]',
|
|
15
|
+
'iframe[src]',
|
|
16
|
+
'a[href$=".zip"]',
|
|
17
|
+
'a[href$=".tar.gz"]',
|
|
18
|
+
'a[href$=".tgz"]'
|
|
19
|
+
].join(',').freeze
|
|
20
|
+
|
|
8
21
|
# @param article_tag [Nokogiri::XML::Element] article container node
|
|
9
22
|
# @param base_url [String, Html2rss::Url] base URL for relative enclosure links
|
|
10
23
|
# @return [Array<Hash{Symbol => Object}>] normalized enclosure hashes
|
|
11
24
|
def self.call(article_tag, base_url)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
Extractors::Pdf,
|
|
16
|
-
Extractors::Iframe,
|
|
17
|
-
Extractors::Archive
|
|
18
|
-
].flat_map { |strategy| strategy.call(article_tag, base_url:) }
|
|
25
|
+
article_tag.css(SELECTOR).filter_map do |element|
|
|
26
|
+
extract_from_element(element, base_url)
|
|
27
|
+
end
|
|
19
28
|
end
|
|
20
|
-
end
|
|
21
29
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
src = img['src'].to_s
|
|
33
|
-
next if src.empty?
|
|
34
|
-
|
|
35
|
-
abs_url = Url.from_relative(src, base_url)
|
|
36
|
-
{
|
|
37
|
-
url: abs_url,
|
|
38
|
-
type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'image/jpeg')
|
|
39
|
-
}
|
|
40
|
-
end
|
|
30
|
+
def self.extract_from_element(element, base_url)
|
|
31
|
+
case element.name
|
|
32
|
+
when 'img'
|
|
33
|
+
extract_image(element, base_url)
|
|
34
|
+
when 'video', 'audio', 'source'
|
|
35
|
+
extract_media(element, base_url)
|
|
36
|
+
when 'iframe'
|
|
37
|
+
extract_iframe(element, base_url)
|
|
38
|
+
when 'a'
|
|
39
|
+
extract_a(element, base_url)
|
|
41
40
|
end
|
|
42
41
|
end
|
|
43
42
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
# @param base_url [String, Html2rss::Url] base URL for relative media sources
|
|
48
|
-
# @return [Array<Hash{Symbol => Object}>] media enclosure hashes
|
|
49
|
-
def self.call(article_tag, base_url:)
|
|
50
|
-
article_tag.css('video source[src], audio source[src], audio[src]').filter_map do |element|
|
|
51
|
-
src = element['src'].to_s
|
|
52
|
-
next if src.empty?
|
|
43
|
+
def self.extract_image(img, base_url)
|
|
44
|
+
src = img['src'].to_s
|
|
45
|
+
return if src.empty?
|
|
53
46
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
end
|
|
47
|
+
abs_url = Url.from_relative(src, base_url)
|
|
48
|
+
{
|
|
49
|
+
url: abs_url,
|
|
50
|
+
type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'image/jpeg')
|
|
51
|
+
}
|
|
60
52
|
end
|
|
61
53
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
# @param base_url [String, Html2rss::Url] base URL for relative PDF links
|
|
66
|
-
# @return [Array<Hash{Symbol => Object}>] PDF enclosure hashes
|
|
67
|
-
def self.call(article_tag, base_url:)
|
|
68
|
-
article_tag.css('a[href$=".pdf"]').filter_map do |link|
|
|
69
|
-
href = link['href'].to_s
|
|
70
|
-
next if href.empty?
|
|
54
|
+
def self.extract_media(element, base_url)
|
|
55
|
+
src = element['src'].to_s
|
|
56
|
+
return if src.empty?
|
|
71
57
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
}
|
|
77
|
-
end
|
|
78
|
-
end
|
|
58
|
+
{
|
|
59
|
+
url: Url.from_relative(src, base_url),
|
|
60
|
+
type: element['type']
|
|
61
|
+
}
|
|
79
62
|
end
|
|
80
63
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
# @param base_url [String, Html2rss::Url] base URL for relative iframe links
|
|
85
|
-
# @return [Array<Hash{Symbol => Object}>] iframe enclosure hashes
|
|
86
|
-
def self.call(article_tag, base_url:)
|
|
87
|
-
article_tag.css('iframe[src]').filter_map do |iframe|
|
|
88
|
-
src = iframe['src']
|
|
89
|
-
next if src.nil? || src.empty?
|
|
64
|
+
def self.extract_iframe(iframe, base_url)
|
|
65
|
+
src = iframe['src'].to_s
|
|
66
|
+
return if src.empty?
|
|
90
67
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
end
|
|
97
|
-
end
|
|
68
|
+
abs_url = Url.from_relative(src, base_url)
|
|
69
|
+
{
|
|
70
|
+
url: abs_url,
|
|
71
|
+
type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'text/html')
|
|
72
|
+
}
|
|
98
73
|
end
|
|
99
74
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
# @param base_url [String, Html2rss::Url] base URL for relative archive links
|
|
104
|
-
# @return [Array<Hash{Symbol => Object}>] archive enclosure hashes
|
|
105
|
-
def self.call(article_tag, base_url:)
|
|
106
|
-
article_tag.css('a[href$=".zip"], a[href$=".tar.gz"], a[href$=".tgz"]').filter_map do |link|
|
|
107
|
-
href = link['href'].to_s
|
|
108
|
-
next if href.empty?
|
|
75
|
+
def self.extract_a(link, base_url)
|
|
76
|
+
href = link['href'].to_s
|
|
77
|
+
return if href.empty?
|
|
109
78
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
79
|
+
abs_url = Url.from_relative(href, base_url)
|
|
80
|
+
|
|
81
|
+
if href.end_with?('.pdf')
|
|
82
|
+
{ url: abs_url, type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url) }
|
|
83
|
+
else
|
|
84
|
+
{ url: abs_url, type: 'application/zip' }
|
|
116
85
|
end
|
|
117
86
|
end
|
|
87
|
+
|
|
88
|
+
private_class_method :extract_from_element, :extract_image, :extract_media, :extract_iframe, :extract_a
|
|
118
89
|
end
|
|
119
90
|
end
|
|
120
91
|
end
|
|
@@ -75,17 +75,11 @@ module Html2rss
|
|
|
75
75
|
def each_anchor(anchor_filter:)
|
|
76
76
|
return enum_for(:each_anchor, anchor_filter:) unless block_given?
|
|
77
77
|
|
|
78
|
-
traversal_root&.
|
|
79
|
-
yield node if
|
|
78
|
+
traversal_root&.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR)&.each do |node|
|
|
79
|
+
yield node if anchor_filter.call(node)
|
|
80
80
|
end
|
|
81
81
|
end
|
|
82
82
|
|
|
83
|
-
def relevant_anchor?(node, anchor_filter:)
|
|
84
|
-
node.element? &&
|
|
85
|
-
node.matches?(HtmlExtractor::MAIN_ANCHOR_SELECTOR) &&
|
|
86
|
-
anchor_filter.call(node)
|
|
87
|
-
end
|
|
88
|
-
|
|
89
83
|
def traversal_root
|
|
90
84
|
parsed_body.at_css('body, html') || parsed_body.root
|
|
91
85
|
end
|
|
@@ -31,6 +31,8 @@ module Html2rss
|
|
|
31
31
|
|
|
32
32
|
# Shared context for all anchors in one semantic container.
|
|
33
33
|
class Context
|
|
34
|
+
attr_reader :container
|
|
35
|
+
|
|
34
36
|
# Ancestor tags that usually indicate navigation/utility regions.
|
|
35
37
|
UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze
|
|
36
38
|
|
|
@@ -56,7 +58,7 @@ module Html2rss
|
|
|
56
58
|
def visible_text(node)
|
|
57
59
|
return '' unless node
|
|
58
60
|
|
|
59
|
-
HtmlExtractor.extract_visible_text(node).to_s.strip
|
|
61
|
+
(@visible_texts ||= {}.compare_by_identity)[node] ||= HtmlExtractor.extract_visible_text(node).to_s.strip
|
|
60
62
|
end
|
|
61
63
|
|
|
62
64
|
# @param anchor [Nokogiri::XML::Node] anchor candidate
|
|
@@ -70,12 +72,6 @@ module Html2rss
|
|
|
70
72
|
def utility_text?(text)
|
|
71
73
|
@link_heuristics.utility_text?(text)
|
|
72
74
|
end
|
|
73
|
-
|
|
74
|
-
# @param ancestors [Array<Nokogiri::XML::Node>]
|
|
75
|
-
# @return [Boolean] true when the anchor lives inside navigation chrome
|
|
76
|
-
def utility_landmark?(ancestors)
|
|
77
|
-
ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
|
|
78
|
-
end
|
|
79
75
|
end
|
|
80
76
|
|
|
81
77
|
# One anchor plus the facts needed to decide whether it represents content.
|
|
@@ -131,7 +127,7 @@ module Html2rss
|
|
|
131
127
|
|
|
132
128
|
# @return [Boolean] true when visible anchor text has words
|
|
133
129
|
def meaningful_text?
|
|
134
|
-
text.
|
|
130
|
+
@meaningful_text ||= text.match?(/\p{Alnum}/)
|
|
135
131
|
end
|
|
136
132
|
|
|
137
133
|
# @return [Boolean] true when the destination route has content signals
|
|
@@ -142,8 +138,17 @@ module Html2rss
|
|
|
142
138
|
# @return [Boolean] true when the anchor is inside the selected heading
|
|
143
139
|
def heading_anchor?
|
|
144
140
|
heading = @context.heading
|
|
141
|
+
return false unless heading
|
|
142
|
+
|
|
143
|
+
curr = @anchor
|
|
144
|
+
container = @context.container
|
|
145
|
+
while curr.respond_to?(:parent)
|
|
146
|
+
return true if curr == heading
|
|
147
|
+
break if curr == container
|
|
145
148
|
|
|
146
|
-
|
|
149
|
+
curr = curr.parent
|
|
150
|
+
end
|
|
151
|
+
false
|
|
147
152
|
end
|
|
148
153
|
|
|
149
154
|
# @return [Boolean] true when anchor text exactly matches heading text
|
|
@@ -151,14 +156,14 @@ module Html2rss
|
|
|
151
156
|
heading_text = @context.heading_text
|
|
152
157
|
|
|
153
158
|
meaningful_text? &&
|
|
154
|
-
heading_text.
|
|
159
|
+
heading_text.match?(/\p{Alnum}/) &&
|
|
155
160
|
heading_text == text
|
|
156
161
|
end
|
|
157
162
|
|
|
158
163
|
private
|
|
159
164
|
|
|
160
165
|
def representative_content_anchor?
|
|
161
|
-
|
|
166
|
+
meaningful_text? || content_like_destination? || heading_anchor?
|
|
162
167
|
end
|
|
163
168
|
|
|
164
169
|
def utility_text_suppressed?
|
|
@@ -174,7 +179,19 @@ module Html2rss
|
|
|
174
179
|
def ineligible_anchor?
|
|
175
180
|
destination_facts.high_confidence_utility_destination ||
|
|
176
181
|
icon_only_anchor? ||
|
|
177
|
-
|
|
182
|
+
utility_landmark_ancestor?
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
def utility_landmark_ancestor?
|
|
186
|
+
curr = @anchor.parent
|
|
187
|
+
container = @context.container
|
|
188
|
+
while curr.respond_to?(:parent)
|
|
189
|
+
return true if Context::UTILITY_LANDMARK_TAGS.include?(curr.name)
|
|
190
|
+
break if curr == container
|
|
191
|
+
|
|
192
|
+
curr = curr.parent
|
|
193
|
+
end
|
|
194
|
+
false
|
|
178
195
|
end
|
|
179
196
|
|
|
180
197
|
def icon_only_anchor?
|
|
@@ -27,43 +27,17 @@ module Html2rss
|
|
|
27
27
|
|
|
28
28
|
# @return [Array<Nokogiri::XML::Node>] candidate semantic containers
|
|
29
29
|
def call
|
|
30
|
-
|
|
31
|
-
|
|
30
|
+
cache = {}.compare_by_identity
|
|
31
|
+
candidates = @parsed_body.css(SELECTORS.join(',')).reject do |node|
|
|
32
|
+
HtmlExtractor.ignored_container_path?(node, cache)
|
|
32
33
|
end
|
|
33
34
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
@document_order ||= begin
|
|
41
|
-
order = {}
|
|
42
|
-
index = 0
|
|
43
|
-
|
|
44
|
-
@parsed_body.traverse do |node|
|
|
45
|
-
next unless node.element?
|
|
46
|
-
|
|
47
|
-
order[node] = index
|
|
48
|
-
index += 1
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
order.compare_by_identity
|
|
52
|
-
end
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
def collect_selector_containers(selector, containers)
|
|
56
|
-
@parsed_body.css(selector).each do |container|
|
|
57
|
-
next if HtmlExtractor.ignored_container_path?(container)
|
|
58
|
-
next if seen[container]
|
|
59
|
-
|
|
60
|
-
seen[container] = true
|
|
61
|
-
containers << container
|
|
62
|
-
end
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
def seen
|
|
66
|
-
@seen ||= {}.compare_by_identity
|
|
35
|
+
# Preserve the original post-order traversal intent (specific-first)
|
|
36
|
+
# by sorting candidates by depth (descending) while keeping original document
|
|
37
|
+
# order for nodes at the same depth.
|
|
38
|
+
candidates.each_with_index
|
|
39
|
+
.sort_by { |node, index| [-node.ancestors.size, index] }
|
|
40
|
+
.map!(&:first)
|
|
67
41
|
end
|
|
68
42
|
end
|
|
69
43
|
end
|
|
@@ -4,15 +4,15 @@ module Html2rss
|
|
|
4
4
|
##
|
|
5
5
|
# HtmlExtractor is responsible for extracting details (headline, url, images, etc.)
|
|
6
6
|
# from an article_tag.
|
|
7
|
-
class HtmlExtractor
|
|
7
|
+
class HtmlExtractor # rubocop:disable Metrics/ClassLength
|
|
8
8
|
# Tags ignored when extracting visible text content from article containers.
|
|
9
9
|
INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
|
|
10
|
-
# Element path pattern ignored when traversing candidate article containers.
|
|
11
|
-
IGNORED_CONTAINER_PATH = /(nav|footer|header|svg|script|style)/i
|
|
12
10
|
# Heading tags used to prioritize title extraction.
|
|
13
11
|
HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
|
|
14
12
|
# Selector used to derive non-headline description nodes.
|
|
15
13
|
NON_HEADLINE_SELECTOR = (HEADING_TAGS.map { |tag| ":not(#{tag})" } + INVISIBLE_CONTENT_TAGS.to_a).freeze
|
|
14
|
+
# Element tags that indicate ignored DOM chrome when found in a container path.
|
|
15
|
+
IGNORED_CONTAINER_TAGS = %w[nav footer header svg script style].to_set.freeze
|
|
16
16
|
|
|
17
17
|
# Anchor selector used to identify the canonical article link element.
|
|
18
18
|
MAIN_ANCHOR_SELECTOR = begin
|
|
@@ -42,8 +42,39 @@ module Html2rss
|
|
|
42
42
|
parts.join(separator).squeeze(' ').strip unless parts.empty?
|
|
43
43
|
end
|
|
44
44
|
|
|
45
|
+
##
|
|
46
|
+
# @param article_tag [Nokogiri::XML::Node] article-like container to search within
|
|
47
|
+
# @return [Nokogiri::XML::Node, nil] first eligible descendant anchor
|
|
48
|
+
def main_anchor_for(article_tag)
|
|
49
|
+
return article_tag if article_tag.name == 'a' && article_tag.matches?(MAIN_ANCHOR_SELECTOR)
|
|
50
|
+
|
|
51
|
+
article_tag.at_css(MAIN_ANCHOR_SELECTOR)
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
##
|
|
55
|
+
# @param node [Nokogiri::XML::Node]
|
|
56
|
+
# @param cache [Hash, nil] identity cache used to store results (must use compare_by_identity)
|
|
57
|
+
# @return [Boolean] true when the node belongs to ignored DOM chrome
|
|
58
|
+
def ignored_container_path?(node, cache = nil)
|
|
59
|
+
return cache[node] if cache&.key?(node)
|
|
60
|
+
|
|
61
|
+
res = walk_ignored_container_path?(node)
|
|
62
|
+
cache[node] = res if cache
|
|
63
|
+
res
|
|
64
|
+
end
|
|
65
|
+
|
|
45
66
|
private
|
|
46
67
|
|
|
68
|
+
def walk_ignored_container_path?(node)
|
|
69
|
+
curr = node
|
|
70
|
+
while curr.respond_to?(:parent)
|
|
71
|
+
return true if IGNORED_CONTAINER_TAGS.include?(curr.name)
|
|
72
|
+
|
|
73
|
+
curr = curr.parent
|
|
74
|
+
end
|
|
75
|
+
false
|
|
76
|
+
end
|
|
77
|
+
|
|
47
78
|
def visible_child?(node)
|
|
48
79
|
!INVISIBLE_CONTENT_TAGS.include?(node.name) &&
|
|
49
80
|
!(node.name == 'a' && node['href']&.start_with?('#'))
|
|
@@ -80,26 +111,6 @@ module Html2rss
|
|
|
80
111
|
|
|
81
112
|
attr_reader :article_tag, :base_url, :selected_anchor
|
|
82
113
|
|
|
83
|
-
class << self
|
|
84
|
-
##
|
|
85
|
-
# @param article_tag [Nokogiri::XML::Node] article-like container to search within
|
|
86
|
-
# @return [Nokogiri::XML::Node, nil] first eligible descendant anchor
|
|
87
|
-
def main_anchor_for(article_tag)
|
|
88
|
-
return article_tag if article_tag.name == 'a' && article_tag.matches?(MAIN_ANCHOR_SELECTOR)
|
|
89
|
-
|
|
90
|
-
article_tag.at_css(MAIN_ANCHOR_SELECTOR)
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
##
|
|
94
|
-
# @param node [Nokogiri::XML::Node, String] node or path to test
|
|
95
|
-
# @return [Boolean] true when the node belongs to ignored DOM chrome
|
|
96
|
-
def ignored_container_path?(node)
|
|
97
|
-
path = node.respond_to?(:path) ? node.path : node.to_s
|
|
98
|
-
|
|
99
|
-
path.match?(IGNORED_CONTAINER_PATH)
|
|
100
|
-
end
|
|
101
|
-
end
|
|
102
|
-
|
|
103
114
|
def extract_url
|
|
104
115
|
@extract_url ||= begin
|
|
105
116
|
href = selected_anchor&.[]('href').to_s
|
|
@@ -115,14 +126,24 @@ module Html2rss
|
|
|
115
126
|
|
|
116
127
|
def heading
|
|
117
128
|
@heading ||= begin
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
129
|
+
tags = article_tag.css(HEADING_TAGS.join(','))
|
|
130
|
+
tags.any? ? select_best_heading(tags) : nil
|
|
131
|
+
end
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
def select_best_heading(tags)
|
|
135
|
+
min_tag_name = tags.map(&:name).min
|
|
136
|
+
best_tag = nil
|
|
137
|
+
max_size = -1
|
|
138
|
+
|
|
139
|
+
tags.each do |tag|
|
|
140
|
+
next if tag.name != min_tag_name
|
|
141
|
+
|
|
142
|
+
size = self.class.extract_visible_text(tag)&.size.to_i
|
|
143
|
+
(best_tag = tag) && (max_size = size) if size > max_size
|
|
125
144
|
end
|
|
145
|
+
|
|
146
|
+
best_tag
|
|
126
147
|
end
|
|
127
148
|
|
|
128
149
|
def extract_description
|
|
@@ -25,12 +25,12 @@ module Html2rss
|
|
|
25
25
|
# @param end_of_range [Integer] Optional, defaults to half the text length
|
|
26
26
|
# @return [String]
|
|
27
27
|
def self.remove_pattern_from_start(text, pattern, end_of_range: (text.size * 0.5).to_i)
|
|
28
|
-
return text unless text.is_a?(String) && pattern.is_a?(String)
|
|
28
|
+
return text unless text.is_a?(String) && pattern.is_a?(String) && !pattern.empty?
|
|
29
29
|
|
|
30
30
|
index = text.index(pattern)
|
|
31
|
-
return text if index.nil? || index
|
|
31
|
+
return text if index.nil? || index > end_of_range
|
|
32
32
|
|
|
33
|
-
text
|
|
33
|
+
"#{text[0, index]}#{text[(index + pattern.size)..]}"
|
|
34
34
|
end
|
|
35
35
|
|
|
36
36
|
# @param base [String] The base text content for the description
|
|
@@ -9,6 +9,7 @@ module Html2rss
|
|
|
9
9
|
##
|
|
10
10
|
# Article is a simple data object representing an article extracted from a page.
|
|
11
11
|
# It is enumerable and responds to all keys specified in PROVIDED_KEYS.
|
|
12
|
+
# rubocop:disable Metrics/ClassLength
|
|
12
13
|
class Article
|
|
13
14
|
include Enumerable
|
|
14
15
|
include Comparable
|
|
@@ -17,6 +18,11 @@ module Html2rss
|
|
|
17
18
|
PROVIDED_KEYS = %i[id title description url image author guid published_at enclosures categories scraper].freeze
|
|
18
19
|
# Separator used to build deterministic deduplication fingerprints.
|
|
19
20
|
DEDUP_FINGERPRINT_SEPARATOR = '#!/'
|
|
21
|
+
# Sentinel object used to pre-initialize instance variables in the constructor.
|
|
22
|
+
# This ensures all Article instances share the exact same object shape (Ruby 3.3+ optimization),
|
|
23
|
+
# preventing performance warnings and slower instance variable access due to shape transitions
|
|
24
|
+
# when attributes are lazily/conditionally accessed in different sequences.
|
|
25
|
+
NOT_SET = Object.new.freeze
|
|
20
26
|
|
|
21
27
|
# @param options [Hash{Symbol => String}]
|
|
22
28
|
# @option options [String] :id stable article identifier
|
|
@@ -31,9 +37,9 @@ module Html2rss
|
|
|
31
37
|
# @option options [Array<String>] :categories category labels
|
|
32
38
|
# @option options [Class] :scraper scraper class that produced the article
|
|
33
39
|
def initialize(**options)
|
|
34
|
-
@to_h = {}
|
|
35
|
-
|
|
36
|
-
@
|
|
40
|
+
@to_h = options.each_with_object({}) { |(k, v), h| h[k] = v.freeze if v }.freeze
|
|
41
|
+
|
|
42
|
+
@description = @url = @image = @guid = @enclosures = @enclosure = @categories = @published_at = NOT_SET
|
|
37
43
|
|
|
38
44
|
return unless (unknown_keys = options.keys - PROVIDED_KEYS).any?
|
|
39
45
|
|
|
@@ -62,7 +68,9 @@ module Html2rss
|
|
|
62
68
|
|
|
63
69
|
# @return [String] rendered article description
|
|
64
70
|
def description
|
|
65
|
-
@description
|
|
71
|
+
return @description unless @description == NOT_SET
|
|
72
|
+
|
|
73
|
+
@description = Rendering::DescriptionBuilder.new(
|
|
66
74
|
base: @to_h[:description],
|
|
67
75
|
title:,
|
|
68
76
|
url:,
|
|
@@ -73,12 +81,16 @@ module Html2rss
|
|
|
73
81
|
|
|
74
82
|
# @return [Url, nil]
|
|
75
83
|
def url
|
|
76
|
-
@url
|
|
84
|
+
return @url unless @url == NOT_SET
|
|
85
|
+
|
|
86
|
+
@url = Url.sanitize(@to_h[:url])
|
|
77
87
|
end
|
|
78
88
|
|
|
79
89
|
# @return [Url, nil]
|
|
80
90
|
def image
|
|
81
|
-
@image
|
|
91
|
+
return @image unless @image == NOT_SET
|
|
92
|
+
|
|
93
|
+
@image = Url.sanitize(@to_h[:image])
|
|
82
94
|
end
|
|
83
95
|
|
|
84
96
|
# @return [String, nil]
|
|
@@ -87,7 +99,9 @@ module Html2rss
|
|
|
87
99
|
# Generates a unique identifier based on the URL and ID using CRC32.
|
|
88
100
|
# @return [String]
|
|
89
101
|
def guid
|
|
90
|
-
@guid
|
|
102
|
+
return @guid unless @guid == NOT_SET
|
|
103
|
+
|
|
104
|
+
@guid = Zlib.crc32(fetch_guid).to_s(36).encode('utf-8')
|
|
91
105
|
end
|
|
92
106
|
|
|
93
107
|
##
|
|
@@ -100,27 +114,32 @@ module Html2rss
|
|
|
100
114
|
|
|
101
115
|
# @return [Array<Html2rss::RssBuilder::Enclosure>] normalized enclosure objects
|
|
102
116
|
def enclosures
|
|
103
|
-
@enclosures
|
|
104
|
-
|
|
117
|
+
return @enclosures unless @enclosures == NOT_SET
|
|
118
|
+
|
|
119
|
+
@enclosures = Array(@to_h[:enclosures])
|
|
120
|
+
.map { |enclosure| Html2rss::RssBuilder::Enclosure.new(**enclosure) }
|
|
105
121
|
end
|
|
106
122
|
|
|
107
123
|
# @return [Html2rss::RssBuilder::Enclosure, nil]
|
|
108
124
|
def enclosure
|
|
109
|
-
return @enclosure
|
|
110
|
-
|
|
111
|
-
case (object = @to_h[:enclosures]&.first)
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
125
|
+
return @enclosure unless @enclosure == NOT_SET
|
|
126
|
+
|
|
127
|
+
@enclosure = case (object = @to_h[:enclosures]&.first)
|
|
128
|
+
when Hash
|
|
129
|
+
Html2rss::RssBuilder::Enclosure.new(**object)
|
|
130
|
+
when nil
|
|
131
|
+
Html2rss::RssBuilder::Enclosure.new(url: image) if image
|
|
132
|
+
else
|
|
133
|
+
Log.warn "Article: unknown enclosure type: #{object.class}"
|
|
134
|
+
nil
|
|
135
|
+
end
|
|
119
136
|
end
|
|
120
137
|
|
|
121
138
|
# @return [Array<String>] normalized, unique category names
|
|
122
139
|
def categories
|
|
123
|
-
@categories
|
|
140
|
+
return @categories unless @categories == NOT_SET
|
|
141
|
+
|
|
142
|
+
@categories = @to_h[:categories].dup.to_a.tap do |categories|
|
|
124
143
|
categories.map! { |category| category.to_s.strip }
|
|
125
144
|
categories.reject!(&:empty?)
|
|
126
145
|
categories.uniq!
|
|
@@ -130,11 +149,12 @@ module Html2rss
|
|
|
130
149
|
# Parses and returns the published_at time.
|
|
131
150
|
# @return [DateTime, nil]
|
|
132
151
|
def published_at
|
|
133
|
-
return
|
|
152
|
+
return @published_at unless @published_at == NOT_SET
|
|
134
153
|
|
|
135
|
-
@published_at
|
|
154
|
+
string = @to_h[:published_at].to_s.strip
|
|
155
|
+
@published_at = string.empty? ? nil : DateTime.parse(string)
|
|
136
156
|
rescue ArgumentError
|
|
137
|
-
nil
|
|
157
|
+
@published_at = nil
|
|
138
158
|
end
|
|
139
159
|
|
|
140
160
|
# @return [Class, nil] scraper class that produced this article
|
|
@@ -183,5 +203,6 @@ module Html2rss
|
|
|
183
203
|
value
|
|
184
204
|
end
|
|
185
205
|
end
|
|
206
|
+
# rubocop:enable Metrics/ClassLength
|
|
186
207
|
end
|
|
187
208
|
end
|
|
@@ -16,9 +16,11 @@ module Html2rss
|
|
|
16
16
|
def self.guess_content_type_from_url(url, default: 'application/octet-stream')
|
|
17
17
|
return default unless url
|
|
18
18
|
|
|
19
|
-
|
|
19
|
+
path = url.path
|
|
20
|
+
ext = File.extname(path)
|
|
21
|
+
ext = ext[1..] if ext.start_with?('.')
|
|
20
22
|
|
|
21
|
-
content_type = MIME::Types.type_for(
|
|
23
|
+
content_type = MIME::Types.type_for(ext)
|
|
22
24
|
content_type.first&.to_s || 'application/octet-stream'
|
|
23
25
|
end
|
|
24
26
|
|