html2rss 0.20.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/html2rss.gemspec +1 -2
- data/lib/html2rss/auto_source/scraper/html.rb +61 -16
- data/lib/html2rss/auto_source/scraper/json_state.rb +40 -27
- data/lib/html2rss/auto_source/scraper/link_heuristics.rb +85 -131
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +74 -28
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +31 -60
- data/lib/html2rss/auto_source/scraper/schema.rb +8 -2
- data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +4 -18
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +55 -11
- data/lib/html2rss/auto_source/scraper.rb +0 -3
- data/lib/html2rss/auto_source.rb +2 -11
- data/lib/html2rss/category_extractor.rb +54 -20
- data/lib/html2rss/config/class_methods.rb +9 -4
- data/lib/html2rss/config/validator.rb +1 -0
- data/lib/html2rss/config.rb +4 -1
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +60 -89
- data/lib/html2rss/html_extractor/list_candidates.rb +2 -8
- data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +29 -12
- data/lib/html2rss/html_extractor/semantic_containers.rb +9 -35
- data/lib/html2rss/html_extractor.rb +51 -30
- data/lib/html2rss/rendering/description_builder.rb +3 -3
- data/lib/html2rss/request_controls.rb +13 -3
- data/lib/html2rss/request_service/policy.rb +3 -3
- data/lib/html2rss/request_session/runtime_policy.rb +2 -1
- data/lib/html2rss/rss_builder/article.rb +44 -23
- data/lib/html2rss/rss_builder/enclosure.rb +4 -2
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +25 -36
- data/lib/html2rss/selectors/post_processors/substring.rb +11 -18
- data/lib/html2rss/selectors/post_processors/template.rb +3 -2
- data/lib/html2rss/selectors.rb +18 -4
- data/lib/html2rss/url.rb +4 -3
- data/lib/html2rss/version.rb +1 -1
- data/schema/html2rss-config.schema.json +7 -0
- metadata +3 -17
|
@@ -132,10 +132,7 @@ module Html2rss
|
|
|
132
132
|
def default_config
|
|
133
133
|
{
|
|
134
134
|
strategy: default_strategy_name,
|
|
135
|
-
request:
|
|
136
|
-
max_redirects: RequestService::Policy::DEFAULTS[:max_redirects],
|
|
137
|
-
max_requests: RequestService::Policy::DEFAULTS[:max_requests]
|
|
138
|
-
},
|
|
135
|
+
request: default_request_config,
|
|
139
136
|
channel: { time_zone: 'UTC' },
|
|
140
137
|
headers: RequestHeaders.browser_defaults,
|
|
141
138
|
stylesheets: Html2rss.configuration.stylesheets || []
|
|
@@ -149,6 +146,14 @@ module Html2rss
|
|
|
149
146
|
|
|
150
147
|
private
|
|
151
148
|
|
|
149
|
+
def default_request_config
|
|
150
|
+
{
|
|
151
|
+
max_redirects: RequestService::Policy::DEFAULTS[:max_redirects],
|
|
152
|
+
max_requests: RequestService::Policy::DEFAULTS[:max_requests],
|
|
153
|
+
total_timeout_seconds: RequestService::Policy::DEFAULTS[:total_timeout_seconds]
|
|
154
|
+
}
|
|
155
|
+
end
|
|
156
|
+
|
|
152
157
|
def resolve_effective_config(config, params:)
|
|
153
158
|
effective_config = HashUtil.deep_symbolize_keys(config, context: 'config')
|
|
154
159
|
resolved_params = parameter_defaults(effective_config)
|
|
@@ -80,6 +80,7 @@ module Html2rss
|
|
|
80
80
|
RequestConfig = Dry::Schema.Params do
|
|
81
81
|
optional(:max_redirects).filled(:integer, gteq?: 0)
|
|
82
82
|
optional(:max_requests).filled(:integer, gt?: 0)
|
|
83
|
+
optional(:total_timeout_seconds).filled(:integer, gt?: 0)
|
|
83
84
|
optional(:browserless).hash(BrowserlessRequestConfig)
|
|
84
85
|
optional(:botasaurus).hash(BotasaurusRequestConfig)
|
|
85
86
|
end
|
data/lib/html2rss/config.rb
CHANGED
|
@@ -31,7 +31,8 @@ module Html2rss
|
|
|
31
31
|
@request_controls = request_controls.with_effective_values(
|
|
32
32
|
strategy: validated_config[:strategy],
|
|
33
33
|
max_redirects: validated_config.dig(:request, :max_redirects),
|
|
34
|
-
max_requests: validated_config.dig(:request, :max_requests)
|
|
34
|
+
max_requests: validated_config.dig(:request, :max_requests),
|
|
35
|
+
total_timeout_seconds: validated_config.dig(:request, :total_timeout_seconds)
|
|
35
36
|
)
|
|
36
37
|
end
|
|
37
38
|
|
|
@@ -41,6 +42,8 @@ module Html2rss
|
|
|
41
42
|
def max_redirects = request_controls.max_redirects
|
|
42
43
|
# @return [Integer, nil] configured request budget
|
|
43
44
|
def max_requests = request_controls.max_requests
|
|
45
|
+
# @return [Integer, nil] configured request timeout
|
|
46
|
+
def total_timeout_seconds = request_controls.total_timeout_seconds
|
|
44
47
|
# @return [Array<Hash>] stylesheet definitions
|
|
45
48
|
def stylesheets = config[:stylesheets]
|
|
46
49
|
|
|
@@ -5,116 +5,87 @@ module Html2rss
|
|
|
5
5
|
##
|
|
6
6
|
# Extracts enclosures from HTML tags using various strategies.
|
|
7
7
|
class EnclosureExtractor
|
|
8
|
+
# CSS union query covering images, media, PDFs, iframes, and archives.
|
|
9
|
+
SELECTOR = [
|
|
10
|
+
'img[src]:not([src^="data"])',
|
|
11
|
+
'video source[src]',
|
|
12
|
+
'audio source[src]',
|
|
13
|
+
'audio[src]',
|
|
14
|
+
'a[href$=".pdf"]',
|
|
15
|
+
'iframe[src]',
|
|
16
|
+
'a[href$=".zip"]',
|
|
17
|
+
'a[href$=".tar.gz"]',
|
|
18
|
+
'a[href$=".tgz"]'
|
|
19
|
+
].join(',').freeze
|
|
20
|
+
|
|
8
21
|
# @param article_tag [Nokogiri::XML::Element] article container node
|
|
9
22
|
# @param base_url [String, Html2rss::Url] base URL for relative enclosure links
|
|
10
23
|
# @return [Array<Hash{Symbol => Object}>] normalized enclosure hashes
|
|
11
24
|
def self.call(article_tag, base_url)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
Extractors::Pdf,
|
|
16
|
-
Extractors::Iframe,
|
|
17
|
-
Extractors::Archive
|
|
18
|
-
].flat_map { |strategy| strategy.call(article_tag, base_url:) }
|
|
25
|
+
article_tag.css(SELECTOR).filter_map do |element|
|
|
26
|
+
extract_from_element(element, base_url)
|
|
27
|
+
end
|
|
19
28
|
end
|
|
20
|
-
end
|
|
21
29
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
src = img['src'].to_s
|
|
33
|
-
next if src.empty?
|
|
34
|
-
|
|
35
|
-
abs_url = Url.from_relative(src, base_url)
|
|
36
|
-
{
|
|
37
|
-
url: abs_url,
|
|
38
|
-
type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'image/jpeg')
|
|
39
|
-
}
|
|
40
|
-
end
|
|
30
|
+
def self.extract_from_element(element, base_url)
|
|
31
|
+
case element.name
|
|
32
|
+
when 'img'
|
|
33
|
+
extract_image(element, base_url)
|
|
34
|
+
when 'video', 'audio', 'source'
|
|
35
|
+
extract_media(element, base_url)
|
|
36
|
+
when 'iframe'
|
|
37
|
+
extract_iframe(element, base_url)
|
|
38
|
+
when 'a'
|
|
39
|
+
extract_a(element, base_url)
|
|
41
40
|
end
|
|
42
41
|
end
|
|
43
42
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
# @param base_url [String, Html2rss::Url] base URL for relative media sources
|
|
48
|
-
# @return [Array<Hash{Symbol => Object}>] media enclosure hashes
|
|
49
|
-
def self.call(article_tag, base_url:)
|
|
50
|
-
article_tag.css('video source[src], audio source[src], audio[src]').filter_map do |element|
|
|
51
|
-
src = element['src'].to_s
|
|
52
|
-
next if src.empty?
|
|
43
|
+
def self.extract_image(img, base_url)
|
|
44
|
+
src = img['src'].to_s
|
|
45
|
+
return if src.empty?
|
|
53
46
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
end
|
|
47
|
+
abs_url = Url.from_relative(src, base_url)
|
|
48
|
+
{
|
|
49
|
+
url: abs_url,
|
|
50
|
+
type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'image/jpeg')
|
|
51
|
+
}
|
|
60
52
|
end
|
|
61
53
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
# @param base_url [String, Html2rss::Url] base URL for relative PDF links
|
|
66
|
-
# @return [Array<Hash{Symbol => Object}>] PDF enclosure hashes
|
|
67
|
-
def self.call(article_tag, base_url:)
|
|
68
|
-
article_tag.css('a[href$=".pdf"]').filter_map do |link|
|
|
69
|
-
href = link['href'].to_s
|
|
70
|
-
next if href.empty?
|
|
54
|
+
def self.extract_media(element, base_url)
|
|
55
|
+
src = element['src'].to_s
|
|
56
|
+
return if src.empty?
|
|
71
57
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
}
|
|
77
|
-
end
|
|
78
|
-
end
|
|
58
|
+
{
|
|
59
|
+
url: Url.from_relative(src, base_url),
|
|
60
|
+
type: element['type']
|
|
61
|
+
}
|
|
79
62
|
end
|
|
80
63
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
# @param base_url [String, Html2rss::Url] base URL for relative iframe links
|
|
85
|
-
# @return [Array<Hash{Symbol => Object}>] iframe enclosure hashes
|
|
86
|
-
def self.call(article_tag, base_url:)
|
|
87
|
-
article_tag.css('iframe[src]').filter_map do |iframe|
|
|
88
|
-
src = iframe['src']
|
|
89
|
-
next if src.nil? || src.empty?
|
|
64
|
+
def self.extract_iframe(iframe, base_url)
|
|
65
|
+
src = iframe['src'].to_s
|
|
66
|
+
return if src.empty?
|
|
90
67
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
end
|
|
97
|
-
end
|
|
68
|
+
abs_url = Url.from_relative(src, base_url)
|
|
69
|
+
{
|
|
70
|
+
url: abs_url,
|
|
71
|
+
type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'text/html')
|
|
72
|
+
}
|
|
98
73
|
end
|
|
99
74
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
# @param base_url [String, Html2rss::Url] base URL for relative archive links
|
|
104
|
-
# @return [Array<Hash{Symbol => Object}>] archive enclosure hashes
|
|
105
|
-
def self.call(article_tag, base_url:)
|
|
106
|
-
article_tag.css('a[href$=".zip"], a[href$=".tar.gz"], a[href$=".tgz"]').filter_map do |link|
|
|
107
|
-
href = link['href'].to_s
|
|
108
|
-
next if href.empty?
|
|
75
|
+
def self.extract_a(link, base_url)
|
|
76
|
+
href = link['href'].to_s
|
|
77
|
+
return if href.empty?
|
|
109
78
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
79
|
+
abs_url = Url.from_relative(href, base_url)
|
|
80
|
+
|
|
81
|
+
if href.end_with?('.pdf')
|
|
82
|
+
{ url: abs_url, type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url) }
|
|
83
|
+
else
|
|
84
|
+
{ url: abs_url, type: 'application/zip' }
|
|
116
85
|
end
|
|
117
86
|
end
|
|
87
|
+
|
|
88
|
+
private_class_method :extract_from_element, :extract_image, :extract_media, :extract_iframe, :extract_a
|
|
118
89
|
end
|
|
119
90
|
end
|
|
120
91
|
end
|
|
@@ -75,17 +75,11 @@ module Html2rss
|
|
|
75
75
|
def each_anchor(anchor_filter:)
|
|
76
76
|
return enum_for(:each_anchor, anchor_filter:) unless block_given?
|
|
77
77
|
|
|
78
|
-
traversal_root&.
|
|
79
|
-
yield node if
|
|
78
|
+
traversal_root&.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR)&.each do |node|
|
|
79
|
+
yield node if anchor_filter.call(node)
|
|
80
80
|
end
|
|
81
81
|
end
|
|
82
82
|
|
|
83
|
-
def relevant_anchor?(node, anchor_filter:)
|
|
84
|
-
node.element? &&
|
|
85
|
-
node.matches?(HtmlExtractor::MAIN_ANCHOR_SELECTOR) &&
|
|
86
|
-
anchor_filter.call(node)
|
|
87
|
-
end
|
|
88
|
-
|
|
89
83
|
def traversal_root
|
|
90
84
|
parsed_body.at_css('body, html') || parsed_body.root
|
|
91
85
|
end
|
|
@@ -31,6 +31,8 @@ module Html2rss
|
|
|
31
31
|
|
|
32
32
|
# Shared context for all anchors in one semantic container.
|
|
33
33
|
class Context
|
|
34
|
+
attr_reader :container
|
|
35
|
+
|
|
34
36
|
# Ancestor tags that usually indicate navigation/utility regions.
|
|
35
37
|
UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze
|
|
36
38
|
|
|
@@ -56,7 +58,7 @@ module Html2rss
|
|
|
56
58
|
def visible_text(node)
|
|
57
59
|
return '' unless node
|
|
58
60
|
|
|
59
|
-
HtmlExtractor.extract_visible_text(node).to_s.strip
|
|
61
|
+
(@visible_texts ||= {}.compare_by_identity)[node] ||= HtmlExtractor.extract_visible_text(node).to_s.strip
|
|
60
62
|
end
|
|
61
63
|
|
|
62
64
|
# @param anchor [Nokogiri::XML::Node] anchor candidate
|
|
@@ -70,12 +72,6 @@ module Html2rss
|
|
|
70
72
|
def utility_text?(text)
|
|
71
73
|
@link_heuristics.utility_text?(text)
|
|
72
74
|
end
|
|
73
|
-
|
|
74
|
-
# @param ancestors [Array<Nokogiri::XML::Node>]
|
|
75
|
-
# @return [Boolean] true when the anchor lives inside navigation chrome
|
|
76
|
-
def utility_landmark?(ancestors)
|
|
77
|
-
ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
|
|
78
|
-
end
|
|
79
75
|
end
|
|
80
76
|
|
|
81
77
|
# One anchor plus the facts needed to decide whether it represents content.
|
|
@@ -131,7 +127,7 @@ module Html2rss
|
|
|
131
127
|
|
|
132
128
|
# @return [Boolean] true when visible anchor text has words
|
|
133
129
|
def meaningful_text?
|
|
134
|
-
text.
|
|
130
|
+
@meaningful_text ||= text.match?(/\p{Alnum}/)
|
|
135
131
|
end
|
|
136
132
|
|
|
137
133
|
# @return [Boolean] true when the destination route has content signals
|
|
@@ -142,8 +138,17 @@ module Html2rss
|
|
|
142
138
|
# @return [Boolean] true when the anchor is inside the selected heading
|
|
143
139
|
def heading_anchor?
|
|
144
140
|
heading = @context.heading
|
|
141
|
+
return false unless heading
|
|
142
|
+
|
|
143
|
+
curr = @anchor
|
|
144
|
+
container = @context.container
|
|
145
|
+
while curr.respond_to?(:parent)
|
|
146
|
+
return true if curr == heading
|
|
147
|
+
break if curr == container
|
|
145
148
|
|
|
146
|
-
|
|
149
|
+
curr = curr.parent
|
|
150
|
+
end
|
|
151
|
+
false
|
|
147
152
|
end
|
|
148
153
|
|
|
149
154
|
# @return [Boolean] true when anchor text exactly matches heading text
|
|
@@ -151,14 +156,14 @@ module Html2rss
|
|
|
151
156
|
heading_text = @context.heading_text
|
|
152
157
|
|
|
153
158
|
meaningful_text? &&
|
|
154
|
-
heading_text.
|
|
159
|
+
heading_text.match?(/\p{Alnum}/) &&
|
|
155
160
|
heading_text == text
|
|
156
161
|
end
|
|
157
162
|
|
|
158
163
|
private
|
|
159
164
|
|
|
160
165
|
def representative_content_anchor?
|
|
161
|
-
|
|
166
|
+
meaningful_text? || content_like_destination? || heading_anchor?
|
|
162
167
|
end
|
|
163
168
|
|
|
164
169
|
def utility_text_suppressed?
|
|
@@ -174,7 +179,19 @@ module Html2rss
|
|
|
174
179
|
def ineligible_anchor?
|
|
175
180
|
destination_facts.high_confidence_utility_destination ||
|
|
176
181
|
icon_only_anchor? ||
|
|
177
|
-
|
|
182
|
+
utility_landmark_ancestor?
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
def utility_landmark_ancestor?
|
|
186
|
+
curr = @anchor.parent
|
|
187
|
+
container = @context.container
|
|
188
|
+
while curr.respond_to?(:parent)
|
|
189
|
+
return true if Context::UTILITY_LANDMARK_TAGS.include?(curr.name)
|
|
190
|
+
break if curr == container
|
|
191
|
+
|
|
192
|
+
curr = curr.parent
|
|
193
|
+
end
|
|
194
|
+
false
|
|
178
195
|
end
|
|
179
196
|
|
|
180
197
|
def icon_only_anchor?
|
|
@@ -27,43 +27,17 @@ module Html2rss
|
|
|
27
27
|
|
|
28
28
|
# @return [Array<Nokogiri::XML::Node>] candidate semantic containers
|
|
29
29
|
def call
|
|
30
|
-
|
|
31
|
-
|
|
30
|
+
cache = {}.compare_by_identity
|
|
31
|
+
candidates = @parsed_body.css(SELECTORS.join(',')).reject do |node|
|
|
32
|
+
HtmlExtractor.ignored_container_path?(node, cache)
|
|
32
33
|
end
|
|
33
34
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
@document_order ||= begin
|
|
41
|
-
order = {}
|
|
42
|
-
index = 0
|
|
43
|
-
|
|
44
|
-
@parsed_body.traverse do |node|
|
|
45
|
-
next unless node.element?
|
|
46
|
-
|
|
47
|
-
order[node] = index
|
|
48
|
-
index += 1
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
order.compare_by_identity
|
|
52
|
-
end
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
def collect_selector_containers(selector, containers)
|
|
56
|
-
@parsed_body.css(selector).each do |container|
|
|
57
|
-
next if HtmlExtractor.ignored_container_path?(container)
|
|
58
|
-
next if seen[container]
|
|
59
|
-
|
|
60
|
-
seen[container] = true
|
|
61
|
-
containers << container
|
|
62
|
-
end
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
def seen
|
|
66
|
-
@seen ||= {}.compare_by_identity
|
|
35
|
+
# Preserve the original post-order traversal intent (specific-first)
|
|
36
|
+
# by sorting candidates by depth (descending) while keeping original document
|
|
37
|
+
# order for nodes at the same depth.
|
|
38
|
+
candidates.each_with_index
|
|
39
|
+
.sort_by { |node, index| [-node.ancestors.size, index] }
|
|
40
|
+
.map!(&:first)
|
|
67
41
|
end
|
|
68
42
|
end
|
|
69
43
|
end
|
|
@@ -4,15 +4,15 @@ module Html2rss
|
|
|
4
4
|
##
|
|
5
5
|
# HtmlExtractor is responsible for extracting details (headline, url, images, etc.)
|
|
6
6
|
# from an article_tag.
|
|
7
|
-
class HtmlExtractor
|
|
7
|
+
class HtmlExtractor # rubocop:disable Metrics/ClassLength
|
|
8
8
|
# Tags ignored when extracting visible text content from article containers.
|
|
9
9
|
INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
|
|
10
|
-
# Element path pattern ignored when traversing candidate article containers.
|
|
11
|
-
IGNORED_CONTAINER_PATH = /(nav|footer|header|svg|script|style)/i
|
|
12
10
|
# Heading tags used to prioritize title extraction.
|
|
13
11
|
HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
|
|
14
12
|
# Selector used to derive non-headline description nodes.
|
|
15
13
|
NON_HEADLINE_SELECTOR = (HEADING_TAGS.map { |tag| ":not(#{tag})" } + INVISIBLE_CONTENT_TAGS.to_a).freeze
|
|
14
|
+
# Element tags that indicate ignored DOM chrome when found in a container path.
|
|
15
|
+
IGNORED_CONTAINER_TAGS = %w[nav footer header svg script style].to_set.freeze
|
|
16
16
|
|
|
17
17
|
# Anchor selector used to identify the canonical article link element.
|
|
18
18
|
MAIN_ANCHOR_SELECTOR = begin
|
|
@@ -42,8 +42,39 @@ module Html2rss
|
|
|
42
42
|
parts.join(separator).squeeze(' ').strip unless parts.empty?
|
|
43
43
|
end
|
|
44
44
|
|
|
45
|
+
##
|
|
46
|
+
# @param article_tag [Nokogiri::XML::Node] article-like container to search within
|
|
47
|
+
# @return [Nokogiri::XML::Node, nil] first eligible descendant anchor
|
|
48
|
+
def main_anchor_for(article_tag)
|
|
49
|
+
return article_tag if article_tag.name == 'a' && article_tag.matches?(MAIN_ANCHOR_SELECTOR)
|
|
50
|
+
|
|
51
|
+
article_tag.at_css(MAIN_ANCHOR_SELECTOR)
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
##
|
|
55
|
+
# @param node [Nokogiri::XML::Node]
|
|
56
|
+
# @param cache [Hash, nil] identity cache used to store results (must use compare_by_identity)
|
|
57
|
+
# @return [Boolean] true when the node belongs to ignored DOM chrome
|
|
58
|
+
def ignored_container_path?(node, cache = nil)
|
|
59
|
+
return cache[node] if cache&.key?(node)
|
|
60
|
+
|
|
61
|
+
res = walk_ignored_container_path?(node)
|
|
62
|
+
cache[node] = res if cache
|
|
63
|
+
res
|
|
64
|
+
end
|
|
65
|
+
|
|
45
66
|
private
|
|
46
67
|
|
|
68
|
+
def walk_ignored_container_path?(node)
|
|
69
|
+
curr = node
|
|
70
|
+
while curr.respond_to?(:parent)
|
|
71
|
+
return true if IGNORED_CONTAINER_TAGS.include?(curr.name)
|
|
72
|
+
|
|
73
|
+
curr = curr.parent
|
|
74
|
+
end
|
|
75
|
+
false
|
|
76
|
+
end
|
|
77
|
+
|
|
47
78
|
def visible_child?(node)
|
|
48
79
|
!INVISIBLE_CONTENT_TAGS.include?(node.name) &&
|
|
49
80
|
!(node.name == 'a' && node['href']&.start_with?('#'))
|
|
@@ -80,26 +111,6 @@ module Html2rss
|
|
|
80
111
|
|
|
81
112
|
attr_reader :article_tag, :base_url, :selected_anchor
|
|
82
113
|
|
|
83
|
-
class << self
|
|
84
|
-
##
|
|
85
|
-
# @param article_tag [Nokogiri::XML::Node] article-like container to search within
|
|
86
|
-
# @return [Nokogiri::XML::Node, nil] first eligible descendant anchor
|
|
87
|
-
def main_anchor_for(article_tag)
|
|
88
|
-
return article_tag if article_tag.name == 'a' && article_tag.matches?(MAIN_ANCHOR_SELECTOR)
|
|
89
|
-
|
|
90
|
-
article_tag.at_css(MAIN_ANCHOR_SELECTOR)
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
##
|
|
94
|
-
# @param node [Nokogiri::XML::Node, String] node or path to test
|
|
95
|
-
# @return [Boolean] true when the node belongs to ignored DOM chrome
|
|
96
|
-
def ignored_container_path?(node)
|
|
97
|
-
path = node.respond_to?(:path) ? node.path : node.to_s
|
|
98
|
-
|
|
99
|
-
path.match?(IGNORED_CONTAINER_PATH)
|
|
100
|
-
end
|
|
101
|
-
end
|
|
102
|
-
|
|
103
114
|
def extract_url
|
|
104
115
|
@extract_url ||= begin
|
|
105
116
|
href = selected_anchor&.[]('href').to_s
|
|
@@ -115,14 +126,24 @@ module Html2rss
|
|
|
115
126
|
|
|
116
127
|
def heading
|
|
117
128
|
@heading ||= begin
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
129
|
+
tags = article_tag.css(HEADING_TAGS.join(','))
|
|
130
|
+
tags.any? ? select_best_heading(tags) : nil
|
|
131
|
+
end
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
def select_best_heading(tags)
|
|
135
|
+
min_tag_name = tags.map(&:name).min
|
|
136
|
+
best_tag = nil
|
|
137
|
+
max_size = -1
|
|
138
|
+
|
|
139
|
+
tags.each do |tag|
|
|
140
|
+
next if tag.name != min_tag_name
|
|
141
|
+
|
|
142
|
+
size = self.class.extract_visible_text(tag)&.size.to_i
|
|
143
|
+
(best_tag = tag) && (max_size = size) if size > max_size
|
|
125
144
|
end
|
|
145
|
+
|
|
146
|
+
best_tag
|
|
126
147
|
end
|
|
127
148
|
|
|
128
149
|
def extract_description
|
|
@@ -25,12 +25,12 @@ module Html2rss
|
|
|
25
25
|
# @param end_of_range [Integer] Optional, defaults to half the text length
|
|
26
26
|
# @return [String]
|
|
27
27
|
def self.remove_pattern_from_start(text, pattern, end_of_range: (text.size * 0.5).to_i)
|
|
28
|
-
return text unless text.is_a?(String) && pattern.is_a?(String)
|
|
28
|
+
return text unless text.is_a?(String) && pattern.is_a?(String) && !pattern.empty?
|
|
29
29
|
|
|
30
30
|
index = text.index(pattern)
|
|
31
|
-
return text if index.nil? || index
|
|
31
|
+
return text if index.nil? || index > end_of_range
|
|
32
32
|
|
|
33
|
-
text
|
|
33
|
+
"#{text[0, index]}#{text[(index + pattern.size)..]}"
|
|
34
34
|
end
|
|
35
35
|
|
|
36
36
|
# @param base [String] The base text content for the description
|
|
@@ -7,7 +7,7 @@ module Html2rss
|
|
|
7
7
|
# Request-control keys accepted at the top level of feed config.
|
|
8
8
|
TOP_LEVEL_KEYS = %i[strategy].freeze
|
|
9
9
|
# Request-control keys accepted under the nested `request` config.
|
|
10
|
-
REQUEST_KEYS = %i[max_redirects max_requests].freeze
|
|
10
|
+
REQUEST_KEYS = %i[max_redirects max_requests total_timeout_seconds].freeze
|
|
11
11
|
|
|
12
12
|
##
|
|
13
13
|
# @param config [Hash{Symbol => Object}] raw config input
|
|
@@ -20,6 +20,7 @@ module Html2rss
|
|
|
20
20
|
strategy: config[:strategy],
|
|
21
21
|
max_redirects: request_value_for(config, :max_redirects),
|
|
22
22
|
max_requests: request_value_for(config, :max_requests),
|
|
23
|
+
total_timeout_seconds: request_value_for(config, :total_timeout_seconds),
|
|
23
24
|
explicit_keys: explicit_keys_for(config)
|
|
24
25
|
)
|
|
25
26
|
end
|
|
@@ -47,11 +48,13 @@ module Html2rss
|
|
|
47
48
|
# @param strategy [Symbol, nil] effective request strategy
|
|
48
49
|
# @param max_redirects [Integer, nil] effective redirect limit
|
|
49
50
|
# @param max_requests [Integer, nil] effective request budget
|
|
51
|
+
# @param total_timeout_seconds [Integer, nil] effective request timeout
|
|
50
52
|
# @param explicit_keys [Array<Symbol>] controls explicitly supplied by the caller
|
|
51
|
-
def initialize(strategy: nil, max_redirects: nil, max_requests: nil, explicit_keys: [])
|
|
53
|
+
def initialize(strategy: nil, max_redirects: nil, max_requests: nil, total_timeout_seconds: nil, explicit_keys: [])
|
|
52
54
|
@strategy = strategy
|
|
53
55
|
@max_redirects = max_redirects
|
|
54
56
|
@max_requests = max_requests
|
|
57
|
+
@total_timeout_seconds = total_timeout_seconds
|
|
55
58
|
@explicit_keys = explicit_keys.map(&:to_sym).uniq.freeze
|
|
56
59
|
freeze
|
|
57
60
|
end
|
|
@@ -68,6 +71,10 @@ module Html2rss
|
|
|
68
71
|
# @return [Integer, nil] effective request budget
|
|
69
72
|
attr_reader :max_requests
|
|
70
73
|
|
|
74
|
+
##
|
|
75
|
+
# @return [Integer, nil] effective request timeout
|
|
76
|
+
attr_reader :total_timeout_seconds
|
|
77
|
+
|
|
71
78
|
##
|
|
72
79
|
# @param name [Symbol, String] request control name
|
|
73
80
|
# @return [Boolean] whether the control was explicitly supplied
|
|
@@ -79,12 +86,14 @@ module Html2rss
|
|
|
79
86
|
# @param strategy [Symbol, nil] validated request strategy
|
|
80
87
|
# @param max_redirects [Integer, nil] validated redirect limit
|
|
81
88
|
# @param max_requests [Integer, nil] validated request budget
|
|
89
|
+
# @param total_timeout_seconds [Integer, nil] validated request timeout
|
|
82
90
|
# @return [RequestControls] controls updated with validated effective values
|
|
83
|
-
def with_effective_values(strategy:, max_redirects:, max_requests:)
|
|
91
|
+
def with_effective_values(strategy:, max_redirects:, max_requests:, total_timeout_seconds:)
|
|
84
92
|
self.class.new(
|
|
85
93
|
strategy:,
|
|
86
94
|
max_redirects:,
|
|
87
95
|
max_requests:,
|
|
96
|
+
total_timeout_seconds:,
|
|
88
97
|
explicit_keys:
|
|
89
98
|
)
|
|
90
99
|
end
|
|
@@ -98,6 +107,7 @@ module Html2rss
|
|
|
98
107
|
config[:strategy] = strategy if explicit?(:strategy)
|
|
99
108
|
apply_request_value(config, :max_redirects, max_redirects)
|
|
100
109
|
apply_request_value(config, :max_requests, max_requests)
|
|
110
|
+
apply_request_value(config, :total_timeout_seconds, total_timeout_seconds)
|
|
101
111
|
config
|
|
102
112
|
end
|
|
103
113
|
|
|
@@ -30,9 +30,9 @@ module Html2rss
|
|
|
30
30
|
|
|
31
31
|
# Default policy values used when request controls are not explicitly set.
|
|
32
32
|
DEFAULTS = {
|
|
33
|
-
connect_timeout_seconds: 5,
|
|
34
|
-
read_timeout_seconds: 10,
|
|
35
|
-
total_timeout_seconds: 30,
|
|
33
|
+
connect_timeout_seconds: Integer(ENV.fetch('HTML2RSS_CONNECT_TIMEOUT_SECONDS', 5)),
|
|
34
|
+
read_timeout_seconds: Integer(ENV.fetch('HTML2RSS_READ_TIMEOUT_SECONDS', 10)),
|
|
35
|
+
total_timeout_seconds: Integer(ENV.fetch('HTML2RSS_TOTAL_TIMEOUT_SECONDS', 30)),
|
|
36
36
|
max_redirects: 3,
|
|
37
37
|
max_response_bytes: 5_242_880,
|
|
38
38
|
max_decompressed_bytes: 10_485_760,
|
|
@@ -11,7 +11,8 @@ module Html2rss
|
|
|
11
11
|
def self.from_config(config)
|
|
12
12
|
RequestService::Policy.new(
|
|
13
13
|
max_requests: effective_max_requests_for(config),
|
|
14
|
-
max_redirects: config.max_redirects
|
|
14
|
+
max_redirects: config.max_redirects,
|
|
15
|
+
total_timeout_seconds: config.total_timeout_seconds || RequestService::Policy::DEFAULTS[:total_timeout_seconds]
|
|
15
16
|
)
|
|
16
17
|
end
|
|
17
18
|
|