html2rss 0.20.0 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36):
  1. checksums.yaml +4 -4
  2. data/html2rss.gemspec +1 -2
  3. data/lib/html2rss/auto_source/scraper/html.rb +61 -16
  4. data/lib/html2rss/auto_source/scraper/json_state.rb +40 -27
  5. data/lib/html2rss/auto_source/scraper/link_heuristics.rb +85 -131
  6. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +74 -28
  7. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -2
  8. data/lib/html2rss/auto_source/scraper/schema/thing.rb +31 -60
  9. data/lib/html2rss/auto_source/scraper/schema.rb +8 -2
  10. data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +4 -18
  11. data/lib/html2rss/auto_source/scraper/semantic_html.rb +55 -11
  12. data/lib/html2rss/auto_source/scraper.rb +0 -3
  13. data/lib/html2rss/auto_source.rb +2 -11
  14. data/lib/html2rss/category_extractor.rb +54 -20
  15. data/lib/html2rss/config/class_methods.rb +9 -4
  16. data/lib/html2rss/config/validator.rb +1 -0
  17. data/lib/html2rss/config.rb +4 -1
  18. data/lib/html2rss/html_extractor/enclosure_extractor.rb +60 -89
  19. data/lib/html2rss/html_extractor/list_candidates.rb +2 -8
  20. data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +29 -12
  21. data/lib/html2rss/html_extractor/semantic_containers.rb +9 -35
  22. data/lib/html2rss/html_extractor.rb +51 -30
  23. data/lib/html2rss/rendering/description_builder.rb +3 -3
  24. data/lib/html2rss/request_controls.rb +13 -3
  25. data/lib/html2rss/request_service/policy.rb +3 -3
  26. data/lib/html2rss/request_session/runtime_policy.rb +2 -1
  27. data/lib/html2rss/rss_builder/article.rb +44 -23
  28. data/lib/html2rss/rss_builder/enclosure.rb +4 -2
  29. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +25 -36
  30. data/lib/html2rss/selectors/post_processors/substring.rb +11 -18
  31. data/lib/html2rss/selectors/post_processors/template.rb +3 -2
  32. data/lib/html2rss/selectors.rb +18 -4
  33. data/lib/html2rss/url.rb +4 -3
  34. data/lib/html2rss/version.rb +1 -1
  35. data/schema/html2rss-config.schema.json +7 -0
  36. metadata +3 -17
@@ -132,10 +132,7 @@ module Html2rss
132
132
  def default_config
133
133
  {
134
134
  strategy: default_strategy_name,
135
- request: {
136
- max_redirects: RequestService::Policy::DEFAULTS[:max_redirects],
137
- max_requests: RequestService::Policy::DEFAULTS[:max_requests]
138
- },
135
+ request: default_request_config,
139
136
  channel: { time_zone: 'UTC' },
140
137
  headers: RequestHeaders.browser_defaults,
141
138
  stylesheets: Html2rss.configuration.stylesheets || []
@@ -149,6 +146,14 @@ module Html2rss
149
146
 
150
147
  private
151
148
 
149
+ def default_request_config
150
+ {
151
+ max_redirects: RequestService::Policy::DEFAULTS[:max_redirects],
152
+ max_requests: RequestService::Policy::DEFAULTS[:max_requests],
153
+ total_timeout_seconds: RequestService::Policy::DEFAULTS[:total_timeout_seconds]
154
+ }
155
+ end
156
+
152
157
  def resolve_effective_config(config, params:)
153
158
  effective_config = HashUtil.deep_symbolize_keys(config, context: 'config')
154
159
  resolved_params = parameter_defaults(effective_config)
@@ -80,6 +80,7 @@ module Html2rss
80
80
  RequestConfig = Dry::Schema.Params do
81
81
  optional(:max_redirects).filled(:integer, gteq?: 0)
82
82
  optional(:max_requests).filled(:integer, gt?: 0)
83
+ optional(:total_timeout_seconds).filled(:integer, gt?: 0)
83
84
  optional(:browserless).hash(BrowserlessRequestConfig)
84
85
  optional(:botasaurus).hash(BotasaurusRequestConfig)
85
86
  end
@@ -31,7 +31,8 @@ module Html2rss
31
31
  @request_controls = request_controls.with_effective_values(
32
32
  strategy: validated_config[:strategy],
33
33
  max_redirects: validated_config.dig(:request, :max_redirects),
34
- max_requests: validated_config.dig(:request, :max_requests)
34
+ max_requests: validated_config.dig(:request, :max_requests),
35
+ total_timeout_seconds: validated_config.dig(:request, :total_timeout_seconds)
35
36
  )
36
37
  end
37
38
 
@@ -41,6 +42,8 @@ module Html2rss
41
42
  def max_redirects = request_controls.max_redirects
42
43
  # @return [Integer, nil] configured request budget
43
44
  def max_requests = request_controls.max_requests
45
+ # @return [Integer, nil] configured request timeout
46
+ def total_timeout_seconds = request_controls.total_timeout_seconds
44
47
  # @return [Array<Hash>] stylesheet definitions
45
48
  def stylesheets = config[:stylesheets]
46
49
 
@@ -5,116 +5,87 @@ module Html2rss
5
5
  ##
6
6
  # Extracts enclosures from HTML tags using various strategies.
7
7
  class EnclosureExtractor
8
+ # CSS union query covering images, media, PDFs, iframes, and archives.
9
+ SELECTOR = [
10
+ 'img[src]:not([src^="data"])',
11
+ 'video source[src]',
12
+ 'audio source[src]',
13
+ 'audio[src]',
14
+ 'a[href$=".pdf"]',
15
+ 'iframe[src]',
16
+ 'a[href$=".zip"]',
17
+ 'a[href$=".tar.gz"]',
18
+ 'a[href$=".tgz"]'
19
+ ].join(',').freeze
20
+
8
21
  # @param article_tag [Nokogiri::XML::Element] article container node
9
22
  # @param base_url [String, Html2rss::Url] base URL for relative enclosure links
10
23
  # @return [Array<Hash{Symbol => Object}>] normalized enclosure hashes
11
24
  def self.call(article_tag, base_url)
12
- [
13
- Extractors::Image,
14
- Extractors::Media,
15
- Extractors::Pdf,
16
- Extractors::Iframe,
17
- Extractors::Archive
18
- ].flat_map { |strategy| strategy.call(article_tag, base_url:) }
25
+ article_tag.css(SELECTOR).filter_map do |element|
26
+ extract_from_element(element, base_url)
27
+ end
19
28
  end
20
- end
21
29
 
22
- # Extraction strategies for enclosure-like media/link tags.
23
- module Extractors
24
- # Extracts image enclosures from HTML tags.
25
- # Finds all image sources and returns them in a format suitable for RSS.
26
- class Image
27
- # @param article_tag [Nokogiri::XML::Element] article container node
28
- # @param base_url [String, Html2rss::Url] base URL for relative image sources
29
- # @return [Array<Hash{Symbol => Object}>] image enclosure hashes
30
- def self.call(article_tag, base_url:)
31
- article_tag.css('img[src]:not([src^="data"])').filter_map do |img|
32
- src = img['src'].to_s
33
- next if src.empty?
34
-
35
- abs_url = Url.from_relative(src, base_url)
36
- {
37
- url: abs_url,
38
- type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'image/jpeg')
39
- }
40
- end
30
+ def self.extract_from_element(element, base_url)
31
+ case element.name
32
+ when 'img'
33
+ extract_image(element, base_url)
34
+ when 'video', 'audio', 'source'
35
+ extract_media(element, base_url)
36
+ when 'iframe'
37
+ extract_iframe(element, base_url)
38
+ when 'a'
39
+ extract_a(element, base_url)
41
40
  end
42
41
  end
43
42
 
44
- # Extracts media enclosures (video/audio) from HTML tags.
45
- class Media
46
- # @param article_tag [Nokogiri::XML::Element] article container node
47
- # @param base_url [String, Html2rss::Url] base URL for relative media sources
48
- # @return [Array<Hash{Symbol => Object}>] media enclosure hashes
49
- def self.call(article_tag, base_url:)
50
- article_tag.css('video source[src], audio source[src], audio[src]').filter_map do |element|
51
- src = element['src'].to_s
52
- next if src.empty?
43
+ def self.extract_image(img, base_url)
44
+ src = img['src'].to_s
45
+ return if src.empty?
53
46
 
54
- {
55
- url: Url.from_relative(src, base_url),
56
- type: element['type']
57
- }
58
- end
59
- end
47
+ abs_url = Url.from_relative(src, base_url)
48
+ {
49
+ url: abs_url,
50
+ type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'image/jpeg')
51
+ }
60
52
  end
61
53
 
62
- # Extracts PDF enclosures from HTML tags.
63
- class Pdf
64
- # @param article_tag [Nokogiri::XML::Element] article container node
65
- # @param base_url [String, Html2rss::Url] base URL for relative PDF links
66
- # @return [Array<Hash{Symbol => Object}>] PDF enclosure hashes
67
- def self.call(article_tag, base_url:)
68
- article_tag.css('a[href$=".pdf"]').filter_map do |link|
69
- href = link['href'].to_s
70
- next if href.empty?
54
+ def self.extract_media(element, base_url)
55
+ src = element['src'].to_s
56
+ return if src.empty?
71
57
 
72
- abs_url = Url.from_relative(href, base_url)
73
- {
74
- url: abs_url,
75
- type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url)
76
- }
77
- end
78
- end
58
+ {
59
+ url: Url.from_relative(src, base_url),
60
+ type: element['type']
61
+ }
79
62
  end
80
63
 
81
- # Extracts iframe enclosures from HTML tags.
82
- class Iframe
83
- # @param article_tag [Nokogiri::XML::Element] article container node
84
- # @param base_url [String, Html2rss::Url] base URL for relative iframe links
85
- # @return [Array<Hash{Symbol => Object}>] iframe enclosure hashes
86
- def self.call(article_tag, base_url:)
87
- article_tag.css('iframe[src]').filter_map do |iframe|
88
- src = iframe['src']
89
- next if src.nil? || src.empty?
64
+ def self.extract_iframe(iframe, base_url)
65
+ src = iframe['src'].to_s
66
+ return if src.empty?
90
67
 
91
- abs_url = Url.from_relative(src, base_url)
92
- {
93
- url: abs_url,
94
- type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'text/html')
95
- }
96
- end
97
- end
68
+ abs_url = Url.from_relative(src, base_url)
69
+ {
70
+ url: abs_url,
71
+ type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url, default: 'text/html')
72
+ }
98
73
  end
99
74
 
100
- # Extracts archive enclosures (zip, tar.gz, tgz) from HTML tags.
101
- class Archive
102
- # @param article_tag [Nokogiri::XML::Element] article container node
103
- # @param base_url [String, Html2rss::Url] base URL for relative archive links
104
- # @return [Array<Hash{Symbol => Object}>] archive enclosure hashes
105
- def self.call(article_tag, base_url:)
106
- article_tag.css('a[href$=".zip"], a[href$=".tar.gz"], a[href$=".tgz"]').filter_map do |link|
107
- href = link['href'].to_s
108
- next if href.empty?
75
+ def self.extract_a(link, base_url)
76
+ href = link['href'].to_s
77
+ return if href.empty?
109
78
 
110
- abs_url = Url.from_relative(href, base_url)
111
- {
112
- url: abs_url,
113
- type: 'application/zip'
114
- }
115
- end
79
+ abs_url = Url.from_relative(href, base_url)
80
+
81
+ if href.end_with?('.pdf')
82
+ { url: abs_url, type: RssBuilder::Enclosure.guess_content_type_from_url(abs_url) }
83
+ else
84
+ { url: abs_url, type: 'application/zip' }
116
85
  end
117
86
  end
87
+
88
+ private_class_method :extract_from_element, :extract_image, :extract_media, :extract_iframe, :extract_a
118
89
  end
119
90
  end
120
91
  end
@@ -75,17 +75,11 @@ module Html2rss
75
75
  def each_anchor(anchor_filter:)
76
76
  return enum_for(:each_anchor, anchor_filter:) unless block_given?
77
77
 
78
- traversal_root&.traverse do |node|
79
- yield node if relevant_anchor?(node, anchor_filter:)
78
+ traversal_root&.css(HtmlExtractor::MAIN_ANCHOR_SELECTOR)&.each do |node|
79
+ yield node if anchor_filter.call(node)
80
80
  end
81
81
  end
82
82
 
83
- def relevant_anchor?(node, anchor_filter:)
84
- node.element? &&
85
- node.matches?(HtmlExtractor::MAIN_ANCHOR_SELECTOR) &&
86
- anchor_filter.call(node)
87
- end
88
-
89
83
  def traversal_root
90
84
  parsed_body.at_css('body, html') || parsed_body.root
91
85
  end
@@ -31,6 +31,8 @@ module Html2rss
31
31
 
32
32
  # Shared context for all anchors in one semantic container.
33
33
  class Context
34
+ attr_reader :container
35
+
34
36
  # Ancestor tags that usually indicate navigation/utility regions.
35
37
  UTILITY_LANDMARK_TAGS = %w[nav aside footer menu].freeze
36
38
 
@@ -56,7 +58,7 @@ module Html2rss
56
58
  def visible_text(node)
57
59
  return '' unless node
58
60
 
59
- HtmlExtractor.extract_visible_text(node).to_s.strip
61
+ (@visible_texts ||= {}.compare_by_identity)[node] ||= HtmlExtractor.extract_visible_text(node).to_s.strip
60
62
  end
61
63
 
62
64
  # @param anchor [Nokogiri::XML::Node] anchor candidate
@@ -70,12 +72,6 @@ module Html2rss
70
72
  def utility_text?(text)
71
73
  @link_heuristics.utility_text?(text)
72
74
  end
73
-
74
- # @param ancestors [Array<Nokogiri::XML::Node>]
75
- # @return [Boolean] true when the anchor lives inside navigation chrome
76
- def utility_landmark?(ancestors)
77
- ancestors.any? { |node| UTILITY_LANDMARK_TAGS.include?(node.name) }
78
- end
79
75
  end
80
76
 
81
77
  # One anchor plus the facts needed to decide whether it represents content.
@@ -131,7 +127,7 @@ module Html2rss
131
127
 
132
128
  # @return [Boolean] true when visible anchor text has words
133
129
  def meaningful_text?
134
- text.scan(/\p{Alnum}+/).any?
130
+ @meaningful_text ||= text.match?(/\p{Alnum}/)
135
131
  end
136
132
 
137
133
  # @return [Boolean] true when the destination route has content signals
@@ -142,8 +138,17 @@ module Html2rss
142
138
  # @return [Boolean] true when the anchor is inside the selected heading
143
139
  def heading_anchor?
144
140
  heading = @context.heading
141
+ return false unless heading
142
+
143
+ curr = @anchor
144
+ container = @context.container
145
+ while curr.respond_to?(:parent)
146
+ return true if curr == heading
147
+ break if curr == container
145
148
 
146
- heading && @anchor.ancestors.include?(heading)
149
+ curr = curr.parent
150
+ end
151
+ false
147
152
  end
148
153
 
149
154
  # @return [Boolean] true when anchor text exactly matches heading text
@@ -151,14 +156,14 @@ module Html2rss
151
156
  heading_text = @context.heading_text
152
157
 
153
158
  meaningful_text? &&
154
- heading_text.scan(/\p{Alnum}+/).any? &&
159
+ heading_text.match?(/\p{Alnum}/) &&
155
160
  heading_text == text
156
161
  end
157
162
 
158
163
  private
159
164
 
160
165
  def representative_content_anchor?
161
- heading_anchor? || meaningful_text? || content_like_destination?
166
+ meaningful_text? || content_like_destination? || heading_anchor?
162
167
  end
163
168
 
164
169
  def utility_text_suppressed?
@@ -174,7 +179,19 @@ module Html2rss
174
179
  def ineligible_anchor?
175
180
  destination_facts.high_confidence_utility_destination ||
176
181
  icon_only_anchor? ||
177
- @context.utility_landmark?(@anchor.ancestors.to_a)
182
+ utility_landmark_ancestor?
183
+ end
184
+
185
+ def utility_landmark_ancestor?
186
+ curr = @anchor.parent
187
+ container = @context.container
188
+ while curr.respond_to?(:parent)
189
+ return true if Context::UTILITY_LANDMARK_TAGS.include?(curr.name)
190
+ break if curr == container
191
+
192
+ curr = curr.parent
193
+ end
194
+ false
178
195
  end
179
196
 
180
197
  def icon_only_anchor?
@@ -27,43 +27,17 @@ module Html2rss
27
27
 
28
28
  # @return [Array<Nokogiri::XML::Node>] candidate semantic containers
29
29
  def call
30
- containers = SELECTORS.each_with_object([]) do |selector, memo|
31
- collect_selector_containers(selector, memo)
30
+ cache = {}.compare_by_identity
31
+ candidates = @parsed_body.css(SELECTORS.join(',')).reject do |node|
32
+ HtmlExtractor.ignored_container_path?(node, cache)
32
33
  end
33
34
 
34
- containers.sort_by { document_order.fetch(_1) }
35
- end
36
-
37
- private
38
-
39
- def document_order
40
- @document_order ||= begin
41
- order = {}
42
- index = 0
43
-
44
- @parsed_body.traverse do |node|
45
- next unless node.element?
46
-
47
- order[node] = index
48
- index += 1
49
- end
50
-
51
- order.compare_by_identity
52
- end
53
- end
54
-
55
- def collect_selector_containers(selector, containers)
56
- @parsed_body.css(selector).each do |container|
57
- next if HtmlExtractor.ignored_container_path?(container)
58
- next if seen[container]
59
-
60
- seen[container] = true
61
- containers << container
62
- end
63
- end
64
-
65
- def seen
66
- @seen ||= {}.compare_by_identity
35
+ # Preserve the original post-order traversal intent (specific-first)
36
+ # by sorting candidates by depth (descending) while keeping original document
37
+ # order for nodes at the same depth.
38
+ candidates.each_with_index
39
+ .sort_by { |node, index| [-node.ancestors.size, index] }
40
+ .map!(&:first)
67
41
  end
68
42
  end
69
43
  end
@@ -4,15 +4,15 @@ module Html2rss
4
4
  ##
5
5
  # HtmlExtractor is responsible for extracting details (headline, url, images, etc.)
6
6
  # from an article_tag.
7
- class HtmlExtractor
7
+ class HtmlExtractor # rubocop:disable Metrics/ClassLength
8
8
  # Tags ignored when extracting visible text content from article containers.
9
9
  INVISIBLE_CONTENT_TAGS = %w[svg script noscript style template].to_set.freeze
10
- # Element path pattern ignored when traversing candidate article containers.
11
- IGNORED_CONTAINER_PATH = /(nav|footer|header|svg|script|style)/i
12
10
  # Heading tags used to prioritize title extraction.
13
11
  HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze
14
12
  # Selector used to derive non-headline description nodes.
15
13
  NON_HEADLINE_SELECTOR = (HEADING_TAGS.map { |tag| ":not(#{tag})" } + INVISIBLE_CONTENT_TAGS.to_a).freeze
14
+ # Element tags that indicate ignored DOM chrome when found in a container path.
15
+ IGNORED_CONTAINER_TAGS = %w[nav footer header svg script style].to_set.freeze
16
16
 
17
17
  # Anchor selector used to identify the canonical article link element.
18
18
  MAIN_ANCHOR_SELECTOR = begin
@@ -42,8 +42,39 @@ module Html2rss
42
42
  parts.join(separator).squeeze(' ').strip unless parts.empty?
43
43
  end
44
44
 
45
+ ##
46
+ # @param article_tag [Nokogiri::XML::Node] article-like container to search within
47
+ # @return [Nokogiri::XML::Node, nil] first eligible descendant anchor
48
+ def main_anchor_for(article_tag)
49
+ return article_tag if article_tag.name == 'a' && article_tag.matches?(MAIN_ANCHOR_SELECTOR)
50
+
51
+ article_tag.at_css(MAIN_ANCHOR_SELECTOR)
52
+ end
53
+
54
+ ##
55
+ # @param node [Nokogiri::XML::Node]
56
+ # @param cache [Hash, nil] identity cache used to store results (must use compare_by_identity)
57
+ # @return [Boolean] true when the node belongs to ignored DOM chrome
58
+ def ignored_container_path?(node, cache = nil)
59
+ return cache[node] if cache&.key?(node)
60
+
61
+ res = walk_ignored_container_path?(node)
62
+ cache[node] = res if cache
63
+ res
64
+ end
65
+
45
66
  private
46
67
 
68
+ def walk_ignored_container_path?(node)
69
+ curr = node
70
+ while curr.respond_to?(:parent)
71
+ return true if IGNORED_CONTAINER_TAGS.include?(curr.name)
72
+
73
+ curr = curr.parent
74
+ end
75
+ false
76
+ end
77
+
47
78
  def visible_child?(node)
48
79
  !INVISIBLE_CONTENT_TAGS.include?(node.name) &&
49
80
  !(node.name == 'a' && node['href']&.start_with?('#'))
@@ -80,26 +111,6 @@ module Html2rss
80
111
 
81
112
  attr_reader :article_tag, :base_url, :selected_anchor
82
113
 
83
- class << self
84
- ##
85
- # @param article_tag [Nokogiri::XML::Node] article-like container to search within
86
- # @return [Nokogiri::XML::Node, nil] first eligible descendant anchor
87
- def main_anchor_for(article_tag)
88
- return article_tag if article_tag.name == 'a' && article_tag.matches?(MAIN_ANCHOR_SELECTOR)
89
-
90
- article_tag.at_css(MAIN_ANCHOR_SELECTOR)
91
- end
92
-
93
- ##
94
- # @param node [Nokogiri::XML::Node, String] node or path to test
95
- # @return [Boolean] true when the node belongs to ignored DOM chrome
96
- def ignored_container_path?(node)
97
- path = node.respond_to?(:path) ? node.path : node.to_s
98
-
99
- path.match?(IGNORED_CONTAINER_PATH)
100
- end
101
- end
102
-
103
114
  def extract_url
104
115
  @extract_url ||= begin
105
116
  href = selected_anchor&.[]('href').to_s
@@ -115,14 +126,24 @@ module Html2rss
115
126
 
116
127
  def heading
117
128
  @heading ||= begin
118
- heading_tags = article_tag.css(HEADING_TAGS.join(',')).group_by(&:name)
119
- smallest_heading = heading_tags.keys.min
120
- if smallest_heading
121
- heading_tags[smallest_heading]&.max_by do |tag|
122
- self.class.extract_visible_text(tag)&.size.to_i
123
- end
124
- end
129
+ tags = article_tag.css(HEADING_TAGS.join(','))
130
+ tags.any? ? select_best_heading(tags) : nil
131
+ end
132
+ end
133
+
134
+ def select_best_heading(tags)
135
+ min_tag_name = tags.map(&:name).min
136
+ best_tag = nil
137
+ max_size = -1
138
+
139
+ tags.each do |tag|
140
+ next if tag.name != min_tag_name
141
+
142
+ size = self.class.extract_visible_text(tag)&.size.to_i
143
+ (best_tag = tag) && (max_size = size) if size > max_size
125
144
  end
145
+
146
+ best_tag
126
147
  end
127
148
 
128
149
  def extract_description
@@ -25,12 +25,12 @@ module Html2rss
25
25
  # @param end_of_range [Integer] Optional, defaults to half the text length
26
26
  # @return [String]
27
27
  def self.remove_pattern_from_start(text, pattern, end_of_range: (text.size * 0.5).to_i)
28
- return text unless text.is_a?(String) && pattern.is_a?(String)
28
+ return text unless text.is_a?(String) && pattern.is_a?(String) && !pattern.empty?
29
29
 
30
30
  index = text.index(pattern)
31
- return text if index.nil? || index >= end_of_range
31
+ return text if index.nil? || index > end_of_range
32
32
 
33
- text.gsub(/^(.{0,#{end_of_range}})#{Regexp.escape(pattern)}/, '\1')
33
+ "#{text[0, index]}#{text[(index + pattern.size)..]}"
34
34
  end
35
35
 
36
36
  # @param base [String] The base text content for the description
@@ -7,7 +7,7 @@ module Html2rss
7
7
  # Request-control keys accepted at the top level of feed config.
8
8
  TOP_LEVEL_KEYS = %i[strategy].freeze
9
9
  # Request-control keys accepted under the nested `request` config.
10
- REQUEST_KEYS = %i[max_redirects max_requests].freeze
10
+ REQUEST_KEYS = %i[max_redirects max_requests total_timeout_seconds].freeze
11
11
 
12
12
  ##
13
13
  # @param config [Hash{Symbol => Object}] raw config input
@@ -20,6 +20,7 @@ module Html2rss
20
20
  strategy: config[:strategy],
21
21
  max_redirects: request_value_for(config, :max_redirects),
22
22
  max_requests: request_value_for(config, :max_requests),
23
+ total_timeout_seconds: request_value_for(config, :total_timeout_seconds),
23
24
  explicit_keys: explicit_keys_for(config)
24
25
  )
25
26
  end
@@ -47,11 +48,13 @@ module Html2rss
47
48
  # @param strategy [Symbol, nil] effective request strategy
48
49
  # @param max_redirects [Integer, nil] effective redirect limit
49
50
  # @param max_requests [Integer, nil] effective request budget
51
+ # @param total_timeout_seconds [Integer, nil] effective request timeout
50
52
  # @param explicit_keys [Array<Symbol>] controls explicitly supplied by the caller
51
- def initialize(strategy: nil, max_redirects: nil, max_requests: nil, explicit_keys: [])
53
+ def initialize(strategy: nil, max_redirects: nil, max_requests: nil, total_timeout_seconds: nil, explicit_keys: [])
52
54
  @strategy = strategy
53
55
  @max_redirects = max_redirects
54
56
  @max_requests = max_requests
57
+ @total_timeout_seconds = total_timeout_seconds
55
58
  @explicit_keys = explicit_keys.map(&:to_sym).uniq.freeze
56
59
  freeze
57
60
  end
@@ -68,6 +71,10 @@ module Html2rss
68
71
  # @return [Integer, nil] effective request budget
69
72
  attr_reader :max_requests
70
73
 
74
+ ##
75
+ # @return [Integer, nil] effective request timeout
76
+ attr_reader :total_timeout_seconds
77
+
71
78
  ##
72
79
  # @param name [Symbol, String] request control name
73
80
  # @return [Boolean] whether the control was explicitly supplied
@@ -79,12 +86,14 @@ module Html2rss
79
86
  # @param strategy [Symbol, nil] validated request strategy
80
87
  # @param max_redirects [Integer, nil] validated redirect limit
81
88
  # @param max_requests [Integer, nil] validated request budget
89
+ # @param total_timeout_seconds [Integer, nil] validated request timeout
82
90
  # @return [RequestControls] controls updated with validated effective values
83
- def with_effective_values(strategy:, max_redirects:, max_requests:)
91
+ def with_effective_values(strategy:, max_redirects:, max_requests:, total_timeout_seconds:)
84
92
  self.class.new(
85
93
  strategy:,
86
94
  max_redirects:,
87
95
  max_requests:,
96
+ total_timeout_seconds:,
88
97
  explicit_keys:
89
98
  )
90
99
  end
@@ -98,6 +107,7 @@ module Html2rss
98
107
  config[:strategy] = strategy if explicit?(:strategy)
99
108
  apply_request_value(config, :max_redirects, max_redirects)
100
109
  apply_request_value(config, :max_requests, max_requests)
110
+ apply_request_value(config, :total_timeout_seconds, total_timeout_seconds)
101
111
  config
102
112
  end
103
113
 
@@ -30,9 +30,9 @@ module Html2rss
30
30
 
31
31
  # Default policy values used when request controls are not explicitly set.
32
32
  DEFAULTS = {
33
- connect_timeout_seconds: 5,
34
- read_timeout_seconds: 10,
35
- total_timeout_seconds: 30,
33
+ connect_timeout_seconds: Integer(ENV.fetch('HTML2RSS_CONNECT_TIMEOUT_SECONDS', 5)),
34
+ read_timeout_seconds: Integer(ENV.fetch('HTML2RSS_READ_TIMEOUT_SECONDS', 10)),
35
+ total_timeout_seconds: Integer(ENV.fetch('HTML2RSS_TOTAL_TIMEOUT_SECONDS', 30)),
36
36
  max_redirects: 3,
37
37
  max_response_bytes: 5_242_880,
38
38
  max_decompressed_bytes: 10_485_760,
@@ -11,7 +11,8 @@ module Html2rss
11
11
  def self.from_config(config)
12
12
  RequestService::Policy.new(
13
13
  max_requests: effective_max_requests_for(config),
14
- max_redirects: config.max_redirects
14
+ max_redirects: config.max_redirects,
15
+ total_timeout_seconds: config.total_timeout_seconds || RequestService::Policy::DEFAULTS[:total_timeout_seconds]
15
16
  )
16
17
  end
17
18