html2rss 0.20.0 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. checksums.yaml +4 -4
  2. data/html2rss.gemspec +1 -2
  3. data/lib/html2rss/auto_source/scraper/html.rb +61 -16
  4. data/lib/html2rss/auto_source/scraper/json_state.rb +40 -27
  5. data/lib/html2rss/auto_source/scraper/link_heuristics.rb +85 -131
  6. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +74 -28
  7. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -2
  8. data/lib/html2rss/auto_source/scraper/schema/thing.rb +31 -60
  9. data/lib/html2rss/auto_source/scraper/schema.rb +8 -2
  10. data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +4 -18
  11. data/lib/html2rss/auto_source/scraper/semantic_html.rb +55 -11
  12. data/lib/html2rss/auto_source/scraper.rb +0 -3
  13. data/lib/html2rss/auto_source.rb +2 -11
  14. data/lib/html2rss/category_extractor.rb +54 -20
  15. data/lib/html2rss/config/class_methods.rb +9 -4
  16. data/lib/html2rss/config/validator.rb +1 -0
  17. data/lib/html2rss/config.rb +4 -1
  18. data/lib/html2rss/html_extractor/enclosure_extractor.rb +60 -89
  19. data/lib/html2rss/html_extractor/list_candidates.rb +2 -8
  20. data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +29 -12
  21. data/lib/html2rss/html_extractor/semantic_containers.rb +9 -35
  22. data/lib/html2rss/html_extractor.rb +51 -30
  23. data/lib/html2rss/rendering/description_builder.rb +3 -3
  24. data/lib/html2rss/request_controls.rb +13 -3
  25. data/lib/html2rss/request_service/policy.rb +3 -3
  26. data/lib/html2rss/request_session/runtime_policy.rb +2 -1
  27. data/lib/html2rss/rss_builder/article.rb +44 -23
  28. data/lib/html2rss/rss_builder/enclosure.rb +4 -2
  29. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +25 -36
  30. data/lib/html2rss/selectors/post_processors/substring.rb +11 -18
  31. data/lib/html2rss/selectors/post_processors/template.rb +3 -2
  32. data/lib/html2rss/selectors.rb +18 -4
  33. data/lib/html2rss/url.rb +4 -3
  34. data/lib/html2rss/version.rb +1 -1
  35. data/schema/html2rss-config.schema.json +7 -0
  36. metadata +3 -17
data/lib/html2rss/rss_builder/article.rb CHANGED
@@ -9,6 +9,7 @@ module Html2rss
9
9
  ##
10
10
  # Article is a simple data object representing an article extracted from a page.
11
11
  # It is enumerable and responds to all keys specified in PROVIDED_KEYS.
12
+ # rubocop:disable Metrics/ClassLength
12
13
  class Article
13
14
  include Enumerable
14
15
  include Comparable
@@ -17,6 +18,11 @@ module Html2rss
17
18
  PROVIDED_KEYS = %i[id title description url image author guid published_at enclosures categories scraper].freeze
18
19
  # Separator used to build deterministic deduplication fingerprints.
19
20
  DEDUP_FINGERPRINT_SEPARATOR = '#!/'
21
+ # Sentinel object used to pre-initialize instance variables in the constructor.
22
+ # This ensures all Article instances share the exact same object shape (Ruby 3.3+ optimization),
23
+ # preventing performance warnings and slower instance variable access due to shape transitions
24
+ # when attributes are lazily/conditionally accessed in different sequences.
25
+ NOT_SET = Object.new.freeze
20
26
 
21
27
  # @param options [Hash{Symbol => String}]
22
28
  # @option options [String] :id stable article identifier
@@ -31,9 +37,9 @@ module Html2rss
31
37
  # @option options [Array<String>] :categories category labels
32
38
  # @option options [Class] :scraper scraper class that produced the article
33
39
  def initialize(**options)
34
- @to_h = {}
35
- options.each_pair { |key, value| @to_h[key] = value.freeze if value }
36
- @to_h.freeze
40
+ @to_h = options.each_with_object({}) { |(k, v), h| h[k] = v.freeze if v }.freeze
41
+
42
+ @description = @url = @image = @guid = @enclosures = @enclosure = @categories = @published_at = NOT_SET
37
43
 
38
44
  return unless (unknown_keys = options.keys - PROVIDED_KEYS).any?
39
45
 
@@ -62,7 +68,9 @@ module Html2rss
62
68
 
63
69
  # @return [String] rendered article description
64
70
  def description
65
- @description ||= Rendering::DescriptionBuilder.new(
71
+ return @description unless @description == NOT_SET
72
+
73
+ @description = Rendering::DescriptionBuilder.new(
66
74
  base: @to_h[:description],
67
75
  title:,
68
76
  url:,
@@ -73,12 +81,16 @@ module Html2rss
73
81
 
74
82
  # @return [Url, nil]
75
83
  def url
76
- @url ||= Url.sanitize(@to_h[:url])
84
+ return @url unless @url == NOT_SET
85
+
86
+ @url = Url.sanitize(@to_h[:url])
77
87
  end
78
88
 
79
89
  # @return [Url, nil]
80
90
  def image
81
- @image ||= Url.sanitize(@to_h[:image])
91
+ return @image unless @image == NOT_SET
92
+
93
+ @image = Url.sanitize(@to_h[:image])
82
94
  end
83
95
 
84
96
  # @return [String, nil]
@@ -87,7 +99,9 @@ module Html2rss
87
99
  # Generates a unique identifier based on the URL and ID using CRC32.
88
100
  # @return [String]
89
101
  def guid
90
- @guid ||= Zlib.crc32(fetch_guid).to_s(36).encode('utf-8')
102
+ return @guid unless @guid == NOT_SET
103
+
104
+ @guid = Zlib.crc32(fetch_guid).to_s(36).encode('utf-8')
91
105
  end
92
106
 
93
107
  ##
@@ -100,27 +114,32 @@ module Html2rss
100
114
 
101
115
  # @return [Array<Html2rss::RssBuilder::Enclosure>] normalized enclosure objects
102
116
  def enclosures
103
- @enclosures ||= Array(@to_h[:enclosures])
104
- .map { |enclosure| Html2rss::RssBuilder::Enclosure.new(**enclosure) }
117
+ return @enclosures unless @enclosures == NOT_SET
118
+
119
+ @enclosures = Array(@to_h[:enclosures])
120
+ .map { |enclosure| Html2rss::RssBuilder::Enclosure.new(**enclosure) }
105
121
  end
106
122
 
107
123
  # @return [Html2rss::RssBuilder::Enclosure, nil]
108
124
  def enclosure
109
- return @enclosure if defined?(@enclosure)
110
-
111
- case (object = @to_h[:enclosures]&.first)
112
- when Hash
113
- @enclosure = Html2rss::RssBuilder::Enclosure.new(**object)
114
- when nil
115
- @enclosure = Html2rss::RssBuilder::Enclosure.new(url: image) if image
116
- else
117
- Log.warn "Article: unknown enclosure type: #{object.class}"
118
- end
125
+ return @enclosure unless @enclosure == NOT_SET
126
+
127
+ @enclosure = case (object = @to_h[:enclosures]&.first)
128
+ when Hash
129
+ Html2rss::RssBuilder::Enclosure.new(**object)
130
+ when nil
131
+ Html2rss::RssBuilder::Enclosure.new(url: image) if image
132
+ else
133
+ Log.warn "Article: unknown enclosure type: #{object.class}"
134
+ nil
135
+ end
119
136
  end
120
137
 
121
138
  # @return [Array<String>] normalized, unique category names
122
139
  def categories
123
- @categories ||= @to_h[:categories].dup.to_a.tap do |categories|
140
+ return @categories unless @categories == NOT_SET
141
+
142
+ @categories = @to_h[:categories].dup.to_a.tap do |categories|
124
143
  categories.map! { |category| category.to_s.strip }
125
144
  categories.reject!(&:empty?)
126
145
  categories.uniq!
@@ -130,11 +149,12 @@ module Html2rss
130
149
  # Parses and returns the published_at time.
131
150
  # @return [DateTime, nil]
132
151
  def published_at
133
- return if (string = @to_h[:published_at].to_s.strip).empty?
152
+ return @published_at unless @published_at == NOT_SET
134
153
 
135
- @published_at ||= DateTime.parse(string)
154
+ string = @to_h[:published_at].to_s.strip
155
+ @published_at = string.empty? ? nil : DateTime.parse(string)
136
156
  rescue ArgumentError
137
- nil
157
+ @published_at = nil
138
158
  end
139
159
 
140
160
  # @return [Class, nil] scraper class that produced this article
@@ -183,5 +203,6 @@ module Html2rss
183
203
  value
184
204
  end
185
205
  end
206
+ # rubocop:enable Metrics/ClassLength
186
207
  end
187
208
  end
data/lib/html2rss/rss_builder/enclosure.rb CHANGED
@@ -16,9 +16,11 @@ module Html2rss
16
16
  def self.guess_content_type_from_url(url, default: 'application/octet-stream')
17
17
  return default unless url
18
18
 
19
- url = url.path.split('?').first
19
+ path = url.path
20
+ ext = File.extname(path)
21
+ ext = ext[1..] if ext.start_with?('.')
20
22
 
21
- content_type = MIME::Types.type_for(File.extname(url).delete('.'))
23
+ content_type = MIME::Types.type_for(ext)
22
24
  content_type.first&.to_s || 'application/octet-stream'
23
25
  end
24
26
 
data/lib/html2rss/selectors/post_processors/sanitize_html.rb CHANGED
@@ -91,7 +91,6 @@ module Html2rss
91
91
  end
92
92
 
93
93
  ##
94
- # Shorthand method to get the sanitized HTML.
95
94
  # @param html [String]
96
95
  # @param url [String, Html2rss::Url]
97
96
  # @return [String, nil]
@@ -102,10 +101,34 @@ module Html2rss
102
101
  new(html, context).get
103
102
  end
104
103
 
104
+ ##
105
+ # @param channel_url [String, Html2rss::Url]
106
+ # @return [Hash] the memoized sanitize configuration
107
+ # rubocop:disable Metrics/MethodLength, ThreadSafety/ClassInstanceVariable
108
+ def self.sanitize_config(channel_url)
109
+ @sanitize_configs ||= {}
110
+ @sanitize_configs[channel_url] ||= begin
111
+ config = Sanitize::Config.merge(
112
+ Sanitize::Config::RELAXED,
113
+ attributes: { all: %w[dir lang alt title translate] },
114
+ add_attributes: TAG_ATTRIBUTES,
115
+ transformers: [
116
+ lambda { |env|
117
+ HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
118
+ },
119
+ ->(env) { HtmlTransformers::WrapImgInA.new.call(**env) }
120
+ ]
121
+ )
122
+ config[:elements].push('audio', 'video', 'source')
123
+ config.freeze
124
+ end
125
+ end
126
+ # rubocop:enable Metrics/MethodLength, ThreadSafety/ClassInstanceVariable
127
+
105
128
  ##
106
129
  # @return [String, nil]
107
130
  def get
108
- sanitized_html = Sanitize.fragment(value, sanitize_config).to_s
131
+ sanitized_html = Sanitize.fragment(value, self.class.sanitize_config(channel_url)).to_s
109
132
  sanitized_html.gsub!(/\s+/, ' ')
110
133
  sanitized_html.strip!
111
134
  sanitized_html.empty? ? nil : sanitized_html
@@ -114,40 +137,6 @@ module Html2rss
114
137
  private
115
138
 
116
139
  def channel_url = context.dig(:config, :channel, :url)
117
-
118
- ##
119
- # @return [Sanitize::Config]
120
- def sanitize_config # rubocop:disable Metrics/MethodLength
121
- config = Sanitize::Config.merge(
122
- Sanitize::Config::RELAXED,
123
- attributes: { all: %w[dir lang alt title translate] },
124
- add_attributes: TAG_ATTRIBUTES,
125
- transformers: [
126
- method(:transform_urls_to_absolute_ones),
127
- method(:wrap_img_in_a)
128
- ]
129
- )
130
- config[:elements].push('audio', 'video', 'source')
131
- config
132
- end
133
-
134
- ##
135
- # Wrapper for transform_urls_to_absolute_ones to pass the channel_url.
136
- #
137
- # @param env [Hash]
138
- # @return [nil]
139
- def transform_urls_to_absolute_ones(env)
140
- HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
141
- end
142
-
143
- ##
144
- # Wrapper for wrap_img_in_a.
145
- #
146
- # @param env [Hash]
147
- # @return [nil]
148
- def wrap_img_in_a(env)
149
- HtmlTransformers::WrapImgInA.new.call(**env)
150
- end
151
140
  end
152
141
  end
153
142
  end
data/lib/html2rss/selectors/post_processors/substring.rb CHANGED
@@ -34,19 +34,17 @@ module Html2rss
34
34
  # @param context [Selectors::Context] post-processor context
35
35
  # @return [void]
36
36
  def self.validate_args!(value, context)
37
- assert_type value, String, :value, context:
37
+ assert_type(value, String, :value, context:)
38
38
 
39
39
  options = context[:options]
40
- assert_type options[:start], Integer, :start, context:
41
-
42
- end_index = options[:end]
43
- assert_type(end_index, Integer, :end, context:) if end_index
40
+ assert_type(options[:start], Integer, :start, context:)
41
+ assert_type(options[:end], Integer, :end, context:) if options.key?(:end)
44
42
  end
45
43
 
46
44
  ##
47
45
  # Extracts the substring from the original string based on the provided start and end indices.
48
46
  #
49
- # @return [String] The extracted substring.
47
+ # @return [String, nil] The extracted substring.
50
48
  def get
51
49
  value[range]
52
50
  end
@@ -56,21 +54,16 @@ module Html2rss
56
54
  #
57
55
  # @return [Range] The range object representing the start and end/Infinity (integers).
58
56
  def range
59
- return (start_index..) unless end_index?
57
+ options = context[:options]
58
+ start = options[:start]
60
59
 
61
- if start_index == end_index
62
- raise ArgumentError,
63
- 'The `start` value must be unequal to the `end` value.'
64
- end
60
+ return (start..) unless options.key?(:end)
65
61
 
66
- (start_index..end_index)
67
- end
62
+ finish = options[:end]
63
+ raise ArgumentError, 'The `start` value must be unequal to the `end` value.' if start == finish
68
64
 
69
- private
70
-
71
- def end_index? = !context[:options][:end].to_s.empty?
72
- def end_index = context[:options][:end].to_i
73
- def start_index = context[:options][:start].to_i
65
+ (start..finish)
66
+ end
74
67
  end
75
68
  end
76
69
  end
data/lib/html2rss/selectors/post_processors/template.rb CHANGED
@@ -29,7 +29,7 @@ module Html2rss
29
29
  # selector: h1
30
30
  # post_process:
31
31
  # name: template
32
- # string: '%{self} (%{price})'
32
+ # string: '`%{self}` (`%{price}`)'
33
33
  #
34
34
  # Would return:
35
35
  # 'Product (23,42€)'
@@ -54,12 +54,13 @@ module Html2rss
54
54
  @scraper = context[:scraper]
55
55
  @item = context[:item]
56
56
  @string = @options[:string].to_s
57
+ @getter = ->(key) { item_value(key) }
57
58
  end
58
59
 
59
60
  ##
60
61
  # @return [String]
61
62
  def get
62
- Html2rss::Config::DynamicParams.call(@string, {}, getter: method(:item_value), replace_missing_with: '')
63
+ Html2rss::Config::DynamicParams.call(@string, {}, getter: @getter, replace_missing_with: '')
63
64
  end
64
65
 
65
66
  private
data/lib/html2rss/selectors.rb CHANGED
@@ -199,7 +199,10 @@ module Html2rss
199
199
  end
200
200
 
201
201
  def select_regular(_name, item:, config:, base_url:)
202
- value = Extractors.get(config.merge(channel: channel_context(base_url)), item)
202
+ @merged_configs ||= {}
203
+ merged_config = @merged_configs[[config.object_id, base_url]] ||=
204
+ config.merge(channel: channel_context(base_url)).freeze
205
+ value = Extractors.get(merged_config, item)
203
206
 
204
207
  if value && (post_process_steps = config[:post_process])
205
208
  steps = post_process_steps.is_a?(Array) ? post_process_steps : [post_process_steps]
@@ -210,8 +213,9 @@ module Html2rss
210
213
  end
211
214
 
212
215
  def post_process(item, value, post_process_steps, base_url:)
216
+ pp_context = channel_post_process_context(base_url)
213
217
  post_process_steps.each do |options|
214
- context = Context.new(config: { channel: { url: base_url, time_zone: @time_zone } },
218
+ context = Context.new(config: pp_context,
215
219
  item:, scraper: self, options:)
216
220
 
217
221
  value = PostProcessors.get(options[:name], value, context)
@@ -262,7 +266,11 @@ module Html2rss
262
266
  end
263
267
 
264
268
  def category_node_options(selector_config, base_url:)
265
- selector_config.merge(channel: channel_context(base_url), selector: nil)
269
+ @category_node_configs ||= {}
270
+ @category_node_configs[[selector_config.object_id, base_url]] ||= selector_config.merge(
271
+ channel: channel_context(base_url),
272
+ selector: nil
273
+ ).freeze
266
274
  end
267
275
 
268
276
  def apply_post_process_steps(item:, value:, post_process_steps:, base_url:)
@@ -288,7 +296,13 @@ module Html2rss
288
296
  end
289
297
 
290
298
  def channel_context(base_url)
291
- { url: base_url, time_zone: @time_zone }
299
+ @channel_contexts ||= {}
300
+ @channel_contexts[base_url] ||= { url: base_url, time_zone: @time_zone }.freeze
301
+ end
302
+
303
+ def channel_post_process_context(base_url)
304
+ @channel_pp_contexts ||= {}
305
+ @channel_pp_contexts[base_url] ||= { channel: channel_context(base_url) }.freeze
292
306
  end
293
307
 
294
308
  # @return [Hash] enclosure details.
data/lib/html2rss/url.rb CHANGED
@@ -54,8 +54,8 @@ module Html2rss
54
54
  # @param raw_url [String] the raw URL string to sanitize
55
55
  # @return [Url, nil] the sanitized URL, or nil if no valid URL found
56
56
  def self.sanitize(raw_url)
57
- matched_urls = raw_url.to_s.scan(%r{(?:(?:https?|ftp|mailto)://|mailto:)[^\s<>"]+})
58
- url = matched_urls.first.to_s.strip
57
+ match = raw_url.to_s.match(%r{(?:(?:https?|ftp|mailto)://|mailto:)[^\s<>"]+})
58
+ url = match ? match[0].strip : ''
59
59
  return nil if url.empty?
60
60
 
61
61
  new(Addressable::URI.parse(url).normalize)
@@ -125,6 +125,7 @@ module Html2rss
125
125
  # @param uri [Addressable::URI] the underlying Addressable::URI object (internal use only)
126
126
  def initialize(uri)
127
127
  @uri = uri.freeze
128
+ @path_segments = @uri.path.to_s.split('/').reject(&:empty?).freeze
128
129
  freeze
129
130
  end
130
131
 
@@ -162,7 +163,7 @@ module Html2rss
162
163
  # Returns the URL path split into non-empty segments.
163
164
  #
164
165
  # @return [Array<String>] normalized path segments
165
- def path_segments = @uri.path.to_s.split('/').reject(&:empty?)
166
+ attr_reader :path_segments
166
167
 
167
168
  ##
168
169
  # Returns a copy of the URL with the provided path.
data/lib/html2rss/version.rb CHANGED
@@ -4,6 +4,6 @@
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
6
  # Current application version.
7
- VERSION = '0.20.0'
7
+ VERSION = '0.21.0'
8
8
  public_constant :VERSION
9
9
  end
data/schema/html2rss-config.schema.json CHANGED
@@ -379,6 +379,13 @@
379
379
  },
380
380
  "exclusiveMinimum": 0
381
381
  },
382
+ "total_timeout_seconds": {
383
+ "type": "integer",
384
+ "not": {
385
+ "type": "null"
386
+ },
387
+ "exclusiveMinimum": 0
388
+ },
382
389
  "browserless": {
383
390
  "type": "object",
384
391
  "properties": {
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.20.0
4
+ version: 0.21.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
@@ -147,20 +147,6 @@ dependencies:
147
147
  - - "<"
148
148
  - !ruby/object:Gem::Version
149
149
  version: '2.0'
150
- - !ruby/object:Gem::Dependency
151
- name: parallel
152
- requirement: !ruby/object:Gem::Requirement
153
- requirements:
154
- - - ">="
155
- - !ruby/object:Gem::Version
156
- version: '0'
157
- type: :runtime
158
- prerelease: false
159
- version_requirements: !ruby/object:Gem::Requirement
160
- requirements:
161
- - - ">="
162
- - !ruby/object:Gem::Version
163
- version: '0'
164
150
  - !ruby/object:Gem::Dependency
165
151
  name: puppeteer-ruby
166
152
  requirement: !ruby/object:Gem::Requirement
@@ -390,7 +376,7 @@ licenses:
390
376
  - MIT
391
377
  metadata:
392
378
  allowed_push_host: https://rubygems.org
393
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.20.0
379
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.21.0
394
380
  rubygems_mfa_required: 'true'
395
381
  rdoc_options: []
396
382
  require_paths:
@@ -399,7 +385,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
399
385
  requirements:
400
386
  - - ">="
401
387
  - !ruby/object:Gem::Version
402
- version: '3.2'
388
+ version: '3.3'
403
389
  required_rubygems_version: !ruby/object:Gem::Requirement
404
390
  requirements:
405
391
  - - ">="