html2rss 0.20.1 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. checksums.yaml +4 -4
  2. data/html2rss.gemspec +1 -2
  3. data/lib/html2rss/auto_source/scraper/html.rb +61 -16
  4. data/lib/html2rss/auto_source/scraper/json_state.rb +40 -27
  5. data/lib/html2rss/auto_source/scraper/link_heuristics.rb +85 -131
  6. data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +74 -28
  7. data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -2
  8. data/lib/html2rss/auto_source/scraper/schema/thing.rb +31 -60
  9. data/lib/html2rss/auto_source/scraper/schema.rb +8 -2
  10. data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +4 -18
  11. data/lib/html2rss/auto_source/scraper/semantic_html.rb +55 -11
  12. data/lib/html2rss/auto_source/scraper.rb +0 -3
  13. data/lib/html2rss/auto_source.rb +2 -11
  14. data/lib/html2rss/category_extractor.rb +54 -20
  15. data/lib/html2rss/html_extractor/enclosure_extractor.rb +60 -89
  16. data/lib/html2rss/html_extractor/list_candidates.rb +2 -8
  17. data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +29 -12
  18. data/lib/html2rss/html_extractor/semantic_containers.rb +9 -35
  19. data/lib/html2rss/html_extractor.rb +51 -30
  20. data/lib/html2rss/rendering/description_builder.rb +3 -3
  21. data/lib/html2rss/rss_builder/article.rb +44 -23
  22. data/lib/html2rss/rss_builder/enclosure.rb +4 -2
  23. data/lib/html2rss/selectors/post_processors/sanitize_html.rb +25 -36
  24. data/lib/html2rss/selectors/post_processors/substring.rb +11 -18
  25. data/lib/html2rss/selectors/post_processors/template.rb +3 -2
  26. data/lib/html2rss/selectors.rb +18 -4
  27. data/lib/html2rss/url.rb +4 -3
  28. data/lib/html2rss/version.rb +1 -1
  29. metadata +3 -17
@@ -91,7 +91,6 @@ module Html2rss
91
91
  end
92
92
 
93
93
  ##
94
- # Shorthand method to get the sanitized HTML.
95
94
  # @param html [String]
96
95
  # @param url [String, Html2rss::Url]
97
96
  # @return [String, nil]
@@ -102,10 +101,34 @@ module Html2rss
102
101
  new(html, context).get
103
102
  end
104
103
 
104
+ ##
105
+ # @param channel_url [String, Html2rss::Url]
106
+ # @return [Hash] the memoized sanitize configuration
107
+ # rubocop:disable Metrics/MethodLength, ThreadSafety/ClassInstanceVariable
108
+ def self.sanitize_config(channel_url)
109
+ @sanitize_configs ||= {}
110
+ @sanitize_configs[channel_url] ||= begin
111
+ config = Sanitize::Config.merge(
112
+ Sanitize::Config::RELAXED,
113
+ attributes: { all: %w[dir lang alt title translate] },
114
+ add_attributes: TAG_ATTRIBUTES,
115
+ transformers: [
116
+ lambda { |env|
117
+ HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
118
+ },
119
+ ->(env) { HtmlTransformers::WrapImgInA.new.call(**env) }
120
+ ]
121
+ )
122
+ config[:elements].push('audio', 'video', 'source')
123
+ config.freeze
124
+ end
125
+ end
126
+ # rubocop:enable Metrics/MethodLength, ThreadSafety/ClassInstanceVariable
127
+
105
128
  ##
106
129
  # @return [String, nil]
107
130
  def get
108
- sanitized_html = Sanitize.fragment(value, sanitize_config).to_s
131
+ sanitized_html = Sanitize.fragment(value, self.class.sanitize_config(channel_url)).to_s
109
132
  sanitized_html.gsub!(/\s+/, ' ')
110
133
  sanitized_html.strip!
111
134
  sanitized_html.empty? ? nil : sanitized_html
@@ -114,40 +137,6 @@ module Html2rss
114
137
  private
115
138
 
116
139
  def channel_url = context.dig(:config, :channel, :url)
117
-
118
- ##
119
- # @return [Sanitize::Config]
120
- def sanitize_config # rubocop:disable Metrics/MethodLength
121
- config = Sanitize::Config.merge(
122
- Sanitize::Config::RELAXED,
123
- attributes: { all: %w[dir lang alt title translate] },
124
- add_attributes: TAG_ATTRIBUTES,
125
- transformers: [
126
- method(:transform_urls_to_absolute_ones),
127
- method(:wrap_img_in_a)
128
- ]
129
- )
130
- config[:elements].push('audio', 'video', 'source')
131
- config
132
- end
133
-
134
- ##
135
- # Wrapper for transform_urls_to_absolute_ones to pass the channel_url.
136
- #
137
- # @param env [Hash]
138
- # @return [nil]
139
- def transform_urls_to_absolute_ones(env)
140
- HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
141
- end
142
-
143
- ##
144
- # Wrapper for wrap_img_in_a.
145
- #
146
- # @param env [Hash]
147
- # @return [nil]
148
- def wrap_img_in_a(env)
149
- HtmlTransformers::WrapImgInA.new.call(**env)
150
- end
151
140
  end
152
141
  end
153
142
  end
@@ -34,19 +34,17 @@ module Html2rss
34
34
  # @param context [Selectors::Context] post-processor context
35
35
  # @return [void]
36
36
  def self.validate_args!(value, context)
37
- assert_type value, String, :value, context:
37
+ assert_type(value, String, :value, context:)
38
38
 
39
39
  options = context[:options]
40
- assert_type options[:start], Integer, :start, context:
41
-
42
- end_index = options[:end]
43
- assert_type(end_index, Integer, :end, context:) if end_index
40
+ assert_type(options[:start], Integer, :start, context:)
41
+ assert_type(options[:end], Integer, :end, context:) if options.key?(:end)
44
42
  end
45
43
 
46
44
  ##
47
45
  # Extracts the substring from the original string based on the provided start and end indices.
48
46
  #
49
- # @return [String] The extracted substring.
47
+ # @return [String, nil] The extracted substring.
50
48
  def get
51
49
  value[range]
52
50
  end
@@ -56,21 +54,16 @@ module Html2rss
56
54
  #
57
55
  # @return [Range] The range object representing the start and end/Infinity (integers).
58
56
  def range
59
- return (start_index..) unless end_index?
57
+ options = context[:options]
58
+ start = options[:start]
60
59
 
61
- if start_index == end_index
62
- raise ArgumentError,
63
- 'The `start` value must be unequal to the `end` value.'
64
- end
60
+ return (start..) unless options.key?(:end)
65
61
 
66
- (start_index..end_index)
67
- end
62
+ finish = options[:end]
63
+ raise ArgumentError, 'The `start` value must be unequal to the `end` value.' if start == finish
68
64
 
69
- private
70
-
71
- def end_index? = !context[:options][:end].to_s.empty?
72
- def end_index = context[:options][:end].to_i
73
- def start_index = context[:options][:start].to_i
65
+ (start..finish)
66
+ end
74
67
  end
75
68
  end
76
69
  end
@@ -29,7 +29,7 @@ module Html2rss
29
29
  # selector: h1
30
30
  # post_process:
31
31
  # name: template
32
- # string: '%{self} (%{price})'
32
+ # string: '`%{self}` (`%{price}`)'
33
33
  #
34
34
  # Would return:
35
35
  # 'Product (23,42€)'
@@ -54,12 +54,13 @@ module Html2rss
54
54
  @scraper = context[:scraper]
55
55
  @item = context[:item]
56
56
  @string = @options[:string].to_s
57
+ @getter = ->(key) { item_value(key) }
57
58
  end
58
59
 
59
60
  ##
60
61
  # @return [String]
61
62
  def get
62
- Html2rss::Config::DynamicParams.call(@string, {}, getter: method(:item_value), replace_missing_with: '')
63
+ Html2rss::Config::DynamicParams.call(@string, {}, getter: @getter, replace_missing_with: '')
63
64
  end
64
65
 
65
66
  private
@@ -199,7 +199,10 @@ module Html2rss
199
199
  end
200
200
 
201
201
  def select_regular(_name, item:, config:, base_url:)
202
- value = Extractors.get(config.merge(channel: channel_context(base_url)), item)
202
+ @merged_configs ||= {}
203
+ merged_config = @merged_configs[[config.object_id, base_url]] ||=
204
+ config.merge(channel: channel_context(base_url)).freeze
205
+ value = Extractors.get(merged_config, item)
203
206
 
204
207
  if value && (post_process_steps = config[:post_process])
205
208
  steps = post_process_steps.is_a?(Array) ? post_process_steps : [post_process_steps]
@@ -210,8 +213,9 @@ module Html2rss
210
213
  end
211
214
 
212
215
  def post_process(item, value, post_process_steps, base_url:)
216
+ pp_context = channel_post_process_context(base_url)
213
217
  post_process_steps.each do |options|
214
- context = Context.new(config: { channel: { url: base_url, time_zone: @time_zone } },
218
+ context = Context.new(config: pp_context,
215
219
  item:, scraper: self, options:)
216
220
 
217
221
  value = PostProcessors.get(options[:name], value, context)
@@ -262,7 +266,11 @@ module Html2rss
262
266
  end
263
267
 
264
268
  def category_node_options(selector_config, base_url:)
265
- selector_config.merge(channel: channel_context(base_url), selector: nil)
269
+ @category_node_configs ||= {}
270
+ @category_node_configs[[selector_config.object_id, base_url]] ||= selector_config.merge(
271
+ channel: channel_context(base_url),
272
+ selector: nil
273
+ ).freeze
266
274
  end
267
275
 
268
276
  def apply_post_process_steps(item:, value:, post_process_steps:, base_url:)
@@ -288,7 +296,13 @@ module Html2rss
288
296
  end
289
297
 
290
298
  def channel_context(base_url)
291
- { url: base_url, time_zone: @time_zone }
299
+ @channel_contexts ||= {}
300
+ @channel_contexts[base_url] ||= { url: base_url, time_zone: @time_zone }.freeze
301
+ end
302
+
303
+ def channel_post_process_context(base_url)
304
+ @channel_pp_contexts ||= {}
305
+ @channel_pp_contexts[base_url] ||= { channel: channel_context(base_url) }.freeze
292
306
  end
293
307
 
294
308
  # @return [Hash] enclosure details.
data/lib/html2rss/url.rb CHANGED
@@ -54,8 +54,8 @@ module Html2rss
54
54
  # @param raw_url [String] the raw URL string to sanitize
55
55
  # @return [Url, nil] the sanitized URL, or nil if no valid URL found
56
56
  def self.sanitize(raw_url)
57
- matched_urls = raw_url.to_s.scan(%r{(?:(?:https?|ftp|mailto)://|mailto:)[^\s<>"]+})
58
- url = matched_urls.first.to_s.strip
57
+ match = raw_url.to_s.match(%r{(?:(?:https?|ftp|mailto)://|mailto:)[^\s<>"]+})
58
+ url = match ? match[0].strip : ''
59
59
  return nil if url.empty?
60
60
 
61
61
  new(Addressable::URI.parse(url).normalize)
@@ -125,6 +125,7 @@ module Html2rss
125
125
  # @param uri [Addressable::URI] the underlying Addressable::URI object (internal use only)
126
126
  def initialize(uri)
127
127
  @uri = uri.freeze
128
+ @path_segments = @uri.path.to_s.split('/').reject(&:empty?).freeze
128
129
  freeze
129
130
  end
130
131
 
@@ -162,7 +163,7 @@ module Html2rss
162
163
  # Returns the URL path split into non-empty segments.
163
164
  #
164
165
  # @return [Array<String>] normalized path segments
165
- def path_segments = @uri.path.to_s.split('/').reject(&:empty?)
166
+ attr_reader :path_segments
166
167
 
167
168
  ##
168
169
  # Returns a copy of the URL with the provided path.
@@ -4,6 +4,6 @@
4
4
  # The Html2rss namespace.
5
5
  module Html2rss
6
6
  # Current application version.
7
- VERSION = '0.20.1'
7
+ VERSION = '0.21.0'
8
8
  public_constant :VERSION
9
9
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.20.1
4
+ version: 0.21.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
@@ -147,20 +147,6 @@ dependencies:
147
147
  - - "<"
148
148
  - !ruby/object:Gem::Version
149
149
  version: '2.0'
150
- - !ruby/object:Gem::Dependency
151
- name: parallel
152
- requirement: !ruby/object:Gem::Requirement
153
- requirements:
154
- - - ">="
155
- - !ruby/object:Gem::Version
156
- version: '0'
157
- type: :runtime
158
- prerelease: false
159
- version_requirements: !ruby/object:Gem::Requirement
160
- requirements:
161
- - - ">="
162
- - !ruby/object:Gem::Version
163
- version: '0'
164
150
  - !ruby/object:Gem::Dependency
165
151
  name: puppeteer-ruby
166
152
  requirement: !ruby/object:Gem::Requirement
@@ -390,7 +376,7 @@ licenses:
390
376
  - MIT
391
377
  metadata:
392
378
  allowed_push_host: https://rubygems.org
393
- changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.20.1
379
+ changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.21.0
394
380
  rubygems_mfa_required: 'true'
395
381
  rdoc_options: []
396
382
  require_paths:
@@ -399,7 +385,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
399
385
  requirements:
400
386
  - - ">="
401
387
  - !ruby/object:Gem::Version
402
- version: '3.2'
388
+ version: '3.3'
403
389
  required_rubygems_version: !ruby/object:Gem::Requirement
404
390
  requirements:
405
391
  - - ">="