html2rss 0.20.1 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/html2rss.gemspec +1 -2
- data/lib/html2rss/auto_source/scraper/html.rb +61 -16
- data/lib/html2rss/auto_source/scraper/json_state.rb +40 -27
- data/lib/html2rss/auto_source/scraper/link_heuristics.rb +85 -131
- data/lib/html2rss/auto_source/scraper/schema/category_extractor.rb +74 -28
- data/lib/html2rss/auto_source/scraper/schema/list_item.rb +3 -2
- data/lib/html2rss/auto_source/scraper/schema/thing.rb +31 -60
- data/lib/html2rss/auto_source/scraper/schema.rb +8 -2
- data/lib/html2rss/auto_source/scraper/semantic_html/deduplicator.rb +4 -18
- data/lib/html2rss/auto_source/scraper/semantic_html.rb +55 -11
- data/lib/html2rss/auto_source/scraper.rb +0 -3
- data/lib/html2rss/auto_source.rb +2 -11
- data/lib/html2rss/category_extractor.rb +54 -20
- data/lib/html2rss/html_extractor/enclosure_extractor.rb +60 -89
- data/lib/html2rss/html_extractor/list_candidates.rb +2 -8
- data/lib/html2rss/html_extractor/semantic_anchor_candidates.rb +29 -12
- data/lib/html2rss/html_extractor/semantic_containers.rb +9 -35
- data/lib/html2rss/html_extractor.rb +51 -30
- data/lib/html2rss/rendering/description_builder.rb +3 -3
- data/lib/html2rss/rss_builder/article.rb +44 -23
- data/lib/html2rss/rss_builder/enclosure.rb +4 -2
- data/lib/html2rss/selectors/post_processors/sanitize_html.rb +25 -36
- data/lib/html2rss/selectors/post_processors/substring.rb +11 -18
- data/lib/html2rss/selectors/post_processors/template.rb +3 -2
- data/lib/html2rss/selectors.rb +18 -4
- data/lib/html2rss/url.rb +4 -3
- data/lib/html2rss/version.rb +1 -1
- metadata +3 -17
|
@@ -91,7 +91,6 @@ module Html2rss
|
|
|
91
91
|
end
|
|
92
92
|
|
|
93
93
|
##
|
|
94
|
-
# Shorthand method to get the sanitized HTML.
|
|
95
94
|
# @param html [String]
|
|
96
95
|
# @param url [String, Html2rss::Url]
|
|
97
96
|
# @return [String, nil]
|
|
@@ -102,10 +101,34 @@ module Html2rss
|
|
|
102
101
|
new(html, context).get
|
|
103
102
|
end
|
|
104
103
|
|
|
104
|
+
##
|
|
105
|
+
# @param channel_url [String, Html2rss::Url]
|
|
106
|
+
# @return [Hash] the memoized sanitize configuration
|
|
107
|
+
# rubocop:disable Metrics/MethodLength, ThreadSafety/ClassInstanceVariable
|
|
108
|
+
def self.sanitize_config(channel_url)
|
|
109
|
+
@sanitize_configs ||= {}
|
|
110
|
+
@sanitize_configs[channel_url] ||= begin
|
|
111
|
+
config = Sanitize::Config.merge(
|
|
112
|
+
Sanitize::Config::RELAXED,
|
|
113
|
+
attributes: { all: %w[dir lang alt title translate] },
|
|
114
|
+
add_attributes: TAG_ATTRIBUTES,
|
|
115
|
+
transformers: [
|
|
116
|
+
lambda { |env|
|
|
117
|
+
HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
|
|
118
|
+
},
|
|
119
|
+
->(env) { HtmlTransformers::WrapImgInA.new.call(**env) }
|
|
120
|
+
]
|
|
121
|
+
)
|
|
122
|
+
config[:elements].push('audio', 'video', 'source')
|
|
123
|
+
config.freeze
|
|
124
|
+
end
|
|
125
|
+
end
|
|
126
|
+
# rubocop:enable Metrics/MethodLength, ThreadSafety/ClassInstanceVariable
|
|
127
|
+
|
|
105
128
|
##
|
|
106
129
|
# @return [String, nil]
|
|
107
130
|
def get
|
|
108
|
-
sanitized_html = Sanitize.fragment(value, sanitize_config).to_s
|
|
131
|
+
sanitized_html = Sanitize.fragment(value, self.class.sanitize_config(channel_url)).to_s
|
|
109
132
|
sanitized_html.gsub!(/\s+/, ' ')
|
|
110
133
|
sanitized_html.strip!
|
|
111
134
|
sanitized_html.empty? ? nil : sanitized_html
|
|
@@ -114,40 +137,6 @@ module Html2rss
|
|
|
114
137
|
private
|
|
115
138
|
|
|
116
139
|
def channel_url = context.dig(:config, :channel, :url)
|
|
117
|
-
|
|
118
|
-
##
|
|
119
|
-
# @return [Sanitize::Config]
|
|
120
|
-
def sanitize_config # rubocop:disable Metrics/MethodLength
|
|
121
|
-
config = Sanitize::Config.merge(
|
|
122
|
-
Sanitize::Config::RELAXED,
|
|
123
|
-
attributes: { all: %w[dir lang alt title translate] },
|
|
124
|
-
add_attributes: TAG_ATTRIBUTES,
|
|
125
|
-
transformers: [
|
|
126
|
-
method(:transform_urls_to_absolute_ones),
|
|
127
|
-
method(:wrap_img_in_a)
|
|
128
|
-
]
|
|
129
|
-
)
|
|
130
|
-
config[:elements].push('audio', 'video', 'source')
|
|
131
|
-
config
|
|
132
|
-
end
|
|
133
|
-
|
|
134
|
-
##
|
|
135
|
-
# Wrapper for transform_urls_to_absolute_ones to pass the channel_url.
|
|
136
|
-
#
|
|
137
|
-
# @param env [Hash]
|
|
138
|
-
# @return [nil]
|
|
139
|
-
def transform_urls_to_absolute_ones(env)
|
|
140
|
-
HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
|
|
141
|
-
end
|
|
142
|
-
|
|
143
|
-
##
|
|
144
|
-
# Wrapper for wrap_img_in_a.
|
|
145
|
-
#
|
|
146
|
-
# @param env [Hash]
|
|
147
|
-
# @return [nil]
|
|
148
|
-
def wrap_img_in_a(env)
|
|
149
|
-
HtmlTransformers::WrapImgInA.new.call(**env)
|
|
150
|
-
end
|
|
151
140
|
end
|
|
152
141
|
end
|
|
153
142
|
end
|
|
@@ -34,19 +34,17 @@ module Html2rss
|
|
|
34
34
|
# @param context [Selectors::Context] post-processor context
|
|
35
35
|
# @return [void]
|
|
36
36
|
def self.validate_args!(value, context)
|
|
37
|
-
assert_type
|
|
37
|
+
assert_type(value, String, :value, context:)
|
|
38
38
|
|
|
39
39
|
options = context[:options]
|
|
40
|
-
assert_type
|
|
41
|
-
|
|
42
|
-
end_index = options[:end]
|
|
43
|
-
assert_type(end_index, Integer, :end, context:) if end_index
|
|
40
|
+
assert_type(options[:start], Integer, :start, context:)
|
|
41
|
+
assert_type(options[:end], Integer, :end, context:) if options.key?(:end)
|
|
44
42
|
end
|
|
45
43
|
|
|
46
44
|
##
|
|
47
45
|
# Extracts the substring from the original string based on the provided start and end indices.
|
|
48
46
|
#
|
|
49
|
-
# @return [String] The extracted substring.
|
|
47
|
+
# @return [String, nil] The extracted substring.
|
|
50
48
|
def get
|
|
51
49
|
value[range]
|
|
52
50
|
end
|
|
@@ -56,21 +54,16 @@ module Html2rss
|
|
|
56
54
|
#
|
|
57
55
|
# @return [Range] The range object representing the start and end/Infinity (integers).
|
|
58
56
|
def range
|
|
59
|
-
|
|
57
|
+
options = context[:options]
|
|
58
|
+
start = options[:start]
|
|
60
59
|
|
|
61
|
-
|
|
62
|
-
raise ArgumentError,
|
|
63
|
-
'The `start` value must be unequal to the `end` value.'
|
|
64
|
-
end
|
|
60
|
+
return (start..) unless options.key?(:end)
|
|
65
61
|
|
|
66
|
-
|
|
67
|
-
|
|
62
|
+
finish = options[:end]
|
|
63
|
+
raise ArgumentError, 'The `start` value must be unequal to the `end` value.' if start == finish
|
|
68
64
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def end_index? = !context[:options][:end].to_s.empty?
|
|
72
|
-
def end_index = context[:options][:end].to_i
|
|
73
|
-
def start_index = context[:options][:start].to_i
|
|
65
|
+
(start..finish)
|
|
66
|
+
end
|
|
74
67
|
end
|
|
75
68
|
end
|
|
76
69
|
end
|
|
@@ -29,7 +29,7 @@ module Html2rss
|
|
|
29
29
|
# selector: h1
|
|
30
30
|
# post_process:
|
|
31
31
|
# name: template
|
|
32
|
-
# string: '
|
|
32
|
+
# string: '`%{self}` (`%{price}`)'
|
|
33
33
|
#
|
|
34
34
|
# Would return:
|
|
35
35
|
# 'Product (23,42€)'
|
|
@@ -54,12 +54,13 @@ module Html2rss
|
|
|
54
54
|
@scraper = context[:scraper]
|
|
55
55
|
@item = context[:item]
|
|
56
56
|
@string = @options[:string].to_s
|
|
57
|
+
@getter = ->(key) { item_value(key) }
|
|
57
58
|
end
|
|
58
59
|
|
|
59
60
|
##
|
|
60
61
|
# @return [String]
|
|
61
62
|
def get
|
|
62
|
-
Html2rss::Config::DynamicParams.call(@string, {}, getter:
|
|
63
|
+
Html2rss::Config::DynamicParams.call(@string, {}, getter: @getter, replace_missing_with: '')
|
|
63
64
|
end
|
|
64
65
|
|
|
65
66
|
private
|
data/lib/html2rss/selectors.rb
CHANGED
|
@@ -199,7 +199,10 @@ module Html2rss
|
|
|
199
199
|
end
|
|
200
200
|
|
|
201
201
|
def select_regular(_name, item:, config:, base_url:)
|
|
202
|
-
|
|
202
|
+
@merged_configs ||= {}
|
|
203
|
+
merged_config = @merged_configs[[config.object_id, base_url]] ||=
|
|
204
|
+
config.merge(channel: channel_context(base_url)).freeze
|
|
205
|
+
value = Extractors.get(merged_config, item)
|
|
203
206
|
|
|
204
207
|
if value && (post_process_steps = config[:post_process])
|
|
205
208
|
steps = post_process_steps.is_a?(Array) ? post_process_steps : [post_process_steps]
|
|
@@ -210,8 +213,9 @@ module Html2rss
|
|
|
210
213
|
end
|
|
211
214
|
|
|
212
215
|
def post_process(item, value, post_process_steps, base_url:)
|
|
216
|
+
pp_context = channel_post_process_context(base_url)
|
|
213
217
|
post_process_steps.each do |options|
|
|
214
|
-
context = Context.new(config:
|
|
218
|
+
context = Context.new(config: pp_context,
|
|
215
219
|
item:, scraper: self, options:)
|
|
216
220
|
|
|
217
221
|
value = PostProcessors.get(options[:name], value, context)
|
|
@@ -262,7 +266,11 @@ module Html2rss
|
|
|
262
266
|
end
|
|
263
267
|
|
|
264
268
|
def category_node_options(selector_config, base_url:)
|
|
265
|
-
|
|
269
|
+
@category_node_configs ||= {}
|
|
270
|
+
@category_node_configs[[selector_config.object_id, base_url]] ||= selector_config.merge(
|
|
271
|
+
channel: channel_context(base_url),
|
|
272
|
+
selector: nil
|
|
273
|
+
).freeze
|
|
266
274
|
end
|
|
267
275
|
|
|
268
276
|
def apply_post_process_steps(item:, value:, post_process_steps:, base_url:)
|
|
@@ -288,7 +296,13 @@ module Html2rss
|
|
|
288
296
|
end
|
|
289
297
|
|
|
290
298
|
def channel_context(base_url)
|
|
291
|
-
|
|
299
|
+
@channel_contexts ||= {}
|
|
300
|
+
@channel_contexts[base_url] ||= { url: base_url, time_zone: @time_zone }.freeze
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
def channel_post_process_context(base_url)
|
|
304
|
+
@channel_pp_contexts ||= {}
|
|
305
|
+
@channel_pp_contexts[base_url] ||= { channel: channel_context(base_url) }.freeze
|
|
292
306
|
end
|
|
293
307
|
|
|
294
308
|
# @return [Hash] enclosure details.
|
data/lib/html2rss/url.rb
CHANGED
|
@@ -54,8 +54,8 @@ module Html2rss
|
|
|
54
54
|
# @param raw_url [String] the raw URL string to sanitize
|
|
55
55
|
# @return [Url, nil] the sanitized URL, or nil if no valid URL found
|
|
56
56
|
def self.sanitize(raw_url)
|
|
57
|
-
|
|
58
|
-
url =
|
|
57
|
+
match = raw_url.to_s.match(%r{(?:(?:https?|ftp|mailto)://|mailto:)[^\s<>"]+})
|
|
58
|
+
url = match ? match[0].strip : ''
|
|
59
59
|
return nil if url.empty?
|
|
60
60
|
|
|
61
61
|
new(Addressable::URI.parse(url).normalize)
|
|
@@ -125,6 +125,7 @@ module Html2rss
|
|
|
125
125
|
# @param uri [Addressable::URI] the underlying Addressable::URI object (internal use only)
|
|
126
126
|
def initialize(uri)
|
|
127
127
|
@uri = uri.freeze
|
|
128
|
+
@path_segments = @uri.path.to_s.split('/').reject(&:empty?).freeze
|
|
128
129
|
freeze
|
|
129
130
|
end
|
|
130
131
|
|
|
@@ -162,7 +163,7 @@ module Html2rss
|
|
|
162
163
|
# Returns the URL path split into non-empty segments.
|
|
163
164
|
#
|
|
164
165
|
# @return [Array<String>] normalized path segments
|
|
165
|
-
|
|
166
|
+
attr_reader :path_segments
|
|
166
167
|
|
|
167
168
|
##
|
|
168
169
|
# Returns a copy of the URL with the provided path.
|
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html2rss
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.20.1
|
|
4
|
+
version: 0.21.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Gil Desmarais
|
|
@@ -147,20 +147,6 @@ dependencies:
|
|
|
147
147
|
- - "<"
|
|
148
148
|
- !ruby/object:Gem::Version
|
|
149
149
|
version: '2.0'
|
|
150
|
-
- !ruby/object:Gem::Dependency
|
|
151
|
-
name: parallel
|
|
152
|
-
requirement: !ruby/object:Gem::Requirement
|
|
153
|
-
requirements:
|
|
154
|
-
- - ">="
|
|
155
|
-
- !ruby/object:Gem::Version
|
|
156
|
-
version: '0'
|
|
157
|
-
type: :runtime
|
|
158
|
-
prerelease: false
|
|
159
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
160
|
-
requirements:
|
|
161
|
-
- - ">="
|
|
162
|
-
- !ruby/object:Gem::Version
|
|
163
|
-
version: '0'
|
|
164
150
|
- !ruby/object:Gem::Dependency
|
|
165
151
|
name: puppeteer-ruby
|
|
166
152
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -390,7 +376,7 @@ licenses:
|
|
|
390
376
|
- MIT
|
|
391
377
|
metadata:
|
|
392
378
|
allowed_push_host: https://rubygems.org
|
|
393
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.20.1
|
|
379
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.21.0
|
|
394
380
|
rubygems_mfa_required: 'true'
|
|
395
381
|
rdoc_options: []
|
|
396
382
|
require_paths:
|
|
@@ -399,7 +385,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
399
385
|
requirements:
|
|
400
386
|
- - ">="
|
|
401
387
|
- !ruby/object:Gem::Version
|
|
402
|
-
version: '3.
|
|
388
|
+
version: '3.3'
|
|
403
389
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
404
390
|
requirements:
|
|
405
391
|
- - ">="
|