html2rss 0.6.0 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +122 -17
- data/.travis.yml +3 -3
- data/CHANGELOG.md +97 -42
- data/Gemfile +2 -0
- data/Gemfile.lock +84 -53
- data/README.md +461 -47
- data/html2rss.gemspec +11 -7
- data/lib/html2rss.rb +5 -4
- data/lib/html2rss/attribute_post_processors.rb +4 -10
- data/lib/html2rss/attribute_post_processors/gsub.rb +42 -0
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +45 -0
- data/lib/html2rss/attribute_post_processors/parse_time.rb +1 -1
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +3 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +42 -22
- data/lib/html2rss/attribute_post_processors/substring.rb +11 -5
- data/lib/html2rss/attribute_post_processors/template.rb +23 -14
- data/lib/html2rss/config.rb +40 -20
- data/lib/html2rss/feed_builder.rb +42 -20
- data/lib/html2rss/item.rb +24 -18
- data/lib/html2rss/item_extractors.rb +6 -13
- data/lib/html2rss/item_extractors/attribute.rb +1 -1
- data/lib/html2rss/item_extractors/href.rb +2 -2
- data/lib/html2rss/item_extractors/static.rb +2 -2
- data/lib/html2rss/utils.rb +18 -12
- data/lib/html2rss/version.rb +2 -1
- metadata +88 -23
data/html2rss.gemspec
CHANGED
@@ -12,7 +12,7 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.description = 'Give the URL to scrape and some CSS selectors. Get a RSS::Rss instance in return.'
|
13
13
|
spec.homepage = 'https://github.com/gildesmarais/html2rss'
|
14
14
|
spec.license = 'MIT'
|
15
|
-
spec.required_ruby_version = '>= 2.
|
15
|
+
spec.required_ruby_version = '>= 2.5.0'
|
16
16
|
|
17
17
|
if spec.respond_to?(:metadata)
|
18
18
|
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
@@ -29,15 +29,19 @@ Gem::Specification.new do |spec|
|
|
29
29
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
30
30
|
spec.require_paths = ['lib']
|
31
31
|
|
32
|
-
spec.add_dependency 'activesupport', '
|
32
|
+
spec.add_dependency 'activesupport', '>= 5', '< 7'
|
33
|
+
spec.add_dependency 'addressable', '~> 2.7'
|
33
34
|
spec.add_dependency 'builder'
|
34
|
-
spec.add_dependency 'faraday', '~> 0
|
35
|
-
spec.add_dependency 'faraday_middleware'
|
36
|
-
spec.add_dependency '
|
35
|
+
spec.add_dependency 'faraday', '~> 1.0'
|
36
|
+
spec.add_dependency 'faraday_middleware'
|
37
|
+
spec.add_dependency 'kramdown'
|
38
|
+
spec.add_dependency 'mime-types', '> 3.0'
|
37
39
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
38
|
-
spec.add_dependency 'reverse_markdown', '~>
|
40
|
+
spec.add_dependency 'reverse_markdown', '~> 2.0'
|
39
41
|
spec.add_dependency 'sanitize', '~> 5.0'
|
40
|
-
spec.
|
42
|
+
spec.add_dependency 'to_regexp'
|
43
|
+
spec.add_dependency 'zeitwerk'
|
44
|
+
spec.add_development_dependency 'bundler'
|
41
45
|
spec.add_development_dependency 'byebug'
|
42
46
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
43
47
|
spec.add_development_dependency 'rubocop'
|
data/lib/html2rss.rb
CHANGED
@@ -1,19 +1,13 @@
|
|
1
|
-
require_relative 'attribute_post_processors/html_to_markdown'
|
2
|
-
require_relative 'attribute_post_processors/parse_time'
|
3
|
-
require_relative 'attribute_post_processors/parse_uri'
|
4
|
-
require_relative 'attribute_post_processors/sanitize_html'
|
5
|
-
require_relative 'attribute_post_processors/substring'
|
6
|
-
require_relative 'attribute_post_processors/template'
|
7
|
-
|
8
1
|
module Html2rss
|
9
2
|
##
|
10
3
|
# Provides a namespace for attribute post processors.
|
11
4
|
module AttributePostProcessors
|
12
5
|
def self.get_processor(name)
|
13
|
-
|
14
|
-
|
6
|
+
@get_processor ||= Hash.new do |processors, key|
|
7
|
+
processors[key] = Utils.get_class_from_name(key, 'AttributePostProcessors')
|
8
|
+
end
|
15
9
|
|
16
|
-
|
10
|
+
@get_processor[name]
|
17
11
|
end
|
18
12
|
end
|
19
13
|
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'to_regexp'
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
##
|
6
|
+
#
|
7
|
+
# Imagine this HTML:
|
8
|
+
# <h1>Foo bar and boo</h1>
|
9
|
+
#
|
10
|
+
# YAML usage example:
|
11
|
+
# selectors:
|
12
|
+
# title:
|
13
|
+
# selector: h1
|
14
|
+
# post_process:
|
15
|
+
# name: gsub
|
16
|
+
# pattern: boo
|
17
|
+
# replacement: baz
|
18
|
+
#
|
19
|
+
# Would return:
|
20
|
+
# 'Foo bar and baz'
|
21
|
+
#
|
22
|
+
# `pattern` can be a Regexp or a String.
|
23
|
+
#
|
24
|
+
# `replacement` can be a String or a Hash.
|
25
|
+
#
|
26
|
+
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
27
|
+
class Gsub
|
28
|
+
def initialize(value, env)
|
29
|
+
@value = value
|
30
|
+
options = env[:options]
|
31
|
+
@pattern = options[:pattern].to_regexp || options[:pattern]
|
32
|
+
@replacement = options[:replacement]
|
33
|
+
end
|
34
|
+
|
35
|
+
##
|
36
|
+
# @return [String]
|
37
|
+
def get
|
38
|
+
@value.to_s.gsub(@pattern, @replacement)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'kramdown'
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
##
|
6
|
+
# Generates HTML from Markdown.
|
7
|
+
#
|
8
|
+
# It's particularly useful in conjunction with the Template post processor
|
9
|
+
# to generate a description from other selectors.
|
10
|
+
#
|
11
|
+
# YAML usage example:
|
12
|
+
#
|
13
|
+
# selectors:
|
14
|
+
# description:
|
15
|
+
# selector: section
|
16
|
+
# post_process:
|
17
|
+
# - name: template
|
18
|
+
# string: |
|
19
|
+
# # %s
|
20
|
+
#
|
21
|
+
# Price: %s
|
22
|
+
# methods:
|
23
|
+
# - self
|
24
|
+
# - price
|
25
|
+
# - name: markdown_to_html
|
26
|
+
#
|
27
|
+
# Would e.g. return:
|
28
|
+
#
|
29
|
+
# <h1>Section</h1>
|
30
|
+
#
|
31
|
+
# <p>Price: 12.34</p>
|
32
|
+
class MarkdownToHtml
|
33
|
+
def initialize(value, env)
|
34
|
+
@value = value
|
35
|
+
@env = env
|
36
|
+
end
|
37
|
+
|
38
|
+
##
|
39
|
+
# @return [String] formatted in HTML
|
40
|
+
def get
|
41
|
+
SanitizeHtml.new(Kramdown::Document.new(@value).to_html, @env).get
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -17,12 +17,12 @@ module Html2rss
|
|
17
17
|
# selector: span
|
18
18
|
# post_process:
|
19
19
|
# name: 'parse_time'
|
20
|
+
# time_zone: 'Europe/Berlin'
|
20
21
|
#
|
21
22
|
# Would return:
|
22
23
|
# "Tue, 02 Jul 2019 00:00:00 +0200"
|
23
24
|
#
|
24
25
|
# It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
|
25
|
-
# As of now it ignores time zones and always falls back to the UTC time zone.
|
26
26
|
class ParseTime
|
27
27
|
def initialize(value, env)
|
28
28
|
@value = value.to_s
|
@@ -5,7 +5,7 @@ module Html2rss
|
|
5
5
|
#
|
6
6
|
# Imagine this HTML structure:
|
7
7
|
#
|
8
|
-
# <span>http://why-not-use-a-link.uh</span>
|
8
|
+
# <span>http://why-not-use-a-link.uh </span>
|
9
9
|
#
|
10
10
|
# YAML usage example:
|
11
11
|
#
|
@@ -15,6 +15,7 @@ module Html2rss
|
|
15
15
|
# extractor: text
|
16
16
|
# post_process:
|
17
17
|
# name: parse_uri
|
18
|
+
#
|
18
19
|
# Would return:
|
19
20
|
# 'http://why-not-use-a-link.uh'
|
20
21
|
class ParseUri
|
@@ -25,7 +26,7 @@ module Html2rss
|
|
25
26
|
##
|
26
27
|
# @return [String]
|
27
28
|
def get
|
28
|
-
URI(@value).to_s
|
29
|
+
URI(Html2rss::Utils.sanitize_url(@value)).to_s
|
29
30
|
end
|
30
31
|
end
|
31
32
|
end
|
@@ -4,10 +4,16 @@ module Html2rss
|
|
4
4
|
module AttributePostProcessors
|
5
5
|
##
|
6
6
|
# Returns sanitized HTML code as String.
|
7
|
-
# Adds
|
8
7
|
#
|
9
|
-
#
|
10
|
-
#
|
8
|
+
# It adds:
|
9
|
+
#
|
10
|
+
# - `rel="nofollow noopener noreferrer"` to <a> tags
|
11
|
+
# - `referrer-policy='no-referrer'` to <img> tags
|
12
|
+
#
|
13
|
+
# It also:
|
14
|
+
#
|
15
|
+
# - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
|
16
|
+
# linking to the <img>'s `src`.
|
11
17
|
#
|
12
18
|
# Imagine this HTML structure:
|
13
19
|
#
|
@@ -21,7 +27,7 @@ module Html2rss
|
|
21
27
|
#
|
22
28
|
# selectors:
|
23
29
|
# description:
|
24
|
-
# selector: section
|
30
|
+
# selector: '.section'
|
25
31
|
# extractor: html
|
26
32
|
# post_process:
|
27
33
|
# name: sanitize_html
|
@@ -29,6 +35,9 @@ module Html2rss
|
|
29
35
|
# Would return:
|
30
36
|
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
31
37
|
class SanitizeHtml
|
38
|
+
URL_ELEMENTS_WITH_URL_ATTRIBUTE = { 'a' => :href, 'img' => :src }.freeze
|
39
|
+
private_constant :URL_ELEMENTS_WITH_URL_ATTRIBUTE
|
40
|
+
|
32
41
|
def initialize(value, env)
|
33
42
|
@value = value
|
34
43
|
@channel_url = env[:config].url
|
@@ -41,28 +50,22 @@ module Html2rss
|
|
41
50
|
# - adds target="_blank" to a elements
|
42
51
|
# @return [String]
|
43
52
|
def get
|
44
|
-
Sanitize.fragment(@value,
|
45
|
-
Sanitize::Config::RELAXED,
|
46
|
-
attributes: { all: %w[dir lang alt title translate] },
|
47
|
-
add_attributes: {
|
48
|
-
'a' => {
|
49
|
-
'rel' => 'nofollow noopener noreferrer',
|
50
|
-
'target' => '_blank'
|
51
|
-
},
|
52
|
-
'img' => {
|
53
|
-
'referrer-policy' => 'no-referrer'
|
54
|
-
}
|
55
|
-
},
|
56
|
-
transformers: [transform_urls_to_absolute_ones]
|
57
|
-
)).to_s.split.join(' ')
|
53
|
+
Sanitize.fragment(@value, sanitize_config).to_s.split.join(' ')
|
58
54
|
end
|
59
55
|
|
60
56
|
private
|
61
57
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
58
|
+
def sanitize_config
|
59
|
+
Sanitize::Config.merge(
|
60
|
+
Sanitize::Config::RELAXED,
|
61
|
+
attributes: { all: %w[dir lang alt title translate] },
|
62
|
+
add_attributes: {
|
63
|
+
'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
|
64
|
+
'img' => { 'referrer-policy' => 'no-referrer' }
|
65
|
+
},
|
66
|
+
transformers: [transform_urls_to_absolute_ones, wrap_img_in_a]
|
67
|
+
)
|
68
|
+
end
|
66
69
|
|
67
70
|
def transform_urls_to_absolute_ones
|
68
71
|
lambda do |env|
|
@@ -78,6 +81,23 @@ module Html2rss
|
|
78
81
|
env[:node][url_attribute] = absolute_url
|
79
82
|
end
|
80
83
|
end
|
84
|
+
|
85
|
+
def wrap_img_in_a
|
86
|
+
lambda do |env|
|
87
|
+
return if env[:node_name] != 'img'
|
88
|
+
|
89
|
+
img = env[:node]
|
90
|
+
|
91
|
+
return if img.parent.name == 'a'
|
92
|
+
|
93
|
+
anchor = Nokogiri::XML::Node.new('a', img)
|
94
|
+
anchor[:href] = img[:src]
|
95
|
+
|
96
|
+
anchor.add_child img.dup
|
97
|
+
|
98
|
+
img.replace(anchor)
|
99
|
+
end
|
100
|
+
end
|
81
101
|
end
|
82
102
|
end
|
83
103
|
end
|
@@ -2,9 +2,15 @@ module Html2rss
|
|
2
2
|
module AttributePostProcessors
|
3
3
|
## Returns a defined part of a String.
|
4
4
|
#
|
5
|
+
# Both parameters must be an Integer and they can be negative.
|
5
6
|
# The +end+ parameter can be omitted, in that case it will not cut the
|
6
7
|
# String at the end.
|
7
8
|
#
|
9
|
+
# A Regexp or a MatchString is not supported.
|
10
|
+
#
|
11
|
+
# See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
|
12
|
+
# documentation for more information.
|
13
|
+
#
|
8
14
|
# Imagine this HTML:
|
9
15
|
# <h1>Foo bar and baz</h1>
|
10
16
|
#
|
@@ -13,9 +19,9 @@ module Html2rss
|
|
13
19
|
# title:
|
14
20
|
# selector: h1
|
15
21
|
# post_process:
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
22
|
+
# name: substring
|
23
|
+
# start: 4
|
24
|
+
# end: 6
|
19
25
|
#
|
20
26
|
# Would return:
|
21
27
|
# 'bar'
|
@@ -28,8 +34,8 @@ module Html2rss
|
|
28
34
|
##
|
29
35
|
# @return [String]
|
30
36
|
def get
|
31
|
-
ending = @options.fetch(
|
32
|
-
@value[@options[
|
37
|
+
ending = @options.fetch(:end, @value.length).to_i
|
38
|
+
@value[@options[:start].to_i..ending]
|
33
39
|
end
|
34
40
|
end
|
35
41
|
end
|
@@ -4,7 +4,8 @@ module Html2rss
|
|
4
4
|
module AttributePostProcessors
|
5
5
|
## Returns a formatted String according to the string pattern.
|
6
6
|
#
|
7
|
-
# If +self+ is
|
7
|
+
# If +self+ is used, the selector's extracted value will be used.
|
8
|
+
# It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format)
|
8
9
|
#
|
9
10
|
# Imagine this HTML:
|
10
11
|
# <li>
|
@@ -22,11 +23,8 @@ module Html2rss
|
|
22
23
|
# title:
|
23
24
|
# selector: h1
|
24
25
|
# post_process:
|
25
|
-
#
|
26
|
-
#
|
27
|
-
# methods:
|
28
|
-
# - self
|
29
|
-
# - price
|
26
|
+
# name: template
|
27
|
+
# string: '%{self} (%{price})'
|
30
28
|
#
|
31
29
|
# Would return:
|
32
30
|
# 'Product (23,42€)'
|
@@ -35,25 +33,36 @@ module Html2rss
|
|
35
33
|
@value = value
|
36
34
|
@options = env[:options]
|
37
35
|
@item = env[:item]
|
36
|
+
@string = @options[:string]
|
38
37
|
end
|
39
38
|
|
40
39
|
##
|
41
|
-
# - uses {http://ruby-doc.org/core-2.6.3/String.html#method-i-25 String#%}
|
42
40
|
# @return [String]
|
43
41
|
def get
|
44
|
-
|
42
|
+
return format_string_with_methods if @options[:methods]
|
43
|
+
|
44
|
+
names = string.scan(/%[<|{](\w*)[>|}]/)
|
45
|
+
names.flatten!
|
46
|
+
names.compact!
|
47
|
+
names.map!(&:to_sym)
|
48
|
+
|
49
|
+
format(string, names.map { |name| [name, item_value(name)] }.to_h)
|
45
50
|
end
|
46
51
|
|
47
52
|
private
|
48
53
|
|
49
|
-
|
50
|
-
@options['string']
|
51
|
-
end
|
54
|
+
attr_reader :string
|
52
55
|
|
53
56
|
def methods
|
54
|
-
@methods ||= @options[
|
55
|
-
|
56
|
-
|
57
|
+
@methods ||= @options[:methods].map(&method(:item_value))
|
58
|
+
end
|
59
|
+
|
60
|
+
def format_string_with_methods
|
61
|
+
string % methods
|
62
|
+
end
|
63
|
+
|
64
|
+
def item_value(method_name)
|
65
|
+
method_name.to_sym == :self ? @value.to_s : @item.public_send(method_name).to_s
|
57
66
|
end
|
58
67
|
end
|
59
68
|
end
|
data/lib/html2rss/config.rb
CHANGED
@@ -1,71 +1,91 @@
|
|
1
|
+
require 'active_support/core_ext/hash'
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
##
|
3
5
|
# The Config class abstracts from the config data structure and
|
4
6
|
# provides default values.
|
5
7
|
class Config
|
6
8
|
def initialize(feed_config, global_config = {})
|
7
|
-
@global_config =
|
8
|
-
@feed_config =
|
9
|
-
@channel_config =
|
9
|
+
@global_config = global_config.deep_symbolize_keys
|
10
|
+
@feed_config = feed_config.deep_symbolize_keys
|
11
|
+
@channel_config = @feed_config.fetch(:channel, {})
|
10
12
|
end
|
11
13
|
|
12
14
|
def author
|
13
|
-
channel_config.fetch
|
15
|
+
channel_config.fetch :author, 'html2rss'
|
14
16
|
end
|
15
17
|
|
16
18
|
def ttl
|
17
|
-
channel_config.fetch
|
19
|
+
channel_config.fetch :ttl, 360
|
18
20
|
end
|
19
21
|
|
20
22
|
def title
|
21
|
-
channel_config.fetch
|
23
|
+
channel_config.fetch(:title) { generated_title }
|
24
|
+
end
|
25
|
+
|
26
|
+
def generated_title
|
27
|
+
uri = URI(url)
|
28
|
+
|
29
|
+
nicer_path = uri.path.split('/')
|
30
|
+
nicer_path.reject! { |part| part == '' }
|
31
|
+
|
32
|
+
nicer_path.any? ? "#{uri.host}: #{nicer_path.join(' ').titleize}" : uri.host
|
22
33
|
end
|
23
34
|
|
24
35
|
def language
|
25
|
-
channel_config.fetch
|
36
|
+
channel_config.fetch :language, 'en'
|
26
37
|
end
|
27
38
|
|
28
39
|
def description
|
29
|
-
channel_config.fetch
|
40
|
+
channel_config.fetch :description, "Latest items from #{url}."
|
30
41
|
end
|
31
42
|
|
32
43
|
def url
|
33
|
-
channel_config.dig
|
44
|
+
channel_config.dig :url
|
34
45
|
end
|
35
46
|
alias link url
|
36
47
|
|
37
48
|
def time_zone
|
38
|
-
channel_config.fetch
|
49
|
+
channel_config.fetch :time_zone, 'UTC'
|
39
50
|
end
|
40
51
|
|
41
52
|
def json?
|
42
|
-
channel_config.fetch
|
53
|
+
channel_config.fetch :json, false
|
43
54
|
end
|
44
55
|
|
45
56
|
def headers
|
46
|
-
global_config.fetch(
|
57
|
+
global_config.fetch(:headers, {}).merge(channel_config.fetch(:headers, {}))
|
47
58
|
end
|
48
59
|
|
49
60
|
def attribute_options(name)
|
50
|
-
feed_config.dig(
|
61
|
+
feed_config.dig(:selectors).fetch(name, {}).merge(channel: channel_config)
|
51
62
|
end
|
52
63
|
|
53
64
|
def attribute?(name)
|
54
|
-
attribute_names.include?(name
|
65
|
+
attribute_names.include?(name)
|
55
66
|
end
|
56
67
|
|
57
|
-
def
|
58
|
-
feed_config.dig(
|
68
|
+
def category_selectors
|
69
|
+
categories = feed_config.dig(:selectors, :categories)
|
70
|
+
return [] unless categories
|
71
|
+
|
72
|
+
categories = categories.keep_if { |category| category.to_s != '' }
|
73
|
+
categories.map!(&:to_sym)
|
74
|
+
categories.uniq!
|
75
|
+
|
76
|
+
categories
|
59
77
|
end
|
60
78
|
|
61
79
|
def selector(name)
|
62
|
-
feed_config.dig(
|
80
|
+
feed_config.dig(:selectors, name, :selector)
|
63
81
|
end
|
64
82
|
|
65
83
|
def attribute_names
|
66
|
-
@attribute_names ||= feed_config.fetch(
|
67
|
-
|
68
|
-
|
84
|
+
@attribute_names ||= feed_config.fetch(:selectors, {}).keys.tap { |attrs| attrs.delete(:items) }
|
85
|
+
end
|
86
|
+
|
87
|
+
def items_order
|
88
|
+
feed_config.dig(:selectors, :items, :order)&.to_sym
|
69
89
|
end
|
70
90
|
|
71
91
|
private
|