html2rss 0.6.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +122 -17
- data/.travis.yml +3 -3
- data/CHANGELOG.md +97 -42
- data/Gemfile +2 -0
- data/Gemfile.lock +84 -53
- data/README.md +461 -47
- data/html2rss.gemspec +11 -7
- data/lib/html2rss.rb +5 -4
- data/lib/html2rss/attribute_post_processors.rb +4 -10
- data/lib/html2rss/attribute_post_processors/gsub.rb +42 -0
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +45 -0
- data/lib/html2rss/attribute_post_processors/parse_time.rb +1 -1
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +3 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +42 -22
- data/lib/html2rss/attribute_post_processors/substring.rb +11 -5
- data/lib/html2rss/attribute_post_processors/template.rb +23 -14
- data/lib/html2rss/config.rb +40 -20
- data/lib/html2rss/feed_builder.rb +42 -20
- data/lib/html2rss/item.rb +24 -18
- data/lib/html2rss/item_extractors.rb +6 -13
- data/lib/html2rss/item_extractors/attribute.rb +1 -1
- data/lib/html2rss/item_extractors/href.rb +2 -2
- data/lib/html2rss/item_extractors/static.rb +2 -2
- data/lib/html2rss/utils.rb +18 -12
- data/lib/html2rss/version.rb +2 -1
- metadata +88 -23
data/html2rss.gemspec
CHANGED
@@ -12,7 +12,7 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.description = 'Give the URL to scrape and some CSS selectors. Get a RSS::Rss instance in return.'
|
13
13
|
spec.homepage = 'https://github.com/gildesmarais/html2rss'
|
14
14
|
spec.license = 'MIT'
|
15
|
-
spec.required_ruby_version = '>= 2.
|
15
|
+
spec.required_ruby_version = '>= 2.5.0'
|
16
16
|
|
17
17
|
if spec.respond_to?(:metadata)
|
18
18
|
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
@@ -29,15 +29,19 @@ Gem::Specification.new do |spec|
|
|
29
29
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
30
30
|
spec.require_paths = ['lib']
|
31
31
|
|
32
|
-
spec.add_dependency 'activesupport', '
|
32
|
+
spec.add_dependency 'activesupport', '>= 5', '< 7'
|
33
|
+
spec.add_dependency 'addressable', '~> 2.7'
|
33
34
|
spec.add_dependency 'builder'
|
34
|
-
spec.add_dependency 'faraday', '~> 0
|
35
|
-
spec.add_dependency 'faraday_middleware'
|
36
|
-
spec.add_dependency '
|
35
|
+
spec.add_dependency 'faraday', '~> 1.0'
|
36
|
+
spec.add_dependency 'faraday_middleware'
|
37
|
+
spec.add_dependency 'kramdown'
|
38
|
+
spec.add_dependency 'mime-types', '> 3.0'
|
37
39
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
38
|
-
spec.add_dependency 'reverse_markdown', '~>
|
40
|
+
spec.add_dependency 'reverse_markdown', '~> 2.0'
|
39
41
|
spec.add_dependency 'sanitize', '~> 5.0'
|
40
|
-
spec.
|
42
|
+
spec.add_dependency 'to_regexp'
|
43
|
+
spec.add_dependency 'zeitwerk'
|
44
|
+
spec.add_development_dependency 'bundler'
|
41
45
|
spec.add_development_dependency 'byebug'
|
42
46
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
43
47
|
spec.add_development_dependency 'rubocop'
|
data/lib/html2rss.rb
CHANGED
@@ -1,19 +1,13 @@
|
|
1
|
-
require_relative 'attribute_post_processors/html_to_markdown'
|
2
|
-
require_relative 'attribute_post_processors/parse_time'
|
3
|
-
require_relative 'attribute_post_processors/parse_uri'
|
4
|
-
require_relative 'attribute_post_processors/sanitize_html'
|
5
|
-
require_relative 'attribute_post_processors/substring'
|
6
|
-
require_relative 'attribute_post_processors/template'
|
7
|
-
|
8
1
|
module Html2rss
|
9
2
|
##
|
10
3
|
# Provides a namespace for attribute post processors.
|
11
4
|
module AttributePostProcessors
|
12
5
|
def self.get_processor(name)
|
13
|
-
|
14
|
-
|
6
|
+
@get_processor ||= Hash.new do |processors, key|
|
7
|
+
processors[key] = Utils.get_class_from_name(key, 'AttributePostProcessors')
|
8
|
+
end
|
15
9
|
|
16
|
-
|
10
|
+
@get_processor[name]
|
17
11
|
end
|
18
12
|
end
|
19
13
|
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'to_regexp'
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
##
|
6
|
+
#
|
7
|
+
# Imagine this HTML:
|
8
|
+
# <h1>Foo bar and boo</h1>
|
9
|
+
#
|
10
|
+
# YAML usage example:
|
11
|
+
# selectors:
|
12
|
+
# title:
|
13
|
+
# selector: h1
|
14
|
+
# post_process:
|
15
|
+
# name: gsub
|
16
|
+
# pattern: boo
|
17
|
+
# replacement: baz
|
18
|
+
#
|
19
|
+
# Would return:
|
20
|
+
# 'Foo bar and baz'
|
21
|
+
#
|
22
|
+
# `pattern` can be a Regexp or a String.
|
23
|
+
#
|
24
|
+
# `replacement` can be a String or a Hash.
|
25
|
+
#
|
26
|
+
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
27
|
+
class Gsub
|
28
|
+
def initialize(value, env)
|
29
|
+
@value = value
|
30
|
+
options = env[:options]
|
31
|
+
@pattern = options[:pattern].to_regexp || options[:pattern]
|
32
|
+
@replacement = options[:replacement]
|
33
|
+
end
|
34
|
+
|
35
|
+
##
|
36
|
+
# @return [String]
|
37
|
+
def get
|
38
|
+
@value.to_s.gsub(@pattern, @replacement)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'kramdown'
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
##
|
6
|
+
# Generates HTML from Markdown.
|
7
|
+
#
|
8
|
+
# It's particularly useful in conjunction with the Template post processor
|
9
|
+
# to generate a description from other selectors.
|
10
|
+
#
|
11
|
+
# YAML usage example:
|
12
|
+
#
|
13
|
+
# selectors:
|
14
|
+
# description:
|
15
|
+
# selector: section
|
16
|
+
# post_process:
|
17
|
+
# - name: template
|
18
|
+
# string: |
|
19
|
+
# # %s
|
20
|
+
#
|
21
|
+
# Price: %s
|
22
|
+
# methods:
|
23
|
+
# - self
|
24
|
+
# - price
|
25
|
+
# - name: markdown_to_html
|
26
|
+
#
|
27
|
+
# Would e.g. return:
|
28
|
+
#
|
29
|
+
# <h1>Section</h1>
|
30
|
+
#
|
31
|
+
# <p>Price: 12.34</p>
|
32
|
+
class MarkdownToHtml
|
33
|
+
def initialize(value, env)
|
34
|
+
@value = value
|
35
|
+
@env = env
|
36
|
+
end
|
37
|
+
|
38
|
+
##
|
39
|
+
# @return [String] formatted in (sanitized) HTML
|
40
|
+
def get
|
41
|
+
SanitizeHtml.new(Kramdown::Document.new(@value).to_html, @env).get
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -17,12 +17,12 @@ module Html2rss
|
|
17
17
|
# selector: span
|
18
18
|
# post_process:
|
19
19
|
# name: 'parse_time'
|
20
|
+
# time_zone: 'Europe/Berlin'
|
20
21
|
#
|
21
22
|
# Would return:
|
22
23
|
# "Tue, 02 Jul 2019 00:00:00 +0200"
|
23
24
|
#
|
24
25
|
# It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
|
25
|
-
# As of now it ignores time zones and always falls back to the UTC time zone.
|
26
26
|
class ParseTime
|
27
27
|
def initialize(value, env)
|
28
28
|
@value = value.to_s
|
@@ -5,7 +5,7 @@ module Html2rss
|
|
5
5
|
#
|
6
6
|
# Imagine this HTML structure:
|
7
7
|
#
|
8
|
-
# <span>http://why-not-use-a-link.uh</span>
|
8
|
+
# <span>http://why-not-use-a-link.uh </span>
|
9
9
|
#
|
10
10
|
# YAML usage example:
|
11
11
|
#
|
@@ -15,6 +15,7 @@ module Html2rss
|
|
15
15
|
# extractor: text
|
16
16
|
# post_process:
|
17
17
|
# name: parse_uri
|
18
|
+
#
|
18
19
|
# Would return:
|
19
20
|
# 'http://why-not-use-a-link.uh'
|
20
21
|
class ParseUri
|
@@ -25,7 +26,7 @@ module Html2rss
|
|
25
26
|
##
|
26
27
|
# @return [String]
|
27
28
|
def get
|
28
|
-
URI(@value).to_s
|
29
|
+
URI(Html2rss::Utils.sanitize_url(@value)).to_s
|
29
30
|
end
|
30
31
|
end
|
31
32
|
end
|
@@ -4,10 +4,16 @@ module Html2rss
|
|
4
4
|
module AttributePostProcessors
|
5
5
|
##
|
6
6
|
# Returns sanitized HTML code as String.
|
7
|
-
# Adds
|
8
7
|
#
|
9
|
-
#
|
10
|
-
#
|
8
|
+
# It adds:
|
9
|
+
#
|
10
|
+
# - `rel="nofollow noopener noreferrer"` to <a> tags
|
11
|
+
# - `referrer-policy='no-referrer'` to <img> tags
|
12
|
+
#
|
13
|
+
# It also:
|
14
|
+
#
|
15
|
+
# - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
|
16
|
+
# linking to the <img>'s `src`.
|
11
17
|
#
|
12
18
|
# Imagine this HTML structure:
|
13
19
|
#
|
@@ -21,7 +27,7 @@ module Html2rss
|
|
21
27
|
#
|
22
28
|
# selectors:
|
23
29
|
# description:
|
24
|
-
# selector: section
|
30
|
+
# selector: '.section'
|
25
31
|
# extractor: html
|
26
32
|
# post_process:
|
27
33
|
# name: sanitize_html
|
@@ -29,6 +35,9 @@ module Html2rss
|
|
29
35
|
# Would return:
|
30
36
|
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
31
37
|
class SanitizeHtml
|
38
|
+
URL_ELEMENTS_WITH_URL_ATTRIBUTE = { 'a' => :href, 'img' => :src }.freeze
|
39
|
+
private_constant :URL_ELEMENTS_WITH_URL_ATTRIBUTE
|
40
|
+
|
32
41
|
def initialize(value, env)
|
33
42
|
@value = value
|
34
43
|
@channel_url = env[:config].url
|
@@ -41,28 +50,22 @@ module Html2rss
|
|
41
50
|
# - adds target="_blank" to a elements
|
42
51
|
# @return [String]
|
43
52
|
def get
|
44
|
-
Sanitize.fragment(@value,
|
45
|
-
Sanitize::Config::RELAXED,
|
46
|
-
attributes: { all: %w[dir lang alt title translate] },
|
47
|
-
add_attributes: {
|
48
|
-
'a' => {
|
49
|
-
'rel' => 'nofollow noopener noreferrer',
|
50
|
-
'target' => '_blank'
|
51
|
-
},
|
52
|
-
'img' => {
|
53
|
-
'referrer-policy' => 'no-referrer'
|
54
|
-
}
|
55
|
-
},
|
56
|
-
transformers: [transform_urls_to_absolute_ones]
|
57
|
-
)).to_s.split.join(' ')
|
53
|
+
Sanitize.fragment(@value, sanitize_config).to_s.split.join(' ')
|
58
54
|
end
|
59
55
|
|
60
56
|
private
|
61
57
|
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
58
|
+
def sanitize_config
|
59
|
+
Sanitize::Config.merge(
|
60
|
+
Sanitize::Config::RELAXED,
|
61
|
+
attributes: { all: %w[dir lang alt title translate] },
|
62
|
+
add_attributes: {
|
63
|
+
'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
|
64
|
+
'img' => { 'referrer-policy' => 'no-referrer' }
|
65
|
+
},
|
66
|
+
transformers: [transform_urls_to_absolute_ones, wrap_img_in_a]
|
67
|
+
)
|
68
|
+
end
|
66
69
|
|
67
70
|
def transform_urls_to_absolute_ones
|
68
71
|
lambda do |env|
|
@@ -78,6 +81,23 @@ module Html2rss
|
|
78
81
|
env[:node][url_attribute] = absolute_url
|
79
82
|
end
|
80
83
|
end
|
84
|
+
|
85
|
+
def wrap_img_in_a
|
86
|
+
lambda do |env|
|
87
|
+
return if env[:node_name] != 'img'
|
88
|
+
|
89
|
+
img = env[:node]
|
90
|
+
|
91
|
+
return if img.parent.name == 'a'
|
92
|
+
|
93
|
+
anchor = Nokogiri::XML::Node.new('a', img)
|
94
|
+
anchor[:href] = img[:src]
|
95
|
+
|
96
|
+
anchor.add_child img.dup
|
97
|
+
|
98
|
+
img.replace(anchor)
|
99
|
+
end
|
100
|
+
end
|
81
101
|
end
|
82
102
|
end
|
83
103
|
end
|
@@ -2,9 +2,15 @@ module Html2rss
|
|
2
2
|
module AttributePostProcessors
|
3
3
|
## Returns a defined part of a String.
|
4
4
|
#
|
5
|
+
# Both parameters must be an Integer and they can be negative.
|
5
6
|
# The +end+ parameter can be omitted, in that case it will not cut the
|
6
7
|
# String at the end.
|
7
8
|
#
|
9
|
+
# Regexp and match-string arguments are not supported.
|
10
|
+
#
|
11
|
+
# See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
|
12
|
+
# documentation for more information.
|
13
|
+
#
|
8
14
|
# Imagine this HTML:
|
9
15
|
# <h1>Foo bar and baz</h1>
|
10
16
|
#
|
@@ -13,9 +19,9 @@ module Html2rss
|
|
13
19
|
# title:
|
14
20
|
# selector: h1
|
15
21
|
# post_process:
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
22
|
+
# name: substring
|
23
|
+
# start: 4
|
24
|
+
# end: 6
|
19
25
|
#
|
20
26
|
# Would return:
|
21
27
|
# 'bar'
|
@@ -28,8 +34,8 @@ module Html2rss
|
|
28
34
|
##
|
29
35
|
# @return [String]
|
30
36
|
def get
|
31
|
-
ending = @options.fetch(
|
32
|
-
@value[@options[
|
37
|
+
ending = @options.fetch(:end, @value.length).to_i
|
38
|
+
@value[@options[:start].to_i..ending]
|
33
39
|
end
|
34
40
|
end
|
35
41
|
end
|
@@ -4,7 +4,8 @@ module Html2rss
|
|
4
4
|
module AttributePostProcessors
|
5
5
|
## Returns a formatted String according to the string pattern.
|
6
6
|
#
|
7
|
-
# If +self+ is
|
7
|
+
# If +self+ is used, the selector's extracted value will be used.
|
8
|
+
# It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format)
|
8
9
|
#
|
9
10
|
# Imagine this HTML:
|
10
11
|
# <li>
|
@@ -22,11 +23,8 @@ module Html2rss
|
|
22
23
|
# title:
|
23
24
|
# selector: h1
|
24
25
|
# post_process:
|
25
|
-
#
|
26
|
-
#
|
27
|
-
# methods:
|
28
|
-
# - self
|
29
|
-
# - price
|
26
|
+
# name: template
|
27
|
+
# string: '%{self} (%{price})'
|
30
28
|
#
|
31
29
|
# Would return:
|
32
30
|
# 'Product (23,42€)'
|
@@ -35,25 +33,36 @@ module Html2rss
|
|
35
33
|
@value = value
|
36
34
|
@options = env[:options]
|
37
35
|
@item = env[:item]
|
36
|
+
@string = @options[:string]
|
38
37
|
end
|
39
38
|
|
40
39
|
##
|
41
|
-
# - uses {http://ruby-doc.org/core-2.6.3/String.html#method-i-25 String#%}
|
42
40
|
# @return [String]
|
43
41
|
def get
|
44
|
-
|
42
|
+
return format_string_with_methods if @options[:methods]
|
43
|
+
|
44
|
+
names = string.scan(/%[<|{](\w*)[>|}]/)
|
45
|
+
names.flatten!
|
46
|
+
names.compact!
|
47
|
+
names.map!(&:to_sym)
|
48
|
+
|
49
|
+
format(string, names.map { |name| [name, item_value(name)] }.to_h)
|
45
50
|
end
|
46
51
|
|
47
52
|
private
|
48
53
|
|
49
|
-
|
50
|
-
@options['string']
|
51
|
-
end
|
54
|
+
attr_reader :string
|
52
55
|
|
53
56
|
def methods
|
54
|
-
@methods ||= @options[
|
55
|
-
|
56
|
-
|
57
|
+
@methods ||= @options[:methods].map(&method(:item_value))
|
58
|
+
end
|
59
|
+
|
60
|
+
def format_string_with_methods
|
61
|
+
string % methods
|
62
|
+
end
|
63
|
+
|
64
|
+
def item_value(method_name)
|
65
|
+
method_name.to_sym == :self ? @value.to_s : @item.public_send(method_name).to_s
|
57
66
|
end
|
58
67
|
end
|
59
68
|
end
|
data/lib/html2rss/config.rb
CHANGED
@@ -1,71 +1,91 @@
|
|
1
|
+
require 'active_support/core_ext/hash'
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
##
|
3
5
|
# The Config class abstracts from the config data structure and
|
4
6
|
# provides default values.
|
5
7
|
class Config
|
6
8
|
def initialize(feed_config, global_config = {})
|
7
|
-
@global_config =
|
8
|
-
@feed_config =
|
9
|
-
@channel_config =
|
9
|
+
@global_config = global_config.deep_symbolize_keys
|
10
|
+
@feed_config = feed_config.deep_symbolize_keys
|
11
|
+
@channel_config = @feed_config.fetch(:channel, {})
|
10
12
|
end
|
11
13
|
|
12
14
|
def author
|
13
|
-
channel_config.fetch
|
15
|
+
channel_config.fetch :author, 'html2rss'
|
14
16
|
end
|
15
17
|
|
16
18
|
def ttl
|
17
|
-
channel_config.fetch
|
19
|
+
channel_config.fetch :ttl, 360
|
18
20
|
end
|
19
21
|
|
20
22
|
def title
|
21
|
-
channel_config.fetch
|
23
|
+
channel_config.fetch(:title) { generated_title }
|
24
|
+
end
|
25
|
+
|
26
|
+
def generated_title
|
27
|
+
uri = URI(url)
|
28
|
+
|
29
|
+
nicer_path = uri.path.split('/')
|
30
|
+
nicer_path.reject! { |part| part == '' }
|
31
|
+
|
32
|
+
nicer_path.any? ? "#{uri.host}: #{nicer_path.join(' ').titleize}" : uri.host
|
22
33
|
end
|
23
34
|
|
24
35
|
def language
|
25
|
-
channel_config.fetch
|
36
|
+
channel_config.fetch :language, 'en'
|
26
37
|
end
|
27
38
|
|
28
39
|
def description
|
29
|
-
channel_config.fetch
|
40
|
+
channel_config.fetch :description, "Latest items from #{url}."
|
30
41
|
end
|
31
42
|
|
32
43
|
def url
|
33
|
-
channel_config.dig
|
44
|
+
channel_config.dig :url
|
34
45
|
end
|
35
46
|
alias link url
|
36
47
|
|
37
48
|
def time_zone
|
38
|
-
channel_config.fetch
|
49
|
+
channel_config.fetch :time_zone, 'UTC'
|
39
50
|
end
|
40
51
|
|
41
52
|
def json?
|
42
|
-
channel_config.fetch
|
53
|
+
channel_config.fetch :json, false
|
43
54
|
end
|
44
55
|
|
45
56
|
def headers
|
46
|
-
global_config.fetch(
|
57
|
+
global_config.fetch(:headers, {}).merge(channel_config.fetch(:headers, {}))
|
47
58
|
end
|
48
59
|
|
49
60
|
def attribute_options(name)
|
50
|
-
feed_config.dig(
|
61
|
+
feed_config.dig(:selectors).fetch(name, {}).merge(channel: channel_config)
|
51
62
|
end
|
52
63
|
|
53
64
|
def attribute?(name)
|
54
|
-
attribute_names.include?(name
|
65
|
+
attribute_names.include?(name)
|
55
66
|
end
|
56
67
|
|
57
|
-
def
|
58
|
-
feed_config.dig(
|
68
|
+
def category_selectors
|
69
|
+
categories = feed_config.dig(:selectors, :categories)
|
70
|
+
return [] unless categories
|
71
|
+
|
72
|
+
categories = categories.keep_if { |category| category.to_s != '' }
|
73
|
+
categories.map!(&:to_sym)
|
74
|
+
categories.uniq!
|
75
|
+
|
76
|
+
categories
|
59
77
|
end
|
60
78
|
|
61
79
|
def selector(name)
|
62
|
-
feed_config.dig(
|
80
|
+
feed_config.dig(:selectors, name, :selector)
|
63
81
|
end
|
64
82
|
|
65
83
|
def attribute_names
|
66
|
-
@attribute_names ||= feed_config.fetch(
|
67
|
-
|
68
|
-
|
84
|
+
@attribute_names ||= feed_config.fetch(:selectors, {}).keys.tap { |attrs| attrs.delete(:items) }
|
85
|
+
end
|
86
|
+
|
87
|
+
def items_order
|
88
|
+
feed_config.dig(:selectors, :items, :order)&.to_sym
|
69
89
|
end
|
70
90
|
|
71
91
|
private
|