html2rss 0.3.3 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +18 -11
- data/.travis.yml +3 -3
- data/.yardopts +6 -0
- data/Gemfile.lock +23 -5
- data/README.md +2 -1
- data/docs/Html2rss.html +353 -0
- data/docs/Html2rss/AttributePostProcessors.html +203 -0
- data/docs/Html2rss/AttributePostProcessors/ParseTime.html +332 -0
- data/docs/Html2rss/AttributePostProcessors/ParseUri.html +314 -0
- data/docs/Html2rss/AttributePostProcessors/SanitizeHtml.html +346 -0
- data/docs/Html2rss/AttributePostProcessors/Substring.html +321 -0
- data/docs/Html2rss/AttributePostProcessors/Template.html +336 -0
- data/docs/Html2rss/Config.html +795 -0
- data/docs/Html2rss/FeedBuilder.html +295 -0
- data/docs/Html2rss/Item.html +654 -0
- data/docs/Html2rss/ItemExtractors.html +297 -0
- data/docs/Html2rss/ItemExtractors/Attribute.html +317 -0
- data/docs/Html2rss/ItemExtractors/CurrentTime.html +297 -0
- data/docs/Html2rss/ItemExtractors/Href.html +319 -0
- data/docs/Html2rss/ItemExtractors/Html.html +314 -0
- data/docs/Html2rss/ItemExtractors/Static.html +301 -0
- data/docs/Html2rss/ItemExtractors/Text.html +312 -0
- data/docs/Html2rss/Utils.html +115 -0
- data/docs/Html2rss/Utils/IndifferentAccessHash.html +142 -0
- data/docs/_index.html +300 -0
- data/docs/class_list.html +51 -0
- data/docs/css/common.css +1 -0
- data/docs/css/full_list.css +58 -0
- data/docs/css/style.css +496 -0
- data/docs/file.README.html +135 -0
- data/docs/file_list.html +56 -0
- data/docs/frames.html +17 -0
- data/docs/index.html +135 -0
- data/docs/js/app.js +303 -0
- data/docs/js/full_list.js +216 -0
- data/docs/js/jquery.js +4 -0
- data/docs/method_list.html +435 -0
- data/docs/top-level-namespace.html +110 -0
- data/html2rss.gemspec +3 -0
- data/lib/html2rss.rb +19 -4
- data/lib/html2rss/attribute_post_processors.rb +5 -3
- data/lib/html2rss/attribute_post_processors/parse_time.rb +29 -3
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +20 -1
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +65 -3
- data/lib/html2rss/attribute_post_processors/substring.rb +24 -3
- data/lib/html2rss/attribute_post_processors/template.rb +37 -10
- data/lib/html2rss/config.rb +11 -12
- data/lib/html2rss/feed_builder.rb +8 -6
- data/lib/html2rss/item.rb +28 -19
- data/lib/html2rss/item_extractors.rb +29 -0
- data/lib/html2rss/item_extractors/attribute.rb +37 -0
- data/lib/html2rss/item_extractors/current_time.rb +21 -0
- data/lib/html2rss/item_extractors/href.rb +36 -0
- data/lib/html2rss/item_extractors/html.rb +34 -0
- data/lib/html2rss/item_extractors/static.rb +28 -0
- data/lib/html2rss/item_extractors/text.rb +32 -0
- data/lib/html2rss/utils.rb +25 -0
- data/lib/html2rss/version.rb +1 -1
- metadata +88 -4
- data/lib/html2rss/item_extractor.rb +0 -37
@@ -0,0 +1,110 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta charset="utf-8">
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6
|
+
<title>
|
7
|
+
Top Level Namespace
|
8
|
+
|
9
|
+
— Documentation by YARD 0.9.20
|
10
|
+
|
11
|
+
</title>
|
12
|
+
|
13
|
+
<link rel="stylesheet" href="css/style.css" type="text/css" charset="utf-8" />
|
14
|
+
|
15
|
+
<link rel="stylesheet" href="css/common.css" type="text/css" charset="utf-8" />
|
16
|
+
|
17
|
+
<script type="text/javascript" charset="utf-8">
|
18
|
+
pathId = "";
|
19
|
+
relpath = '';
|
20
|
+
</script>
|
21
|
+
|
22
|
+
|
23
|
+
<script type="text/javascript" charset="utf-8" src="js/jquery.js"></script>
|
24
|
+
|
25
|
+
<script type="text/javascript" charset="utf-8" src="js/app.js"></script>
|
26
|
+
|
27
|
+
|
28
|
+
</head>
|
29
|
+
<body>
|
30
|
+
<div class="nav_wrap">
|
31
|
+
<iframe id="nav" src="class_list.html?1"></iframe>
|
32
|
+
<div id="resizer"></div>
|
33
|
+
</div>
|
34
|
+
|
35
|
+
<div id="main" tabindex="-1">
|
36
|
+
<div id="header">
|
37
|
+
<div id="menu">
|
38
|
+
|
39
|
+
<a href="_index.html">Index</a> »
|
40
|
+
|
41
|
+
|
42
|
+
<span class="title">Top Level Namespace</span>
|
43
|
+
|
44
|
+
</div>
|
45
|
+
|
46
|
+
<div id="search">
|
47
|
+
|
48
|
+
<a class="full_list_link" id="class_list_link"
|
49
|
+
href="class_list.html">
|
50
|
+
|
51
|
+
<svg width="24" height="24">
|
52
|
+
<rect x="0" y="4" width="24" height="4" rx="1" ry="1"></rect>
|
53
|
+
<rect x="0" y="12" width="24" height="4" rx="1" ry="1"></rect>
|
54
|
+
<rect x="0" y="20" width="24" height="4" rx="1" ry="1"></rect>
|
55
|
+
</svg>
|
56
|
+
</a>
|
57
|
+
|
58
|
+
</div>
|
59
|
+
<div class="clear"></div>
|
60
|
+
</div>
|
61
|
+
|
62
|
+
<div id="content"><h1>Top Level Namespace
|
63
|
+
|
64
|
+
|
65
|
+
|
66
|
+
</h1>
|
67
|
+
<div class="box_info">
|
68
|
+
|
69
|
+
|
70
|
+
|
71
|
+
|
72
|
+
|
73
|
+
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
</div>
|
80
|
+
|
81
|
+
<h2>Defined Under Namespace</h2>
|
82
|
+
<p class="children">
|
83
|
+
|
84
|
+
|
85
|
+
<strong class="modules">Modules:</strong> <span class='object_link'><a href="Html2rss.html" title="Html2rss (module)">Html2rss</a></span>
|
86
|
+
|
87
|
+
|
88
|
+
|
89
|
+
|
90
|
+
</p>
|
91
|
+
|
92
|
+
|
93
|
+
|
94
|
+
|
95
|
+
|
96
|
+
|
97
|
+
|
98
|
+
|
99
|
+
|
100
|
+
</div>
|
101
|
+
|
102
|
+
<div id="footer">
|
103
|
+
Generated on Sun Jul 14 19:35:05 2019 by
|
104
|
+
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
105
|
+
0.9.20 (ruby-2.6.3).
|
106
|
+
</div>
|
107
|
+
|
108
|
+
</div>
|
109
|
+
</body>
|
110
|
+
</html>
|
data/html2rss.gemspec
CHANGED
@@ -27,6 +27,7 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
28
28
|
spec.require_paths = ['lib']
|
29
29
|
|
30
|
+
spec.add_dependency 'activesupport', '~> 5.0'
|
30
31
|
spec.add_dependency 'faraday', '~> 0.15'
|
31
32
|
spec.add_dependency 'faraday_middleware', '~> 0.13'
|
32
33
|
spec.add_dependency 'hashie', '~> 3.6'
|
@@ -37,6 +38,8 @@ Gem::Specification.new do |spec|
|
|
37
38
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
38
39
|
spec.add_development_dependency 'rubocop'
|
39
40
|
spec.add_development_dependency 'rubocop-performance'
|
41
|
+
spec.add_development_dependency 'rubocop-rspec'
|
40
42
|
spec.add_development_dependency 'simplecov'
|
41
43
|
spec.add_development_dependency 'vcr'
|
44
|
+
spec.add_development_dependency 'yard'
|
42
45
|
end
|
data/lib/html2rss.rb
CHANGED
@@ -1,28 +1,41 @@
|
|
1
1
|
require 'html2rss/config'
|
2
2
|
require 'html2rss/feed_builder'
|
3
3
|
require 'html2rss/version'
|
4
|
+
require 'html2rss/utils'
|
4
5
|
require 'yaml'
|
5
6
|
|
6
7
|
module Html2rss
|
8
|
+
##
|
9
|
+
# Returns an RSS object which is generated from the provided file.
|
10
|
+
#
|
11
|
+
# `file`: the path to a YAML config file
|
12
|
+
# `name`: the name of the feed
|
13
|
+
#
|
14
|
+
# Example:
|
15
|
+
#
|
16
|
+
# feed = Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')
|
17
|
+
# # => #<RSS::Rss:0x00007fb2f6331228 ...>
|
18
|
+
# @return [RSS::Rss]
|
7
19
|
def self.feed_from_yaml_config(file, name)
|
8
20
|
# rubocop:disable Security/YAMLLoad
|
9
21
|
yaml = YAML.load(File.open(file))
|
10
22
|
# rubocop:enable Security/YAMLLoad
|
11
23
|
|
12
24
|
feed_config = yaml['feeds'][name]
|
13
|
-
global_config = yaml.reject { |
|
25
|
+
global_config = yaml.reject { |key| key == 'feeds' }
|
14
26
|
|
15
27
|
config = Config.new(feed_config, global_config)
|
16
28
|
feed(config)
|
17
29
|
end
|
18
30
|
|
19
31
|
##
|
20
|
-
# Returns
|
32
|
+
# Returns an RSS object which is generated from the provided config.
|
21
33
|
#
|
22
34
|
# `config`: can be a Hash or an instance of Html2rss::Config.
|
23
35
|
#
|
24
|
-
#
|
25
|
-
#
|
36
|
+
# Example:
|
37
|
+
#
|
38
|
+
# feed = Html2rss.feed(
|
26
39
|
# channel: { name: 'StackOverflow: Hot Network Questions', url: 'https://stackoverflow.com' },
|
27
40
|
# selectors: {
|
28
41
|
# items: { selector: '#hot-network-questions > ul > li' },
|
@@ -30,6 +43,8 @@ module Html2rss
|
|
30
43
|
# link: { selector: 'a', extractor: 'href' }
|
31
44
|
# }
|
32
45
|
# )
|
46
|
+
# # => #<RSS::Rss:0x00007fb2f48d14a0 ...>
|
47
|
+
# @return [RSS::Rss]
|
33
48
|
def self.feed(config)
|
34
49
|
config = Config.new(config) unless config.is_a?(Config)
|
35
50
|
|
@@ -5,10 +5,12 @@ require_relative 'attribute_post_processors/substring'
|
|
5
5
|
require_relative 'attribute_post_processors/template'
|
6
6
|
|
7
7
|
module Html2rss
|
8
|
+
##
|
9
|
+
# Provides a namespace for attribute post processors.
|
8
10
|
module AttributePostProcessors
|
9
|
-
def self.get_processor(
|
10
|
-
|
11
|
-
class_name = ['Html2rss', 'AttributePostProcessors',
|
11
|
+
def self.get_processor(name)
|
12
|
+
camel_cased_name = name.split('_').map(&:capitalize).join
|
13
|
+
class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_name].join('::')
|
12
14
|
|
13
15
|
Object.const_get(class_name)
|
14
16
|
end
|
@@ -1,12 +1,38 @@
|
|
1
|
+
require 'active_support'
|
2
|
+
require 'active_support/core_ext/time'
|
3
|
+
|
1
4
|
module Html2rss
|
2
5
|
module AttributePostProcessors
|
6
|
+
##
|
7
|
+
# Returns the {https://www.w3.org/Protocols/rfc822/ RFC822} representation of a time.
|
8
|
+
#
|
9
|
+
# Imagine this HTML structure:
|
10
|
+
#
|
11
|
+
# <p>Published on <span>2019-07-02</span></p>
|
12
|
+
#
|
13
|
+
# YAML usage example:
|
14
|
+
#
|
15
|
+
# selectors:
|
16
|
+
# description:
|
17
|
+
# selector: span
|
18
|
+
# post_process:
|
19
|
+
# name: 'parse_time'
|
20
|
+
#
|
21
|
+
# Would return:
|
22
|
+
# "Tue, 02 Jul 2019 00:00:00 +0200"
|
23
|
+
#
|
24
|
+
# It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
|
25
|
+
# The value is parsed in the time zone taken from the feed config (`env[:config].time_zone`).
|
3
26
|
class ParseTime
|
4
|
-
def initialize(value,
|
5
|
-
@value = value
|
27
|
+
def initialize(value, env)
|
28
|
+
@value = value.to_s
|
29
|
+
@time_zone = env[:config].time_zone
|
6
30
|
end
|
7
31
|
|
32
|
+
##
|
33
|
+
# @return [String] rfc822 formatted time
|
8
34
|
def get
|
9
|
-
Time.parse(@value).rfc822
|
35
|
+
Time.use_zone(@time_zone) { Time.zone.parse(@value).rfc822 }
|
10
36
|
end
|
11
37
|
end
|
12
38
|
end
|
@@ -1,10 +1,29 @@
|
|
1
1
|
module Html2rss
|
2
2
|
module AttributePostProcessors
|
3
|
+
##
|
4
|
+
# Returns the URI as String.
|
5
|
+
#
|
6
|
+
# Imagine this HTML structure:
|
7
|
+
#
|
8
|
+
# <span>http://why-not-use-a-link.uh</span>
|
9
|
+
#
|
10
|
+
# YAML usage example:
|
11
|
+
#
|
12
|
+
# selectors:
|
13
|
+
# link:
|
14
|
+
# selector: span
|
15
|
+
# extractor: text
|
16
|
+
# post_process:
|
17
|
+
# name: parse_uri
|
18
|
+
# Would return:
|
19
|
+
# 'http://why-not-use-a-link.uh'
|
3
20
|
class ParseUri
|
4
|
-
def initialize(value,
|
21
|
+
def initialize(value, _env)
|
5
22
|
@value = value
|
6
23
|
end
|
7
24
|
|
25
|
+
##
|
26
|
+
# @return [String]
|
8
27
|
def get
|
9
28
|
URI(@value).to_s
|
10
29
|
end
|
@@ -2,21 +2,83 @@ require 'sanitize'
|
|
2
2
|
|
3
3
|
module Html2rss
|
4
4
|
module AttributePostProcessors
|
5
|
+
##
|
6
|
+
# Returns sanitized HTML code as String.
|
7
|
+
# Adds
|
8
|
+
#
|
9
|
+
# - rel="nofollow noopener noreferrer" to a elements
|
10
|
+
# - referrer-policy='no-referrer' to img elements
|
11
|
+
#
|
12
|
+
# Imagine this HTML structure:
|
13
|
+
#
|
14
|
+
# <section>
|
15
|
+
# Lorem <b>ipsum</b> dolor...
|
16
|
+
# <iframe src="https://evil.corp/miner"></iframe>
|
17
|
+
# <script>alert();</script>
|
18
|
+
# </section>
|
19
|
+
#
|
20
|
+
# YAML usage example:
|
21
|
+
#
|
22
|
+
# selectors:
|
23
|
+
# description:
|
24
|
+
# selector: section
|
25
|
+
# extractor: html
|
26
|
+
# post_process:
|
27
|
+
# name: sanitize_html
|
28
|
+
#
|
29
|
+
# Would return:
|
30
|
+
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
5
31
|
class SanitizeHtml
|
6
|
-
def initialize(value,
|
32
|
+
def initialize(value, env)
|
7
33
|
@value = value
|
34
|
+
@channel_url = env[:config].url
|
8
35
|
end
|
9
36
|
|
37
|
+
##
|
38
|
+
# - uses the {https://github.com/rgrove/sanitize sanitize gem}
|
39
|
+
# - uses the config {https://github.com/rgrove/sanitize#sanitizeconfigrelaxed Sanitize::Config::RELAXED}
|
40
|
+
# - adds rel="nofollow noopener noreferrer" to a elements
|
41
|
+
# - adds target="_blank" to a elements
|
42
|
+
# @return [String]
|
10
43
|
def get
|
11
44
|
Sanitize.fragment(@value, Sanitize::Config.merge(
|
12
45
|
Sanitize::Config::RELAXED,
|
46
|
+
attributes: {
|
47
|
+
all: %w[dir lang alt title translate]
|
48
|
+
},
|
13
49
|
add_attributes: {
|
14
50
|
'a' => {
|
15
51
|
'rel' => 'nofollow noopener noreferrer',
|
16
52
|
'target' => '_blank'
|
53
|
+
},
|
54
|
+
'img' => {
|
55
|
+
'referrer-policy' => 'no-referrer'
|
17
56
|
}
|
18
|
-
}
|
19
|
-
|
57
|
+
},
|
58
|
+
transformers: [transform_urls_to_absolute_ones]
|
59
|
+
)).to_s.split.join(' ')
|
60
|
+
end
|
61
|
+
|
62
|
+
private
|
63
|
+
|
64
|
+
URL_ELEMENTS_WITH_URL_ATTRIBUTE = {
|
65
|
+
'a' => :href,
|
66
|
+
'img' => :src
|
67
|
+
}.freeze
|
68
|
+
|
69
|
+
def transform_urls_to_absolute_ones
|
70
|
+
lambda do |env|
|
71
|
+
return unless URL_ELEMENTS_WITH_URL_ATTRIBUTE.key?(env[:node_name])
|
72
|
+
|
73
|
+
url_attribute = URL_ELEMENTS_WITH_URL_ATTRIBUTE[env[:node_name]]
|
74
|
+
url = env[:node][url_attribute]
|
75
|
+
|
76
|
+
return if URI(url).absolute?
|
77
|
+
|
78
|
+
absolute_url = Html2rss::Utils.build_absolute_url_from_relative(url, @channel_url)
|
79
|
+
|
80
|
+
env[:node][url_attribute] = absolute_url
|
81
|
+
end
|
20
82
|
end
|
21
83
|
end
|
22
84
|
end
|
@@ -1,13 +1,34 @@
|
|
1
1
|
module Html2rss
|
2
2
|
module AttributePostProcessors
|
3
|
+
## Returns a defined part of a String.
|
4
|
+
#
|
5
|
+
# The +end+ parameter can be omitted, in that case it will not cut the
|
6
|
+
# String at the end.
|
7
|
+
#
|
8
|
+
# Imagine this HTML:
|
9
|
+
# <h1>Foo bar and baz</h1>
|
10
|
+
#
|
11
|
+
# YAML usage example:
|
12
|
+
# selectors:
|
13
|
+
# title:
|
14
|
+
# selector: h1
|
15
|
+
# post_process:
|
16
|
+
# name: substring
|
17
|
+
# start: 4
|
18
|
+
# end: 6
|
19
|
+
#
|
20
|
+
# Would return:
|
21
|
+
# 'bar'
|
3
22
|
class Substring
|
4
|
-
def initialize(value,
|
23
|
+
def initialize(value, env)
|
5
24
|
@value = value
|
6
|
-
@options = options
|
25
|
+
@options = env[:options]
|
7
26
|
end
|
8
27
|
|
28
|
+
##
|
29
|
+
# @return [String]
|
9
30
|
def get
|
10
|
-
ending = @options.fetch('end',
|
31
|
+
ending = @options.fetch('end', @value.length).to_i
|
11
32
|
@value[@options['start'].to_i..ending]
|
12
33
|
end
|
13
34
|
end
|
@@ -2,13 +2,44 @@ require 'sanitize'
|
|
2
2
|
|
3
3
|
module Html2rss
|
4
4
|
module AttributePostProcessors
|
5
|
+
## Returns a formatted String according to the string pattern.
|
6
|
+
#
|
7
|
+
# If +self+ is given as a method, the extracted value will be used.
|
8
|
+
#
|
9
|
+
# Imagine this HTML:
|
10
|
+
# <li>
|
11
|
+
# <h1>Product</h1>
|
12
|
+
# <span class="price">23,42€</span>
|
13
|
+
# </li>
|
14
|
+
#
|
15
|
+
# YAML usage example:
|
16
|
+
#
|
17
|
+
# selectors:
|
18
|
+
# items:
|
19
|
+
# selector: 'li'
|
20
|
+
# price:
|
21
|
+
# selector: '.price'
|
22
|
+
# title:
|
23
|
+
# selector: h1
|
24
|
+
# post_process:
|
25
|
+
# name: template
|
26
|
+
# string: '%s (%s)'
|
27
|
+
# methods:
|
28
|
+
# - self
|
29
|
+
# - price
|
30
|
+
#
|
31
|
+
# Would return:
|
32
|
+
# 'Product (23,42€)'
|
5
33
|
class Template
|
6
|
-
def initialize(value,
|
34
|
+
def initialize(value, env)
|
7
35
|
@value = value
|
8
|
-
@options = options
|
9
|
-
@item = item
|
36
|
+
@options = env[:options]
|
37
|
+
@item = env[:item]
|
10
38
|
end
|
11
39
|
|
40
|
+
##
|
41
|
+
# - uses {http://ruby-doc.org/core-2.6.3/String.html#method-i-25 String#%}
|
42
|
+
# @return [String]
|
12
43
|
def get
|
13
44
|
string % methods
|
14
45
|
end
|
@@ -20,13 +51,9 @@ module Html2rss
|
|
20
51
|
end
|
21
52
|
|
22
53
|
def methods
|
23
|
-
@methods ||= @options['methods'].map
|
24
|
-
|
25
|
-
|
26
|
-
else
|
27
|
-
@item.send(method.to_sym)&.to_s
|
28
|
-
end
|
29
|
-
}
|
54
|
+
@methods ||= @options['methods'].map do |method|
|
55
|
+
method == 'self' ? @value.to_s : @item.public_send(method.to_sym).to_s
|
56
|
+
end
|
30
57
|
end
|
31
58
|
end
|
32
59
|
end
|