html2rss 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +18 -11
- data/.travis.yml +3 -3
- data/.yardopts +6 -0
- data/Gemfile.lock +23 -5
- data/README.md +2 -1
- data/docs/Html2rss.html +353 -0
- data/docs/Html2rss/AttributePostProcessors.html +203 -0
- data/docs/Html2rss/AttributePostProcessors/ParseTime.html +332 -0
- data/docs/Html2rss/AttributePostProcessors/ParseUri.html +314 -0
- data/docs/Html2rss/AttributePostProcessors/SanitizeHtml.html +346 -0
- data/docs/Html2rss/AttributePostProcessors/Substring.html +321 -0
- data/docs/Html2rss/AttributePostProcessors/Template.html +336 -0
- data/docs/Html2rss/Config.html +795 -0
- data/docs/Html2rss/FeedBuilder.html +295 -0
- data/docs/Html2rss/Item.html +654 -0
- data/docs/Html2rss/ItemExtractors.html +297 -0
- data/docs/Html2rss/ItemExtractors/Attribute.html +317 -0
- data/docs/Html2rss/ItemExtractors/CurrentTime.html +297 -0
- data/docs/Html2rss/ItemExtractors/Href.html +319 -0
- data/docs/Html2rss/ItemExtractors/Html.html +314 -0
- data/docs/Html2rss/ItemExtractors/Static.html +301 -0
- data/docs/Html2rss/ItemExtractors/Text.html +312 -0
- data/docs/Html2rss/Utils.html +115 -0
- data/docs/Html2rss/Utils/IndifferentAccessHash.html +142 -0
- data/docs/_index.html +300 -0
- data/docs/class_list.html +51 -0
- data/docs/css/common.css +1 -0
- data/docs/css/full_list.css +58 -0
- data/docs/css/style.css +496 -0
- data/docs/file.README.html +135 -0
- data/docs/file_list.html +56 -0
- data/docs/frames.html +17 -0
- data/docs/index.html +135 -0
- data/docs/js/app.js +303 -0
- data/docs/js/full_list.js +216 -0
- data/docs/js/jquery.js +4 -0
- data/docs/method_list.html +435 -0
- data/docs/top-level-namespace.html +110 -0
- data/html2rss.gemspec +3 -0
- data/lib/html2rss.rb +19 -4
- data/lib/html2rss/attribute_post_processors.rb +5 -3
- data/lib/html2rss/attribute_post_processors/parse_time.rb +29 -3
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +20 -1
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +65 -3
- data/lib/html2rss/attribute_post_processors/substring.rb +24 -3
- data/lib/html2rss/attribute_post_processors/template.rb +37 -10
- data/lib/html2rss/config.rb +11 -12
- data/lib/html2rss/feed_builder.rb +8 -6
- data/lib/html2rss/item.rb +28 -19
- data/lib/html2rss/item_extractors.rb +29 -0
- data/lib/html2rss/item_extractors/attribute.rb +37 -0
- data/lib/html2rss/item_extractors/current_time.rb +21 -0
- data/lib/html2rss/item_extractors/href.rb +36 -0
- data/lib/html2rss/item_extractors/html.rb +34 -0
- data/lib/html2rss/item_extractors/static.rb +28 -0
- data/lib/html2rss/item_extractors/text.rb +32 -0
- data/lib/html2rss/utils.rb +25 -0
- data/lib/html2rss/version.rb +1 -1
- metadata +88 -4
- data/lib/html2rss/item_extractor.rb +0 -37
@@ -0,0 +1,110 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<meta charset="utf-8">
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
6
|
+
<title>
|
7
|
+
Top Level Namespace
|
8
|
+
|
9
|
+
— Documentation by YARD 0.9.20
|
10
|
+
|
11
|
+
</title>
|
12
|
+
|
13
|
+
<link rel="stylesheet" href="css/style.css" type="text/css" charset="utf-8" />
|
14
|
+
|
15
|
+
<link rel="stylesheet" href="css/common.css" type="text/css" charset="utf-8" />
|
16
|
+
|
17
|
+
<script type="text/javascript" charset="utf-8">
|
18
|
+
pathId = "";
|
19
|
+
relpath = '';
|
20
|
+
</script>
|
21
|
+
|
22
|
+
|
23
|
+
<script type="text/javascript" charset="utf-8" src="js/jquery.js"></script>
|
24
|
+
|
25
|
+
<script type="text/javascript" charset="utf-8" src="js/app.js"></script>
|
26
|
+
|
27
|
+
|
28
|
+
</head>
|
29
|
+
<body>
|
30
|
+
<div class="nav_wrap">
|
31
|
+
<iframe id="nav" src="class_list.html?1"></iframe>
|
32
|
+
<div id="resizer"></div>
|
33
|
+
</div>
|
34
|
+
|
35
|
+
<div id="main" tabindex="-1">
|
36
|
+
<div id="header">
|
37
|
+
<div id="menu">
|
38
|
+
|
39
|
+
<a href="_index.html">Index</a> »
|
40
|
+
|
41
|
+
|
42
|
+
<span class="title">Top Level Namespace</span>
|
43
|
+
|
44
|
+
</div>
|
45
|
+
|
46
|
+
<div id="search">
|
47
|
+
|
48
|
+
<a class="full_list_link" id="class_list_link"
|
49
|
+
href="class_list.html">
|
50
|
+
|
51
|
+
<svg width="24" height="24">
|
52
|
+
<rect x="0" y="4" width="24" height="4" rx="1" ry="1"></rect>
|
53
|
+
<rect x="0" y="12" width="24" height="4" rx="1" ry="1"></rect>
|
54
|
+
<rect x="0" y="20" width="24" height="4" rx="1" ry="1"></rect>
|
55
|
+
</svg>
|
56
|
+
</a>
|
57
|
+
|
58
|
+
</div>
|
59
|
+
<div class="clear"></div>
|
60
|
+
</div>
|
61
|
+
|
62
|
+
<div id="content"><h1>Top Level Namespace
|
63
|
+
|
64
|
+
|
65
|
+
|
66
|
+
</h1>
|
67
|
+
<div class="box_info">
|
68
|
+
|
69
|
+
|
70
|
+
|
71
|
+
|
72
|
+
|
73
|
+
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
|
78
|
+
|
79
|
+
</div>
|
80
|
+
|
81
|
+
<h2>Defined Under Namespace</h2>
|
82
|
+
<p class="children">
|
83
|
+
|
84
|
+
|
85
|
+
<strong class="modules">Modules:</strong> <span class='object_link'><a href="Html2rss.html" title="Html2rss (module)">Html2rss</a></span>
|
86
|
+
|
87
|
+
|
88
|
+
|
89
|
+
|
90
|
+
</p>
|
91
|
+
|
92
|
+
|
93
|
+
|
94
|
+
|
95
|
+
|
96
|
+
|
97
|
+
|
98
|
+
|
99
|
+
|
100
|
+
</div>
|
101
|
+
|
102
|
+
<div id="footer">
|
103
|
+
Generated on Sun Jul 14 19:35:05 2019 by
|
104
|
+
<a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
|
105
|
+
0.9.20 (ruby-2.6.3).
|
106
|
+
</div>
|
107
|
+
|
108
|
+
</div>
|
109
|
+
</body>
|
110
|
+
</html>
|
data/html2rss.gemspec
CHANGED
@@ -27,6 +27,7 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
28
28
|
spec.require_paths = ['lib']
|
29
29
|
|
30
|
+
spec.add_dependency 'activesupport', '~> 5.0'
|
30
31
|
spec.add_dependency 'faraday', '~> 0.15'
|
31
32
|
spec.add_dependency 'faraday_middleware', '~> 0.13'
|
32
33
|
spec.add_dependency 'hashie', '~> 3.6'
|
@@ -37,6 +38,8 @@ Gem::Specification.new do |spec|
|
|
37
38
|
spec.add_development_dependency 'rspec', '~> 3.0'
|
38
39
|
spec.add_development_dependency 'rubocop'
|
39
40
|
spec.add_development_dependency 'rubocop-performance'
|
41
|
+
spec.add_development_dependency 'rubocop-rspec'
|
40
42
|
spec.add_development_dependency 'simplecov'
|
41
43
|
spec.add_development_dependency 'vcr'
|
44
|
+
spec.add_development_dependency 'yard'
|
42
45
|
end
|
data/lib/html2rss.rb
CHANGED
@@ -1,28 +1,41 @@
|
|
1
1
|
require 'html2rss/config'
|
2
2
|
require 'html2rss/feed_builder'
|
3
3
|
require 'html2rss/version'
|
4
|
+
require 'html2rss/utils'
|
4
5
|
require 'yaml'
|
5
6
|
|
6
7
|
module Html2rss
|
8
|
+
##
|
9
|
+
# Returns a RSS object which is generated from the provided file.
|
10
|
+
#
|
11
|
+
# `file`: the path to a YAML config file
|
12
|
+
# `name`: the name of the feed
|
13
|
+
#
|
14
|
+
# Example:
|
15
|
+
#
|
16
|
+
# feed = Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')
|
17
|
+
# # => #<RSS::Rss:0x00007fb2f6331228
|
18
|
+
# @return [RSS::Rss]
|
7
19
|
def self.feed_from_yaml_config(file, name)
|
8
20
|
# rubocop:disable Security/YAMLLoad
|
9
21
|
yaml = YAML.load(File.open(file))
|
10
22
|
# rubocop:enable Security/YAMLLoad
|
11
23
|
|
12
24
|
feed_config = yaml['feeds'][name]
|
13
|
-
global_config = yaml.reject { |
|
25
|
+
global_config = yaml.reject { |key| key == 'feeds' }
|
14
26
|
|
15
27
|
config = Config.new(feed_config, global_config)
|
16
28
|
feed(config)
|
17
29
|
end
|
18
30
|
|
19
31
|
##
|
20
|
-
# Returns
|
32
|
+
# Returns a RSS object which is generated from the provided config.
|
21
33
|
#
|
22
34
|
# `config`: can be a Hash or an instance of Html2rss::Config.
|
23
35
|
#
|
24
|
-
#
|
25
|
-
#
|
36
|
+
# Example:
|
37
|
+
#
|
38
|
+
# feed = Html2rss.feed(
|
26
39
|
# channel: { name: 'StackOverflow: Hot Network Questions', url: 'https://stackoverflow.com' },
|
27
40
|
# selectors: {
|
28
41
|
# items: { selector: '#hot-network-questions > ul > li' },
|
@@ -30,6 +43,8 @@ module Html2rss
|
|
30
43
|
# link: { selector: 'a', extractor: 'href' }
|
31
44
|
# }
|
32
45
|
# )
|
46
|
+
# # => #<RSS::Rss:0x00007fb2f48d14a0 ...>
|
47
|
+
# @return [RSS::Rss]
|
33
48
|
def self.feed(config)
|
34
49
|
config = Config.new(config) unless config.is_a?(Config)
|
35
50
|
|
@@ -5,10 +5,12 @@ require_relative 'attribute_post_processors/substring'
|
|
5
5
|
require_relative 'attribute_post_processors/template'
|
6
6
|
|
7
7
|
module Html2rss
|
8
|
+
##
|
9
|
+
# Provides a namespace for attribute post processors.
|
8
10
|
module AttributePostProcessors
|
9
|
-
def self.get_processor(
|
10
|
-
|
11
|
-
class_name = ['Html2rss', 'AttributePostProcessors',
|
11
|
+
def self.get_processor(name)
|
12
|
+
camel_cased_name = name.split('_').map(&:capitalize).join
|
13
|
+
class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_name].join('::')
|
12
14
|
|
13
15
|
Object.const_get(class_name)
|
14
16
|
end
|
@@ -1,12 +1,38 @@
|
|
1
|
+
require 'active_support'
|
2
|
+
require 'active_support/core_ext/time'
|
3
|
+
|
1
4
|
module Html2rss
|
2
5
|
module AttributePostProcessors
|
6
|
+
##
|
7
|
+
# Returns the {https://www.w3.org/Protocols/rfc822/ RFC822} representation of a time.
|
8
|
+
#
|
9
|
+
# Imagine this HTML structure:
|
10
|
+
#
|
11
|
+
# <p>Published on <span>2019-07-02</span></p>
|
12
|
+
#
|
13
|
+
# YAML usage example:
|
14
|
+
#
|
15
|
+
# selectors:
|
16
|
+
# description:
|
17
|
+
# selector: span
|
18
|
+
# post_process:
|
19
|
+
# name: 'parse_time'
|
20
|
+
#
|
21
|
+
# Would return:
|
22
|
+
# "Tue, 02 Jul 2019 00:00:00 +0200"
|
23
|
+
#
|
24
|
+
# It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
|
25
|
+
# The value is parsed in the time zone configured for the feed (the config's +time_zone+).
|
3
26
|
class ParseTime
|
4
|
-
def initialize(value,
|
5
|
-
@value = value
|
27
|
+
def initialize(value, env)
|
28
|
+
@value = value.to_s
|
29
|
+
@time_zone = env[:config].time_zone
|
6
30
|
end
|
7
31
|
|
32
|
+
##
|
33
|
+
# @return [String] rfc822 formatted time
|
8
34
|
def get
|
9
|
-
Time.parse(@value).rfc822
|
35
|
+
Time.use_zone(@time_zone) { Time.zone.parse(@value).rfc822 }
|
10
36
|
end
|
11
37
|
end
|
12
38
|
end
|
@@ -1,10 +1,29 @@
|
|
1
1
|
module Html2rss
|
2
2
|
module AttributePostProcessors
|
3
|
+
##
|
4
|
+
# Returns the URI as String.
|
5
|
+
#
|
6
|
+
# Imagine this HTML structure:
|
7
|
+
#
|
8
|
+
# <span>http://why-not-use-a-link.uh</span>
|
9
|
+
#
|
10
|
+
# YAML usage example:
|
11
|
+
#
|
12
|
+
# selectors:
|
13
|
+
# link:
|
14
|
+
# selector: span
|
15
|
+
# extractor: text
|
16
|
+
# post_process:
|
17
|
+
# name: parse_uri
|
18
|
+
# Would return:
|
19
|
+
# 'http://why-not-use-a-link.uh'
|
3
20
|
class ParseUri
|
4
|
-
def initialize(value,
|
21
|
+
def initialize(value, _env)
|
5
22
|
@value = value
|
6
23
|
end
|
7
24
|
|
25
|
+
##
|
26
|
+
# @return [String]
|
8
27
|
def get
|
9
28
|
URI(@value).to_s
|
10
29
|
end
|
@@ -2,21 +2,83 @@ require 'sanitize'
|
|
2
2
|
|
3
3
|
module Html2rss
|
4
4
|
module AttributePostProcessors
|
5
|
+
##
|
6
|
+
# Returns sanitized HTML code as String.
|
7
|
+
# Adds
|
8
|
+
#
|
9
|
+
# - rel="nofollow noopener noreferrer" to a elements
|
10
|
+
# - referrer-policy='no-referrer' to img elements
|
11
|
+
#
|
12
|
+
# Imagine this HTML structure:
|
13
|
+
#
|
14
|
+
# <section>
|
15
|
+
# Lorem <b>ipsum</b> dolor...
|
16
|
+
# <iframe src="https://evil.corp/miner"></iframe>
|
17
|
+
# <script>alert();</script>
|
18
|
+
# </section>
|
19
|
+
#
|
20
|
+
# YAML usage example:
|
21
|
+
#
|
22
|
+
# selectors:
|
23
|
+
# description:
|
24
|
+
# selector: section
|
25
|
+
# extractor: html
|
26
|
+
# post_process:
|
27
|
+
# name: sanitize_html
|
28
|
+
#
|
29
|
+
# Would return:
|
30
|
+
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
5
31
|
class SanitizeHtml
|
6
|
-
def initialize(value,
|
32
|
+
def initialize(value, env)
|
7
33
|
@value = value
|
34
|
+
@channel_url = env[:config].url
|
8
35
|
end
|
9
36
|
|
37
|
+
##
|
38
|
+
# - uses the {https://github.com/rgrove/sanitize sanitize gem}
|
39
|
+
# - uses the config {https://github.com/rgrove/sanitize#sanitizeconfigrelaxed Sanitize::Config::RELAXED}
|
40
|
+
# - adds rel="nofollow noopener noreferrer" to a elements
|
41
|
+
# - adds target="_blank" to a elements
|
42
|
+
# @return [String]
|
10
43
|
def get
|
11
44
|
Sanitize.fragment(@value, Sanitize::Config.merge(
|
12
45
|
Sanitize::Config::RELAXED,
|
46
|
+
attributes: {
|
47
|
+
all: %w[dir lang alt title translate]
|
48
|
+
},
|
13
49
|
add_attributes: {
|
14
50
|
'a' => {
|
15
51
|
'rel' => 'nofollow noopener noreferrer',
|
16
52
|
'target' => '_blank'
|
53
|
+
},
|
54
|
+
'img' => {
|
55
|
+
'referrer-policy' => 'no-referrer'
|
17
56
|
}
|
18
|
-
}
|
19
|
-
|
57
|
+
},
|
58
|
+
transformers: [transform_urls_to_absolute_ones]
|
59
|
+
)).to_s.split.join(' ')
|
60
|
+
end
|
61
|
+
|
62
|
+
private
|
63
|
+
|
64
|
+
URL_ELEMENTS_WITH_URL_ATTRIBUTE = {
|
65
|
+
'a' => :href,
|
66
|
+
'img' => :src
|
67
|
+
}.freeze
|
68
|
+
|
69
|
+
def transform_urls_to_absolute_ones
|
70
|
+
lambda do |env|
|
71
|
+
return unless URL_ELEMENTS_WITH_URL_ATTRIBUTE.key?(env[:node_name])
|
72
|
+
|
73
|
+
url_attribute = URL_ELEMENTS_WITH_URL_ATTRIBUTE[env[:node_name]]
|
74
|
+
url = env[:node][url_attribute]
|
75
|
+
|
76
|
+
return if URI(url).absolute?
|
77
|
+
|
78
|
+
absolute_url = Html2rss::Utils.build_absolute_url_from_relative(url, @channel_url)
|
79
|
+
|
80
|
+
env[:node][url_attribute] = absolute_url
|
81
|
+
end
|
20
82
|
end
|
21
83
|
end
|
22
84
|
end
|
@@ -1,13 +1,34 @@
|
|
1
1
|
module Html2rss
|
2
2
|
module AttributePostProcessors
|
3
|
+
## Returns a defined part of a String.
|
4
|
+
#
|
5
|
+
# The +end+ parameter can be omitted, in that case it will not cut the
|
6
|
+
# String at the end.
|
7
|
+
#
|
8
|
+
# Imagine this HTML:
|
9
|
+
# <h1>Foo bar and baz</h1>
|
10
|
+
#
|
11
|
+
# YAML usage example:
|
12
|
+
# selectors:
|
13
|
+
# title:
|
14
|
+
# selector: h1
|
15
|
+
# post_process:
|
16
|
+
# name: substring
|
17
|
+
# start: 4
|
18
|
+
# end: 6
|
19
|
+
#
|
20
|
+
# Would return:
|
21
|
+
# 'bar'
|
3
22
|
class Substring
|
4
|
-
def initialize(value,
|
23
|
+
def initialize(value, env)
|
5
24
|
@value = value
|
6
|
-
@options = options
|
25
|
+
@options = env[:options]
|
7
26
|
end
|
8
27
|
|
28
|
+
##
|
29
|
+
# @return [String]
|
9
30
|
def get
|
10
|
-
ending = @options.fetch('end',
|
31
|
+
ending = @options.fetch('end', @value.length).to_i
|
11
32
|
@value[@options['start'].to_i..ending]
|
12
33
|
end
|
13
34
|
end
|
@@ -2,13 +2,44 @@ require 'sanitize'
|
|
2
2
|
|
3
3
|
module Html2rss
|
4
4
|
module AttributePostProcessors
|
5
|
+
## Returns a formatted String according to the string pattern.
|
6
|
+
#
|
7
|
+
# If +self+ is given as a method, the extracted value will be used.
|
8
|
+
#
|
9
|
+
# Imagine this HTML:
|
10
|
+
# <li>
|
11
|
+
# <h1>Product</h1>
|
12
|
+
# <span class="price">23,42€</span>
|
13
|
+
# </li>
|
14
|
+
#
|
15
|
+
# YAML usage example:
|
16
|
+
#
|
17
|
+
# selectors:
|
18
|
+
# items:
|
19
|
+
# selector: 'li'
|
20
|
+
# price:
|
21
|
+
# selector: '.price'
|
22
|
+
# title:
|
23
|
+
# selector: h1
|
24
|
+
# post_process:
|
25
|
+
# name: template
|
26
|
+
# string: '%s (%s)'
|
27
|
+
# methods:
|
28
|
+
# - self
|
29
|
+
# - price
|
30
|
+
#
|
31
|
+
# Would return:
|
32
|
+
# 'Product (23,42€)'
|
5
33
|
class Template
|
6
|
-
def initialize(value,
|
34
|
+
def initialize(value, env)
|
7
35
|
@value = value
|
8
|
-
@options = options
|
9
|
-
@item = item
|
36
|
+
@options = env[:options]
|
37
|
+
@item = env[:item]
|
10
38
|
end
|
11
39
|
|
40
|
+
##
|
41
|
+
# - uses {http://ruby-doc.org/core-2.6.3/String.html#method-i-25 String#%}
|
42
|
+
# @return [String]
|
12
43
|
def get
|
13
44
|
string % methods
|
14
45
|
end
|
@@ -20,13 +51,9 @@ module Html2rss
|
|
20
51
|
end
|
21
52
|
|
22
53
|
def methods
|
23
|
-
@methods ||= @options['methods'].map
|
24
|
-
|
25
|
-
|
26
|
-
else
|
27
|
-
@item.send(method.to_sym)&.to_s
|
28
|
-
end
|
29
|
-
}
|
54
|
+
@methods ||= @options['methods'].map do |method|
|
55
|
+
method == 'self' ? @value.to_s : @item.public_send(method.to_sym).to_s
|
56
|
+
end
|
30
57
|
end
|
31
58
|
end
|
32
59
|
end
|