html2rss 0.3.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +18 -11
  3. data/.travis.yml +3 -3
  4. data/.yardopts +6 -0
  5. data/Gemfile.lock +23 -5
  6. data/README.md +2 -1
  7. data/docs/Html2rss.html +353 -0
  8. data/docs/Html2rss/AttributePostProcessors.html +203 -0
  9. data/docs/Html2rss/AttributePostProcessors/ParseTime.html +332 -0
  10. data/docs/Html2rss/AttributePostProcessors/ParseUri.html +314 -0
  11. data/docs/Html2rss/AttributePostProcessors/SanitizeHtml.html +346 -0
  12. data/docs/Html2rss/AttributePostProcessors/Substring.html +321 -0
  13. data/docs/Html2rss/AttributePostProcessors/Template.html +336 -0
  14. data/docs/Html2rss/Config.html +795 -0
  15. data/docs/Html2rss/FeedBuilder.html +295 -0
  16. data/docs/Html2rss/Item.html +654 -0
  17. data/docs/Html2rss/ItemExtractors.html +297 -0
  18. data/docs/Html2rss/ItemExtractors/Attribute.html +317 -0
  19. data/docs/Html2rss/ItemExtractors/CurrentTime.html +297 -0
  20. data/docs/Html2rss/ItemExtractors/Href.html +319 -0
  21. data/docs/Html2rss/ItemExtractors/Html.html +314 -0
  22. data/docs/Html2rss/ItemExtractors/Static.html +301 -0
  23. data/docs/Html2rss/ItemExtractors/Text.html +312 -0
  24. data/docs/Html2rss/Utils.html +115 -0
  25. data/docs/Html2rss/Utils/IndifferentAccessHash.html +142 -0
  26. data/docs/_index.html +300 -0
  27. data/docs/class_list.html +51 -0
  28. data/docs/css/common.css +1 -0
  29. data/docs/css/full_list.css +58 -0
  30. data/docs/css/style.css +496 -0
  31. data/docs/file.README.html +135 -0
  32. data/docs/file_list.html +56 -0
  33. data/docs/frames.html +17 -0
  34. data/docs/index.html +135 -0
  35. data/docs/js/app.js +303 -0
  36. data/docs/js/full_list.js +216 -0
  37. data/docs/js/jquery.js +4 -0
  38. data/docs/method_list.html +435 -0
  39. data/docs/top-level-namespace.html +110 -0
  40. data/html2rss.gemspec +3 -0
  41. data/lib/html2rss.rb +19 -4
  42. data/lib/html2rss/attribute_post_processors.rb +5 -3
  43. data/lib/html2rss/attribute_post_processors/parse_time.rb +29 -3
  44. data/lib/html2rss/attribute_post_processors/parse_uri.rb +20 -1
  45. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +65 -3
  46. data/lib/html2rss/attribute_post_processors/substring.rb +24 -3
  47. data/lib/html2rss/attribute_post_processors/template.rb +37 -10
  48. data/lib/html2rss/config.rb +11 -12
  49. data/lib/html2rss/feed_builder.rb +8 -6
  50. data/lib/html2rss/item.rb +28 -19
  51. data/lib/html2rss/item_extractors.rb +29 -0
  52. data/lib/html2rss/item_extractors/attribute.rb +37 -0
  53. data/lib/html2rss/item_extractors/current_time.rb +21 -0
  54. data/lib/html2rss/item_extractors/href.rb +36 -0
  55. data/lib/html2rss/item_extractors/html.rb +34 -0
  56. data/lib/html2rss/item_extractors/static.rb +28 -0
  57. data/lib/html2rss/item_extractors/text.rb +32 -0
  58. data/lib/html2rss/utils.rb +25 -0
  59. data/lib/html2rss/version.rb +1 -1
  60. metadata +88 -4
  61. data/lib/html2rss/item_extractor.rb +0 -37
@@ -0,0 +1,110 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>
7
+ Top Level Namespace
8
+
9
+ &mdash; Documentation by YARD 0.9.20
10
+
11
+ </title>
12
+
13
+ <link rel="stylesheet" href="css/style.css" type="text/css" charset="utf-8" />
14
+
15
+ <link rel="stylesheet" href="css/common.css" type="text/css" charset="utf-8" />
16
+
17
+ <script type="text/javascript" charset="utf-8">
18
+ pathId = "";
19
+ relpath = '';
20
+ </script>
21
+
22
+
23
+ <script type="text/javascript" charset="utf-8" src="js/jquery.js"></script>
24
+
25
+ <script type="text/javascript" charset="utf-8" src="js/app.js"></script>
26
+
27
+
28
+ </head>
29
+ <body>
30
+ <div class="nav_wrap">
31
+ <iframe id="nav" src="class_list.html?1"></iframe>
32
+ <div id="resizer"></div>
33
+ </div>
34
+
35
+ <div id="main" tabindex="-1">
36
+ <div id="header">
37
+ <div id="menu">
38
+
39
+ <a href="_index.html">Index</a> &raquo;
40
+
41
+
42
+ <span class="title">Top Level Namespace</span>
43
+
44
+ </div>
45
+
46
+ <div id="search">
47
+
48
+ <a class="full_list_link" id="class_list_link"
49
+ href="class_list.html">
50
+
51
+ <svg width="24" height="24">
52
+ <rect x="0" y="4" width="24" height="4" rx="1" ry="1"></rect>
53
+ <rect x="0" y="12" width="24" height="4" rx="1" ry="1"></rect>
54
+ <rect x="0" y="20" width="24" height="4" rx="1" ry="1"></rect>
55
+ </svg>
56
+ </a>
57
+
58
+ </div>
59
+ <div class="clear"></div>
60
+ </div>
61
+
62
+ <div id="content"><h1>Top Level Namespace
63
+
64
+
65
+
66
+ </h1>
67
+ <div class="box_info">
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+ </div>
80
+
81
+ <h2>Defined Under Namespace</h2>
82
+ <p class="children">
83
+
84
+
85
+ <strong class="modules">Modules:</strong> <span class='object_link'><a href="Html2rss.html" title="Html2rss (module)">Html2rss</a></span>
86
+
87
+
88
+
89
+
90
+ </p>
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+ </div>
101
+
102
+ <div id="footer">
103
+ Generated on Sun Jul 14 19:35:05 2019 by
104
+ <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
105
+ 0.9.20 (ruby-2.6.3).
106
+ </div>
107
+
108
+ </div>
109
+ </body>
110
+ </html>
@@ -27,6 +27,7 @@ Gem::Specification.new do |spec|
27
27
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
28
  spec.require_paths = ['lib']
29
29
 
30
+ spec.add_dependency 'activesupport', '~> 5.0'
30
31
  spec.add_dependency 'faraday', '~> 0.15'
31
32
  spec.add_dependency 'faraday_middleware', '~> 0.13'
32
33
  spec.add_dependency 'hashie', '~> 3.6'
@@ -37,6 +38,8 @@ Gem::Specification.new do |spec|
37
38
  spec.add_development_dependency 'rspec', '~> 3.0'
38
39
  spec.add_development_dependency 'rubocop'
39
40
  spec.add_development_dependency 'rubocop-performance'
41
+ spec.add_development_dependency 'rubocop-rspec'
40
42
  spec.add_development_dependency 'simplecov'
41
43
  spec.add_development_dependency 'vcr'
44
+ spec.add_development_dependency 'yard'
42
45
  end
@@ -1,28 +1,41 @@
1
1
  require 'html2rss/config'
2
2
  require 'html2rss/feed_builder'
3
3
  require 'html2rss/version'
4
+ require 'html2rss/utils'
4
5
  require 'yaml'
5
6
 
6
7
  module Html2rss
8
+ ##
9
+ # Returns an RSS object which is generated from the provided file.
10
+ #
11
+ # `file`: the path to a YAML config file (it is passed to File.open)
12
+ # `name`: the name of the feed
13
+ #
14
+ # Example:
15
+ #
16
+ # feed = Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')
17
+ # # => #<RSS::Rss:0x00007fb2f6331228 ...>
18
+ # @return [RSS::Rss]
7
19
  def self.feed_from_yaml_config(file, name)
8
20
  # rubocop:disable Security/YAMLLoad
9
21
  yaml = YAML.load(File.open(file))
10
22
  # rubocop:enable Security/YAMLLoad
11
23
 
12
24
  feed_config = yaml['feeds'][name]
13
- global_config = yaml.reject { |k| k == 'feeds' }
25
+ global_config = yaml.reject { |key| key == 'feeds' }
14
26
 
15
27
  config = Config.new(feed_config, global_config)
16
28
  feed(config)
17
29
  end
18
30
 
19
31
  ##
20
- # Returns the RSS object, which is generated from the provided config.
32
+ # Returns an RSS object which is generated from the provided config.
21
33
  #
22
34
  # `config`: can be a Hash or an instance of Html2rss::Config.
23
35
  #
24
- # = Example with a Ruby Hash
25
- # Html2rss.feed(
36
+ # Example:
37
+ #
38
+ # feed = Html2rss.feed(
26
39
  # channel: { name: 'StackOverflow: Hot Network Questions', url: 'https://stackoverflow.com' },
27
40
  # selectors: {
28
41
  # items: { selector: '#hot-network-questions > ul > li' },
@@ -30,6 +43,8 @@ module Html2rss
30
43
  # link: { selector: 'a', extractor: 'href' }
31
44
  # }
32
45
  # )
46
+ # # => #<RSS::Rss:0x00007fb2f48d14a0 ...>
47
+ # @return [RSS::Rss]
33
48
  def self.feed(config)
34
49
  config = Config.new(config) unless config.is_a?(Config)
35
50
 
@@ -5,10 +5,12 @@ require_relative 'attribute_post_processors/substring'
5
5
  require_relative 'attribute_post_processors/template'
6
6
 
7
7
  module Html2rss
8
+ ##
9
+ # Provides a namespace for attribute post processors.
8
10
  module AttributePostProcessors
9
- def self.get_processor(options)
10
- camel_cased_option = options['name'].split('_').collect(&:capitalize).join
11
- class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_option].join('::')
11
+ def self.get_processor(name)
12
+ camel_cased_name = name.split('_').map(&:capitalize).join
13
+ class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_name].join('::')
12
14
 
13
15
  Object.const_get(class_name)
14
16
  end
@@ -1,12 +1,38 @@
1
+ require 'active_support'
2
+ require 'active_support/core_ext/time'
3
+
1
4
  module Html2rss
2
5
  module AttributePostProcessors
6
+ ##
7
+ # Returns the {https://www.w3.org/Protocols/rfc822/ RFC822} representation of a time.
8
+ #
9
+ # Imagine this HTML structure:
10
+ #
11
+ # <p>Published on <span>2019-07-02</span></p>
12
+ #
13
+ # YAML usage example:
14
+ #
15
+ # selectors:
16
+ # description:
17
+ # selector: span
18
+ # post_process:
19
+ # name: 'parse_time'
20
+ #
21
+ # Would return:
22
+ # "Tue, 02 Jul 2019 00:00:00 +0200"
23
+ #
24
+ # It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
25
+ # The value is parsed in the time zone taken from the feed config (`config.time_zone`).
3
26
  class ParseTime
4
- def initialize(value, _options, _item)
5
- @value = value
27
+ def initialize(value, env)
28
+ @value = value.to_s
29
+ @time_zone = env[:config].time_zone
6
30
  end
7
31
 
32
+ ##
33
+ # @return [String] rfc822 formatted time
8
34
  def get
9
- Time.parse(@value).rfc822
35
+ Time.use_zone(@time_zone) { Time.zone.parse(@value).rfc822 }
10
36
  end
11
37
  end
12
38
  end
@@ -1,10 +1,29 @@
1
1
  module Html2rss
2
2
  module AttributePostProcessors
3
+ ##
4
+ # Returns the URI as String.
5
+ #
6
+ # Imagine this HTML structure:
7
+ #
8
+ # <span>http://why-not-use-a-link.uh</span>
9
+ #
10
+ # YAML usage example:
11
+ #
12
+ # selectors:
13
+ # link:
14
+ # selector: span
15
+ # extractor: text
16
+ # post_process:
17
+ # name: parse_uri
18
+ # Would return:
19
+ # 'http://why-not-use-a-link.uh'
3
20
  class ParseUri
4
- def initialize(value, _options, _item)
21
+ def initialize(value, _env)
5
22
  @value = value
6
23
  end
7
24
 
25
+ ##
26
+ # @return [String]
8
27
  def get
9
28
  URI(@value).to_s
10
29
  end
@@ -2,21 +2,83 @@ require 'sanitize'
2
2
 
3
3
  module Html2rss
4
4
  module AttributePostProcessors
5
+ ##
6
+ # Returns sanitized HTML code as String.
7
+ # Adds
8
+ #
9
+ # - rel="nofollow noopener noreferrer" to a elements
10
+ # - referrer-policy='no-referrer' to img elements
11
+ #
12
+ # Imagine this HTML structure:
13
+ #
14
+ # <section>
15
+ # Lorem <b>ipsum</b> dolor...
16
+ # <iframe src="https://evil.corp/miner"></iframe>
17
+ # <script>alert();</script>
18
+ # </section>
19
+ #
20
+ # YAML usage example:
21
+ #
22
+ # selectors:
23
+ # description:
24
+ # selector: section
25
+ # extractor: html
26
+ # post_process:
27
+ # name: sanitize_html
28
+ #
29
+ # Would return:
30
+ # '<p>Lorem <b>ipsum</b> dolor ...</p>'
5
31
  class SanitizeHtml
6
- def initialize(value, _options, _item)
32
+ def initialize(value, env)
7
33
  @value = value
34
+ @channel_url = env[:config].url
8
35
  end
9
36
 
37
+ ##
38
+ # - uses the {https://github.com/rgrove/sanitize sanitize gem}
39
+ # - uses the config {https://github.com/rgrove/sanitize#sanitizeconfigrelaxed Sanitize::Config::RELAXED}
40
+ # - adds rel="nofollow noopener noreferrer" to a elements
41
+ # - adds target="_blank" to a elements
42
+ # @return [String]
10
43
  def get
11
44
  Sanitize.fragment(@value, Sanitize::Config.merge(
12
45
  Sanitize::Config::RELAXED,
46
+ attributes: {
47
+ all: %w[dir lang alt title translate]
48
+ },
13
49
  add_attributes: {
14
50
  'a' => {
15
51
  'rel' => 'nofollow noopener noreferrer',
16
52
  'target' => '_blank'
53
+ },
54
+ 'img' => {
55
+ 'referrer-policy' => 'no-referrer'
17
56
  }
18
- }
19
- ))
57
+ },
58
+ transformers: [transform_urls_to_absolute_ones]
59
+ )).to_s.split.join(' ')
60
+ end
61
+
62
+ private
63
+
64
+ URL_ELEMENTS_WITH_URL_ATTRIBUTE = {
65
+ 'a' => :href,
66
+ 'img' => :src
67
+ }.freeze
68
+
69
+ def transform_urls_to_absolute_ones
70
+ lambda do |env|
71
+ return unless URL_ELEMENTS_WITH_URL_ATTRIBUTE.key?(env[:node_name])
72
+
73
+ url_attribute = URL_ELEMENTS_WITH_URL_ATTRIBUTE[env[:node_name]]
74
+ url = env[:node][url_attribute]
75
+
76
+ return if URI(url).absolute?
77
+
78
+ absolute_url = Html2rss::Utils.build_absolute_url_from_relative(url, @channel_url)
79
+
80
+ env[:node][url_attribute] = absolute_url
81
+ end
20
82
  end
21
83
  end
22
84
  end
@@ -1,13 +1,34 @@
1
1
  module Html2rss
2
2
  module AttributePostProcessors
3
+ ## Returns a defined part of a String.
4
+ #
5
+ # The +end+ parameter can be omitted, in that case it will not cut the
6
+ # String at the end.
7
+ #
8
+ # Imagine this HTML:
9
+ # <h1>Foo bar and baz</h1>
10
+ #
11
+ # YAML usage example:
12
+ # selectors:
13
+ # title:
14
+ # selector: h1
15
+ # post_process:
16
+ # name: substring
17
+ # start: 4
18
+ # end: 6
19
+ #
20
+ # Would return:
21
+ # 'bar'
3
22
  class Substring
4
- def initialize(value, options, _item)
23
+ def initialize(value, env)
5
24
  @value = value
6
- @options = options
25
+ @options = env[:options]
7
26
  end
8
27
 
28
+ ##
29
+ # @return [String]
9
30
  def get
10
- ending = @options.fetch('end', false) ? @options['end'].to_i : @value.length
31
+ ending = @options.fetch('end', @value.length).to_i
11
32
  @value[@options['start'].to_i..ending]
12
33
  end
13
34
  end
@@ -2,13 +2,44 @@ require 'sanitize'
2
2
 
3
3
  module Html2rss
4
4
  module AttributePostProcessors
5
+ ## Returns a formatted String according to the string pattern.
6
+ #
7
+ # If +self+ is given as a method, the extracted value will be used.
8
+ #
9
+ # Imagine this HTML:
10
+ # <li>
11
+ # <h1>Product</h1>
12
+ # <span class="price">23,42€</span>
13
+ # </li>
14
+ #
15
+ # YAML usage example:
16
+ #
17
+ # selectors:
18
+ # items:
19
+ # selector: 'li'
20
+ # price:
21
+ # selector: '.price'
22
+ # title:
23
+ # selector: h1
24
+ # post_process:
25
+ # name: template
26
+ # string: '%s (%s)'
27
+ # methods:
28
+ # - self
29
+ # - price
30
+ #
31
+ # Would return:
32
+ # 'Product (23,42€)'
5
33
  class Template
6
- def initialize(value, options, item)
34
+ def initialize(value, env)
7
35
  @value = value
8
- @options = options
9
- @item = item
36
+ @options = env[:options]
37
+ @item = env[:item]
10
38
  end
11
39
 
40
+ ##
41
+ # - uses {http://ruby-doc.org/core-2.6.3/String.html#method-i-25 String#%}
42
+ # @return [String]
12
43
  def get
13
44
  string % methods
14
45
  end
@@ -20,13 +51,9 @@ module Html2rss
20
51
  end
21
52
 
22
53
  def methods
23
- @methods ||= @options['methods'].map { |method|
24
- if method == 'self'
25
- @value
26
- else
27
- @item.send(method.to_sym)&.to_s
28
- end
29
- }
54
+ @methods ||= @options['methods'].map do |method|
55
+ method == 'self' ? @value.to_s : @item.public_send(method.to_sym).to_s
56
+ end
30
57
  end
31
58
  end
32
59
  end