html2rss 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +18 -11
  3. data/.travis.yml +3 -3
  4. data/.yardopts +6 -0
  5. data/Gemfile.lock +23 -5
  6. data/README.md +2 -1
  7. data/docs/Html2rss.html +353 -0
  8. data/docs/Html2rss/AttributePostProcessors.html +203 -0
  9. data/docs/Html2rss/AttributePostProcessors/ParseTime.html +332 -0
  10. data/docs/Html2rss/AttributePostProcessors/ParseUri.html +314 -0
  11. data/docs/Html2rss/AttributePostProcessors/SanitizeHtml.html +346 -0
  12. data/docs/Html2rss/AttributePostProcessors/Substring.html +321 -0
  13. data/docs/Html2rss/AttributePostProcessors/Template.html +336 -0
  14. data/docs/Html2rss/Config.html +795 -0
  15. data/docs/Html2rss/FeedBuilder.html +295 -0
  16. data/docs/Html2rss/Item.html +654 -0
  17. data/docs/Html2rss/ItemExtractors.html +297 -0
  18. data/docs/Html2rss/ItemExtractors/Attribute.html +317 -0
  19. data/docs/Html2rss/ItemExtractors/CurrentTime.html +297 -0
  20. data/docs/Html2rss/ItemExtractors/Href.html +319 -0
  21. data/docs/Html2rss/ItemExtractors/Html.html +314 -0
  22. data/docs/Html2rss/ItemExtractors/Static.html +301 -0
  23. data/docs/Html2rss/ItemExtractors/Text.html +312 -0
  24. data/docs/Html2rss/Utils.html +115 -0
  25. data/docs/Html2rss/Utils/IndifferentAccessHash.html +142 -0
  26. data/docs/_index.html +300 -0
  27. data/docs/class_list.html +51 -0
  28. data/docs/css/common.css +1 -0
  29. data/docs/css/full_list.css +58 -0
  30. data/docs/css/style.css +496 -0
  31. data/docs/file.README.html +135 -0
  32. data/docs/file_list.html +56 -0
  33. data/docs/frames.html +17 -0
  34. data/docs/index.html +135 -0
  35. data/docs/js/app.js +303 -0
  36. data/docs/js/full_list.js +216 -0
  37. data/docs/js/jquery.js +4 -0
  38. data/docs/method_list.html +435 -0
  39. data/docs/top-level-namespace.html +110 -0
  40. data/html2rss.gemspec +3 -0
  41. data/lib/html2rss.rb +19 -4
  42. data/lib/html2rss/attribute_post_processors.rb +5 -3
  43. data/lib/html2rss/attribute_post_processors/parse_time.rb +29 -3
  44. data/lib/html2rss/attribute_post_processors/parse_uri.rb +20 -1
  45. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +65 -3
  46. data/lib/html2rss/attribute_post_processors/substring.rb +24 -3
  47. data/lib/html2rss/attribute_post_processors/template.rb +37 -10
  48. data/lib/html2rss/config.rb +11 -12
  49. data/lib/html2rss/feed_builder.rb +8 -6
  50. data/lib/html2rss/item.rb +28 -19
  51. data/lib/html2rss/item_extractors.rb +29 -0
  52. data/lib/html2rss/item_extractors/attribute.rb +37 -0
  53. data/lib/html2rss/item_extractors/current_time.rb +21 -0
  54. data/lib/html2rss/item_extractors/href.rb +36 -0
  55. data/lib/html2rss/item_extractors/html.rb +34 -0
  56. data/lib/html2rss/item_extractors/static.rb +28 -0
  57. data/lib/html2rss/item_extractors/text.rb +32 -0
  58. data/lib/html2rss/utils.rb +25 -0
  59. data/lib/html2rss/version.rb +1 -1
  60. metadata +88 -4
  61. data/lib/html2rss/item_extractor.rb +0 -37
@@ -0,0 +1,110 @@
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>
7
+ Top Level Namespace
8
+
9
+ &mdash; Documentation by YARD 0.9.20
10
+
11
+ </title>
12
+
13
+ <link rel="stylesheet" href="css/style.css" type="text/css" charset="utf-8" />
14
+
15
+ <link rel="stylesheet" href="css/common.css" type="text/css" charset="utf-8" />
16
+
17
+ <script type="text/javascript" charset="utf-8">
18
+ pathId = "";
19
+ relpath = '';
20
+ </script>
21
+
22
+
23
+ <script type="text/javascript" charset="utf-8" src="js/jquery.js"></script>
24
+
25
+ <script type="text/javascript" charset="utf-8" src="js/app.js"></script>
26
+
27
+
28
+ </head>
29
+ <body>
30
+ <div class="nav_wrap">
31
+ <iframe id="nav" src="class_list.html?1"></iframe>
32
+ <div id="resizer"></div>
33
+ </div>
34
+
35
+ <div id="main" tabindex="-1">
36
+ <div id="header">
37
+ <div id="menu">
38
+
39
+ <a href="_index.html">Index</a> &raquo;
40
+
41
+
42
+ <span class="title">Top Level Namespace</span>
43
+
44
+ </div>
45
+
46
+ <div id="search">
47
+
48
+ <a class="full_list_link" id="class_list_link"
49
+ href="class_list.html">
50
+
51
+ <svg width="24" height="24">
52
+ <rect x="0" y="4" width="24" height="4" rx="1" ry="1"></rect>
53
+ <rect x="0" y="12" width="24" height="4" rx="1" ry="1"></rect>
54
+ <rect x="0" y="20" width="24" height="4" rx="1" ry="1"></rect>
55
+ </svg>
56
+ </a>
57
+
58
+ </div>
59
+ <div class="clear"></div>
60
+ </div>
61
+
62
+ <div id="content"><h1>Top Level Namespace
63
+
64
+
65
+
66
+ </h1>
67
+ <div class="box_info">
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+ </div>
80
+
81
+ <h2>Defined Under Namespace</h2>
82
+ <p class="children">
83
+
84
+
85
+ <strong class="modules">Modules:</strong> <span class='object_link'><a href="Html2rss.html" title="Html2rss (module)">Html2rss</a></span>
86
+
87
+
88
+
89
+
90
+ </p>
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+ </div>
101
+
102
+ <div id="footer">
103
+ Generated on Sun Jul 14 19:35:05 2019 by
104
+ <a href="http://yardoc.org" title="Yay! A Ruby Documentation Tool" target="_parent">yard</a>
105
+ 0.9.20 (ruby-2.6.3).
106
+ </div>
107
+
108
+ </div>
109
+ </body>
110
+ </html>
@@ -27,6 +27,7 @@ Gem::Specification.new do |spec|
27
27
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
28
28
  spec.require_paths = ['lib']
29
29
 
30
+ spec.add_dependency 'activesupport', '~> 5.0'
30
31
  spec.add_dependency 'faraday', '~> 0.15'
31
32
  spec.add_dependency 'faraday_middleware', '~> 0.13'
32
33
  spec.add_dependency 'hashie', '~> 3.6'
@@ -37,6 +38,8 @@ Gem::Specification.new do |spec|
37
38
  spec.add_development_dependency 'rspec', '~> 3.0'
38
39
  spec.add_development_dependency 'rubocop'
39
40
  spec.add_development_dependency 'rubocop-performance'
41
+ spec.add_development_dependency 'rubocop-rspec'
40
42
  spec.add_development_dependency 'simplecov'
41
43
  spec.add_development_dependency 'vcr'
44
+ spec.add_development_dependency 'yard'
42
45
  end
@@ -1,28 +1,41 @@
1
1
  require 'html2rss/config'
2
2
  require 'html2rss/feed_builder'
3
3
  require 'html2rss/version'
4
+ require 'html2rss/utils'
4
5
  require 'yaml'
5
6
 
6
7
  module Html2rss
8
+ ##
9
+ # Returns an RSS object which is generated from the provided file.
10
+ #
11
+ # `file`: the path to a YAML config file (it is passed to File.open)
12
+ # `name`: the name of the feed
13
+ #
14
+ # Example:
15
+ #
16
+ # feed = Html2rss.feed_from_yaml_config(File.join(['spec', 'config.test.yml']), 'nuxt-releases')
17
+ # # => #<RSS::Rss:0x00007fb2f6331228
18
+ # @return [RSS::Rss]
7
19
  def self.feed_from_yaml_config(file, name)
8
20
  # rubocop:disable Security/YAMLLoad
9
21
  yaml = YAML.load(File.open(file))
10
22
  # rubocop:enable Security/YAMLLoad
11
23
 
12
24
  feed_config = yaml['feeds'][name]
13
- global_config = yaml.reject { |k| k == 'feeds' }
25
+ global_config = yaml.reject { |key| key == 'feeds' }
14
26
 
15
27
  config = Config.new(feed_config, global_config)
16
28
  feed(config)
17
29
  end
18
30
 
19
31
  ##
20
- # Returns the RSS object, which is generated from the provided config.
32
+ # Returns an RSS object which is generated from the provided config.
21
33
  #
22
34
  # `config`: can be a Hash or an instance of Html2rss::Config.
23
35
  #
24
- # = Example with a Ruby Hash
25
- # Html2rss.feed(
36
+ # Example:
37
+ #
38
+ # feed = Html2rss.feed(
26
39
  # channel: { name: 'StackOverflow: Hot Network Questions', url: 'https://stackoverflow.com' },
27
40
  # selectors: {
28
41
  # items: { selector: '#hot-network-questions > ul > li' },
@@ -30,6 +43,8 @@ module Html2rss
30
43
  # link: { selector: 'a', extractor: 'href' }
31
44
  # }
32
45
  # )
46
+ # # => #<RSS::Rss:0x00007fb2f48d14a0 ...>
47
+ # @return [RSS::Rss]
33
48
  def self.feed(config)
34
49
  config = Config.new(config) unless config.is_a?(Config)
35
50
 
@@ -5,10 +5,12 @@ require_relative 'attribute_post_processors/substring'
5
5
  require_relative 'attribute_post_processors/template'
6
6
 
7
7
  module Html2rss
8
+ ##
9
+ # Provides a namespace for attribute post processors.
8
10
  module AttributePostProcessors
9
- def self.get_processor(options)
10
- camel_cased_option = options['name'].split('_').collect(&:capitalize).join
11
- class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_option].join('::')
11
+ def self.get_processor(name)
12
+ camel_cased_name = name.split('_').map(&:capitalize).join
13
+ class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_name].join('::')
12
14
 
13
15
  Object.const_get(class_name)
14
16
  end
@@ -1,12 +1,38 @@
1
+ require 'active_support'
2
+ require 'active_support/core_ext/time'
3
+
1
4
  module Html2rss
2
5
  module AttributePostProcessors
6
+ ##
7
+ # Returns the {https://www.w3.org/Protocols/rfc822/ RFC822} representation of a time.
8
+ #
9
+ # Imagine this HTML structure:
10
+ #
11
+ # <p>Published on <span>2019-07-02</span></p>
12
+ #
13
+ # YAML usage example:
14
+ #
15
+ # selectors:
16
+ # description:
17
+ # selector: span
18
+ # post_process:
19
+ # name: 'parse_time'
20
+ #
21
+ # Would return:
22
+ # "Tue, 02 Jul 2019 00:00:00 +0200"
23
+ #
24
+ # It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
25
+ # The value is parsed within the time zone taken from the config (`time_zone`).
3
26
  class ParseTime
4
- def initialize(value, _options, _item)
5
- @value = value
27
+ def initialize(value, env)
28
+ @value = value.to_s
29
+ @time_zone = env[:config].time_zone
6
30
  end
7
31
 
32
+ ##
33
+ # @return [String] rfc822 formatted time
8
34
  def get
9
- Time.parse(@value).rfc822
35
+ Time.use_zone(@time_zone) { Time.zone.parse(@value).rfc822 }
10
36
  end
11
37
  end
12
38
  end
@@ -1,10 +1,29 @@
1
1
  module Html2rss
2
2
  module AttributePostProcessors
3
+ ##
4
+ # Returns the URI as String.
5
+ #
6
+ # Imagine this HTML structure:
7
+ #
8
+ # <span>http://why-not-use-a-link.uh</span>
9
+ #
10
+ # YAML usage example:
11
+ #
12
+ # selectors:
13
+ # link:
14
+ # selector: span
15
+ # extractor: text
16
+ # post_process:
17
+ # name: parse_uri
18
+ # Would return:
19
+ # 'http://why-not-use-a-link.uh'
3
20
  class ParseUri
4
- def initialize(value, _options, _item)
21
+ def initialize(value, _env)
5
22
  @value = value
6
23
  end
7
24
 
25
+ ##
26
+ # @return [String]
8
27
  def get
9
28
  URI(@value).to_s
10
29
  end
@@ -2,21 +2,83 @@ require 'sanitize'
2
2
 
3
3
  module Html2rss
4
4
  module AttributePostProcessors
5
+ ##
6
+ # Returns sanitized HTML code as String.
7
+ # Adds
8
+ #
9
+ # - rel="nofollow noopener noreferrer" to a elements
10
+ # - referrer-policy='no-referrer' to img elements
11
+ #
12
+ # Imagine this HTML structure:
13
+ #
14
+ # <section>
15
+ # Lorem <b>ipsum</b> dolor...
16
+ # <iframe src="https://evil.corp/miner"></iframe>
17
+ # <script>alert();</script>
18
+ # </section>
19
+ #
20
+ # YAML usage example:
21
+ #
22
+ # selectors:
23
+ # description:
24
+ # selector: section
25
+ # extractor: html
26
+ # post_process:
27
+ # name: sanitize_html
28
+ #
29
+ # Would return:
30
+ # '<p>Lorem <b>ipsum</b> dolor ...</p>'
5
31
  class SanitizeHtml
6
- def initialize(value, _options, _item)
32
+ def initialize(value, env)
7
33
  @value = value
34
+ @channel_url = env[:config].url
8
35
  end
9
36
 
37
+ ##
38
+ # - uses the {https://github.com/rgrove/sanitize sanitize gem}
39
+ # - uses the config {https://github.com/rgrove/sanitize#sanitizeconfigrelaxed Sanitize::Config::RELAXED}
40
+ # - adds rel="nofollow noopener noreferrer" to a elements
41
+ # - adds target="_blank" to a elements
42
+ # @return [String]
10
43
  def get
11
44
  Sanitize.fragment(@value, Sanitize::Config.merge(
12
45
  Sanitize::Config::RELAXED,
46
+ attributes: {
47
+ all: %w[dir lang alt title translate]
48
+ },
13
49
  add_attributes: {
14
50
  'a' => {
15
51
  'rel' => 'nofollow noopener noreferrer',
16
52
  'target' => '_blank'
53
+ },
54
+ 'img' => {
55
+ 'referrer-policy' => 'no-referrer'
17
56
  }
18
- }
19
- ))
57
+ },
58
+ transformers: [transform_urls_to_absolute_ones]
59
+ )).to_s.split.join(' ')
60
+ end
61
+
62
+ private
63
+
64
+ URL_ELEMENTS_WITH_URL_ATTRIBUTE = {
65
+ 'a' => :href,
66
+ 'img' => :src
67
+ }.freeze
68
+
69
+ def transform_urls_to_absolute_ones
70
+ lambda do |env|
71
+ return unless URL_ELEMENTS_WITH_URL_ATTRIBUTE.key?(env[:node_name])
72
+
73
+ url_attribute = URL_ELEMENTS_WITH_URL_ATTRIBUTE[env[:node_name]]
74
+ url = env[:node][url_attribute]
75
+
76
+ return if URI(url).absolute?
77
+
78
+ absolute_url = Html2rss::Utils.build_absolute_url_from_relative(url, @channel_url)
79
+
80
+ env[:node][url_attribute] = absolute_url
81
+ end
20
82
  end
21
83
  end
22
84
  end
@@ -1,13 +1,34 @@
1
1
  module Html2rss
2
2
  module AttributePostProcessors
3
+ ## Returns a defined part of a String.
4
+ #
5
+ # The +end+ parameter can be omitted, in that case it will not cut the
6
+ # String at the end.
7
+ #
8
+ # Imagine this HTML:
9
+ # <h1>Foo bar and baz</h1>
10
+ #
11
+ # YAML usage example:
12
+ # selectors:
13
+ # title:
14
+ # selector: h1
15
+ # post_process:
16
+ # name: substring
17
+ # start: 4
18
+ # end: 6
19
+ #
20
+ # Would return:
21
+ # 'bar'
3
22
  class Substring
4
- def initialize(value, options, _item)
23
+ def initialize(value, env)
5
24
  @value = value
6
- @options = options
25
+ @options = env[:options]
7
26
  end
8
27
 
28
+ ##
29
+ # @return [String]
9
30
  def get
10
- ending = @options.fetch('end', false) ? @options['end'].to_i : @value.length
31
+ ending = @options.fetch('end', @value.length).to_i
11
32
  @value[@options['start'].to_i..ending]
12
33
  end
13
34
  end
@@ -2,13 +2,44 @@ require 'sanitize'
2
2
 
3
3
  module Html2rss
4
4
  module AttributePostProcessors
5
+ ## Returns a formatted String according to the string pattern.
6
+ #
7
+ # If +self+ is given as a method, the extracted value will be used.
8
+ #
9
+ # Imagine this HTML:
10
+ # <li>
11
+ # <h1>Product</h1>
12
+ # <span class="price">23,42€</span>
13
+ # </li>
14
+ #
15
+ # YAML usage example:
16
+ #
17
+ # selectors:
18
+ # items:
19
+ # selector: 'li'
20
+ # price:
21
+ # selector: '.price'
22
+ # title:
23
+ # selector: h1
24
+ # post_process:
25
+ # name: template
26
+ # string: '%s (%s)'
27
+ # methods:
28
+ # - self
29
+ # - price
30
+ #
31
+ # Would return:
32
+ # 'Product (23,42€)'
5
33
  class Template
6
- def initialize(value, options, item)
34
+ def initialize(value, env)
7
35
  @value = value
8
- @options = options
9
- @item = item
36
+ @options = env[:options]
37
+ @item = env[:item]
10
38
  end
11
39
 
40
+ ##
41
+ # - uses {http://ruby-doc.org/core-2.6.3/String.html#method-i-25 String#%}
42
+ # @return [String]
12
43
  def get
13
44
  string % methods
14
45
  end
@@ -20,13 +51,9 @@ module Html2rss
20
51
  end
21
52
 
22
53
  def methods
23
- @methods ||= @options['methods'].map { |method|
24
- if method == 'self'
25
- @value
26
- else
27
- @item.send(method.to_sym)&.to_s
28
- end
29
- }
54
+ @methods ||= @options['methods'].map do |method|
55
+ method == 'self' ? @value.to_s : @item.public_send(method.to_sym).to_s
56
+ end
30
57
  end
31
58
  end
32
59
  end