html2rss 0.6.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,7 +12,7 @@ Gem::Specification.new do |spec|
12
12
  spec.description = 'Give the URL to scrape and some CSS selectors. Get a RSS::Rss instance in return.'
13
13
  spec.homepage = 'https://github.com/gildesmarais/html2rss'
14
14
  spec.license = 'MIT'
15
- spec.required_ruby_version = '>= 2.4.0'
15
+ spec.required_ruby_version = '>= 2.5.0'
16
16
 
17
17
  if spec.respond_to?(:metadata)
18
18
  spec.metadata['allowed_push_host'] = 'https://rubygems.org'
@@ -29,15 +29,19 @@ Gem::Specification.new do |spec|
29
29
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
30
30
  spec.require_paths = ['lib']
31
31
 
32
- spec.add_dependency 'activesupport', '~> 5.0'
32
+ spec.add_dependency 'activesupport', '>= 5', '< 7'
33
+ spec.add_dependency 'addressable', '~> 2.7'
33
34
  spec.add_dependency 'builder'
34
- spec.add_dependency 'faraday', '~> 0.15'
35
- spec.add_dependency 'faraday_middleware', '~> 0.13'
36
- spec.add_dependency 'hashie', '~> 3.6'
35
+ spec.add_dependency 'faraday', '~> 1.0'
36
+ spec.add_dependency 'faraday_middleware'
37
+ spec.add_dependency 'kramdown'
38
+ spec.add_dependency 'mime-types', '> 3.0'
37
39
  spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
38
- spec.add_dependency 'reverse_markdown', '~> 1.3'
40
+ spec.add_dependency 'reverse_markdown', '~> 2.0'
39
41
  spec.add_dependency 'sanitize', '~> 5.0'
40
- spec.add_development_dependency 'bundler', '~> 1.16'
42
+ spec.add_dependency 'to_regexp'
43
+ spec.add_dependency 'zeitwerk'
44
+ spec.add_development_dependency 'bundler'
41
45
  spec.add_development_dependency 'byebug'
42
46
  spec.add_development_dependency 'rspec', '~> 3.0'
43
47
  spec.add_development_dependency 'rubocop'
@@ -1,7 +1,8 @@
1
- require 'html2rss/config'
2
- require 'html2rss/feed_builder'
3
- require 'html2rss/version'
4
- require 'html2rss/utils'
1
+ require 'zeitwerk'
2
+
3
+ loader = Zeitwerk::Loader.for_gem
4
+ loader.setup
5
+
5
6
  require 'yaml'
6
7
 
7
8
  ##
@@ -1,19 +1,13 @@
1
- require_relative 'attribute_post_processors/html_to_markdown'
2
- require_relative 'attribute_post_processors/parse_time'
3
- require_relative 'attribute_post_processors/parse_uri'
4
- require_relative 'attribute_post_processors/sanitize_html'
5
- require_relative 'attribute_post_processors/substring'
6
- require_relative 'attribute_post_processors/template'
7
-
8
1
  module Html2rss
9
2
  ##
10
3
  # Provides a namespace for attribute post processors.
11
4
  module AttributePostProcessors
12
5
  def self.get_processor(name)
13
- camel_cased_name = name.split('_').map(&:capitalize).join
14
- class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_name].join('::')
6
+ @get_processor ||= Hash.new do |processors, key|
7
+ processors[key] = Utils.get_class_from_name(key, 'AttributePostProcessors')
8
+ end
15
9
 
16
- Object.const_get(class_name)
10
+ @get_processor[name]
17
11
  end
18
12
  end
19
13
  end
@@ -0,0 +1,42 @@
1
+ require 'to_regexp'
2
+
3
+ module Html2rss
4
+ module AttributePostProcessors
5
+ ##
6
+ #
7
+ # Imagine this HTML:
8
+ # <h1>Foo bar and boo</h1>
9
+ #
10
+ # YAML usage example:
11
+ # selectors:
12
+ # title:
13
+ # selector: h1
14
+ # post_process:
15
+ # name: gsub
16
+ # pattern: boo
17
+ # replacement: baz
18
+ #
19
+ # Would return:
20
+ # 'Foo bar and baz'
21
+ #
22
+ # `pattern` can be a Regexp or a String.
23
+ #
24
+ # `replacement` can be a String or a Hash.
25
+ #
26
+ # See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
27
+ class Gsub
28
+ def initialize(value, env)
29
+ @value = value
30
+ options = env[:options]
31
+ @pattern = options[:pattern].to_regexp || options[:pattern]
32
+ @replacement = options[:replacement]
33
+ end
34
+
35
+ ##
36
+ # @return [String]
37
+ def get
38
+ @value.to_s.gsub(@pattern, @replacement)
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,45 @@
1
+ require 'kramdown'
2
+
3
+ module Html2rss
4
+ module AttributePostProcessors
5
+ ##
6
+ # Generates HTML from Markdown.
7
+ #
8
+ # It's particularly useful in conjunction with the Template post processor
9
+ # to generate a description from other selectors.
10
+ #
11
+ # YAML usage example:
12
+ #
13
+ # selectors:
14
+ # description:
15
+ # selector: section
16
+ # post_process:
17
+ # - name: template
18
+ # string: |
19
+ # # %s
20
+ #
21
+ # Price: %s
22
+ # methods:
23
+ # - self
24
+ # - price
25
+ # - name: markdown_to_html
26
+ #
27
+ # Would e.g. return:
28
+ #
29
+ # <h1>Section</h1>
30
+ #
31
+ # <p>Price: 12.34</p>
32
+ class MarkdownToHtml
33
+ def initialize(value, env)
34
+ @value = value
35
+ @env = env
36
+ end
37
+
38
+ ##
39
+ # @return [String] sanitized HTML generated from the Markdown input
40
+ def get
41
+ SanitizeHtml.new(Kramdown::Document.new(@value).to_html, @env).get
42
+ end
43
+ end
44
+ end
45
+ end
@@ -17,12 +17,12 @@ module Html2rss
17
17
  # selector: span
18
18
  # post_process:
19
19
  # name: 'parse_time'
20
+ # time_zone: 'Europe/Berlin'
20
21
  #
21
22
  # Would return:
22
23
  # "Tue, 02 Jul 2019 00:00:00 +0200"
23
24
  #
24
25
  # It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
25
- # As of now it ignores time zones and always falls back to the UTC time zone.
26
26
  class ParseTime
27
27
  def initialize(value, env)
28
28
  @value = value.to_s
@@ -5,7 +5,7 @@ module Html2rss
5
5
  #
6
6
  # Imagine this HTML structure:
7
7
  #
8
- # <span>http://why-not-use-a-link.uh</span>
8
+ # <span>http://why-not-use-a-link.uh </span>
9
9
  #
10
10
  # YAML usage example:
11
11
  #
@@ -15,6 +15,7 @@ module Html2rss
15
15
  # extractor: text
16
16
  # post_process:
17
17
  # name: parse_uri
18
+ #
18
19
  # Would return:
19
20
  # 'http://why-not-use-a-link.uh'
20
21
  class ParseUri
@@ -25,7 +26,7 @@ module Html2rss
25
26
  ##
26
27
  # @return [String]
27
28
  def get
28
- URI(@value).to_s
29
+ URI(Html2rss::Utils.sanitize_url(@value)).to_s
29
30
  end
30
31
  end
31
32
  end
@@ -4,10 +4,16 @@ module Html2rss
4
4
  module AttributePostProcessors
5
5
  ##
6
6
  # Returns sanitized HTML code as String.
7
- # Adds
8
7
  #
9
- # - rel="nofollow noopener noreferrer" to a elements
10
- # - referrer-policy='no-referrer' to img elements
8
+ # It adds:
9
+ #
10
+ # - `rel="nofollow noopener noreferrer"` to <a> tags
11
+ # - `referrer-policy='no-referrer'` to <img> tags
12
+ #
13
+ # It also:
14
+ #
15
+ # - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
16
+ # linking to the <img>'s `src`.
11
17
  #
12
18
  # Imagine this HTML structure:
13
19
  #
@@ -21,7 +27,7 @@ module Html2rss
21
27
  #
22
28
  # selectors:
23
29
  # description:
24
- # selector: section
30
+ # selector: '.section'
25
31
  # extractor: html
26
32
  # post_process:
27
33
  # name: sanitize_html
@@ -29,6 +35,9 @@ module Html2rss
29
35
  # Would return:
30
36
  # '<p>Lorem <b>ipsum</b> dolor ...</p>'
31
37
  class SanitizeHtml
38
+ URL_ELEMENTS_WITH_URL_ATTRIBUTE = { 'a' => :href, 'img' => :src }.freeze
39
+ private_constant :URL_ELEMENTS_WITH_URL_ATTRIBUTE
40
+
32
41
  def initialize(value, env)
33
42
  @value = value
34
43
  @channel_url = env[:config].url
@@ -41,28 +50,22 @@ module Html2rss
41
50
  # - adds target="_blank" to a elements
42
51
  # @return [String]
43
52
  def get
44
- Sanitize.fragment(@value, Sanitize::Config.merge(
45
- Sanitize::Config::RELAXED,
46
- attributes: { all: %w[dir lang alt title translate] },
47
- add_attributes: {
48
- 'a' => {
49
- 'rel' => 'nofollow noopener noreferrer',
50
- 'target' => '_blank'
51
- },
52
- 'img' => {
53
- 'referrer-policy' => 'no-referrer'
54
- }
55
- },
56
- transformers: [transform_urls_to_absolute_ones]
57
- )).to_s.split.join(' ')
53
+ Sanitize.fragment(@value, sanitize_config).to_s.split.join(' ')
58
54
  end
59
55
 
60
56
  private
61
57
 
62
- URL_ELEMENTS_WITH_URL_ATTRIBUTE = {
63
- 'a' => :href,
64
- 'img' => :src
65
- }.freeze
58
+ def sanitize_config
59
+ Sanitize::Config.merge(
60
+ Sanitize::Config::RELAXED,
61
+ attributes: { all: %w[dir lang alt title translate] },
62
+ add_attributes: {
63
+ 'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
64
+ 'img' => { 'referrer-policy' => 'no-referrer' }
65
+ },
66
+ transformers: [transform_urls_to_absolute_ones, wrap_img_in_a]
67
+ )
68
+ end
66
69
 
67
70
  def transform_urls_to_absolute_ones
68
71
  lambda do |env|
@@ -78,6 +81,23 @@ module Html2rss
78
81
  env[:node][url_attribute] = absolute_url
79
82
  end
80
83
  end
84
+
85
+ def wrap_img_in_a
86
+ lambda do |env|
87
+ return if env[:node_name] != 'img'
88
+
89
+ img = env[:node]
90
+
91
+ return if img.parent.name == 'a'
92
+
93
+ anchor = Nokogiri::XML::Node.new('a', img)
94
+ anchor[:href] = img[:src]
95
+
96
+ anchor.add_child img.dup
97
+
98
+ img.replace(anchor)
99
+ end
100
+ end
81
101
  end
82
102
  end
83
103
  end
@@ -2,9 +2,15 @@ module Html2rss
2
2
  module AttributePostProcessors
3
3
  ## Returns a defined part of a String.
4
4
  #
5
+ # Both parameters must be an Integer and they can be negative.
5
6
  # The +end+ parameter can be omitted, in that case it will not cut the
6
7
  # String at the end.
7
8
  #
9
+ # A Regexp or a MatchString is not supported.
10
+ #
11
+ # See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
12
+ # documentation for more information.
13
+ #
8
14
  # Imagine this HTML:
9
15
  # <h1>Foo bar and baz</h1>
10
16
  #
@@ -13,9 +19,9 @@ module Html2rss
13
19
  # title:
14
20
  # selector: h1
15
21
  # post_process:
16
- # name: substring
17
- # start: 4
18
- # end: 6
22
+ # name: substring
23
+ # start: 4
24
+ # end: 6
19
25
  #
20
26
  # Would return:
21
27
  # 'bar'
@@ -28,8 +34,8 @@ module Html2rss
28
34
  ##
29
35
  # @return [String]
30
36
  def get
31
- ending = @options.fetch('end', @value.length).to_i
32
- @value[@options['start'].to_i..ending]
37
+ ending = @options.fetch(:end, @value.length).to_i
38
+ @value[@options[:start].to_i..ending]
33
39
  end
34
40
  end
35
41
  end
@@ -4,7 +4,8 @@ module Html2rss
4
4
  module AttributePostProcessors
5
5
  ## Returns a formatted String according to the string pattern.
6
6
  #
7
- # If +self+ is given as a method, the extracted value will be used.
7
+ # If +self+ is used, the selector's extracted value will be used.
8
+ # It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format)
8
9
  #
9
10
  # Imagine this HTML:
10
11
  # <li>
@@ -22,11 +23,8 @@ module Html2rss
22
23
  # title:
23
24
  # selector: h1
24
25
  # post_process:
25
- # name: template
26
- # string: '%s (%s)'
27
- # methods:
28
- # - self
29
- # - price
26
+ # name: template
27
+ # string: '%{self} (%{price})'
30
28
  #
31
29
  # Would return:
32
30
  # 'Product (23,42€)'
@@ -35,25 +33,36 @@ module Html2rss
35
33
  @value = value
36
34
  @options = env[:options]
37
35
  @item = env[:item]
36
+ @string = @options[:string]
38
37
  end
39
38
 
40
39
  ##
41
- # - uses {http://ruby-doc.org/core-2.6.3/String.html#method-i-25 String#%}
42
40
  # @return [String]
43
41
  def get
44
- string % methods
42
+ return format_string_with_methods if @options[:methods]
43
+
44
+ names = string.scan(/%[<|{](\w*)[>|}]/)
45
+ names.flatten!
46
+ names.compact!
47
+ names.map!(&:to_sym)
48
+
49
+ format(string, names.map { |name| [name, item_value(name)] }.to_h)
45
50
  end
46
51
 
47
52
  private
48
53
 
49
- def string
50
- @options['string']
51
- end
54
+ attr_reader :string
52
55
 
53
56
  def methods
54
- @methods ||= @options['methods'].map do |method|
55
- method == 'self' ? @value.to_s : @item.public_send(method.to_sym).to_s
56
- end
57
+ @methods ||= @options[:methods].map(&method(:item_value))
58
+ end
59
+
60
+ def format_string_with_methods
61
+ string % methods
62
+ end
63
+
64
+ def item_value(method_name)
65
+ method_name.to_sym == :self ? @value.to_s : @item.public_send(method_name).to_s
57
66
  end
58
67
  end
59
68
  end
@@ -1,71 +1,91 @@
1
+ require 'active_support/core_ext/hash'
2
+
1
3
  module Html2rss
2
4
  ##
3
5
  # The Config class abstracts from the config data structure and
4
6
  # provides default values.
5
7
  class Config
6
8
  def initialize(feed_config, global_config = {})
7
- @global_config = Utils::IndifferentAccessHash.new global_config
8
- @feed_config = Utils::IndifferentAccessHash.new feed_config
9
- @channel_config = Utils::IndifferentAccessHash.new @feed_config.fetch('channel', {})
9
+ @global_config = global_config.deep_symbolize_keys
10
+ @feed_config = feed_config.deep_symbolize_keys
11
+ @channel_config = @feed_config.fetch(:channel, {})
10
12
  end
11
13
 
12
14
  def author
13
- channel_config.fetch 'author', 'html2rss'
15
+ channel_config.fetch :author, 'html2rss'
14
16
  end
15
17
 
16
18
  def ttl
17
- channel_config.fetch 'ttl', 3600
19
+ channel_config.fetch :ttl, 360
18
20
  end
19
21
 
20
22
  def title
21
- channel_config.fetch 'title', 'html2rss generated title'
23
+ channel_config.fetch(:title) { generated_title }
24
+ end
25
+
26
+ def generated_title
27
+ uri = URI(url)
28
+
29
+ nicer_path = uri.path.split('/')
30
+ nicer_path.reject! { |part| part == '' }
31
+
32
+ nicer_path.any? ? "#{uri.host}: #{nicer_path.join(' ').titleize}" : uri.host
22
33
  end
23
34
 
24
35
  def language
25
- channel_config.fetch 'language', 'en'
36
+ channel_config.fetch :language, 'en'
26
37
  end
27
38
 
28
39
  def description
29
- channel_config.fetch 'description', 'A description of my html2rss feed.'
40
+ channel_config.fetch :description, "Latest items from #{url}."
30
41
  end
31
42
 
32
43
  def url
33
- channel_config.dig 'url'
44
+ channel_config.dig :url
34
45
  end
35
46
  alias link url
36
47
 
37
48
  def time_zone
38
- channel_config.fetch 'time_zone', 'UTC'
49
+ channel_config.fetch :time_zone, 'UTC'
39
50
  end
40
51
 
41
52
  def json?
42
- channel_config.fetch 'json', false
53
+ channel_config.fetch :json, false
43
54
  end
44
55
 
45
56
  def headers
46
- global_config.fetch('headers', {}).merge(channel_config.fetch('headers', {}))
57
+ global_config.fetch(:headers, {}).merge(channel_config.fetch(:headers, {}))
47
58
  end
48
59
 
49
60
  def attribute_options(name)
50
- feed_config.dig('selectors').fetch(name, {}).merge('channel' => channel_config)
61
+ feed_config.dig(:selectors).fetch(name, {}).merge(channel: channel_config)
51
62
  end
52
63
 
53
64
  def attribute?(name)
54
- attribute_names.include?(name.to_s)
65
+ attribute_names.include?(name)
55
66
  end
56
67
 
57
- def categories
58
- feed_config.dig('selectors').fetch('categories', []).map(&:to_sym)
68
+ def category_selectors
69
+ categories = feed_config.dig(:selectors, :categories)
70
+ return [] unless categories
71
+
72
+ categories = categories.keep_if { |category| category.to_s != '' }
73
+ categories.map!(&:to_sym)
74
+ categories.uniq!
75
+
76
+ categories
59
77
  end
60
78
 
61
79
  def selector(name)
62
- feed_config.dig('selectors', name, 'selector')
80
+ feed_config.dig(:selectors, name, :selector)
63
81
  end
64
82
 
65
83
  def attribute_names
66
- @attribute_names ||= feed_config.fetch('selectors', {}).keys.map(&:to_s).tap do |attrs|
67
- attrs.delete('items')
68
- end
84
+ @attribute_names ||= feed_config.fetch(:selectors, {}).keys.tap { |attrs| attrs.delete(:items) }
85
+ end
86
+
87
+ def items_order
88
+ feed_config.dig(:selectors, :items, :order)&.to_sym
69
89
  end
70
90
 
71
91
  private