html2rss 0.6.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -12,7 +12,7 @@ Gem::Specification.new do |spec|
12
12
  spec.description = 'Give the URL to scrape and some CSS selectors. Get a RSS::Rss instance in return.'
13
13
  spec.homepage = 'https://github.com/gildesmarais/html2rss'
14
14
  spec.license = 'MIT'
15
- spec.required_ruby_version = '>= 2.4.0'
15
+ spec.required_ruby_version = '>= 2.5.0'
16
16
 
17
17
  if spec.respond_to?(:metadata)
18
18
  spec.metadata['allowed_push_host'] = 'https://rubygems.org'
@@ -29,15 +29,19 @@ Gem::Specification.new do |spec|
29
29
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
30
30
  spec.require_paths = ['lib']
31
31
 
32
- spec.add_dependency 'activesupport', '~> 5.0'
32
+ spec.add_dependency 'activesupport', '>= 5', '< 7'
33
+ spec.add_dependency 'addressable', '~> 2.7'
33
34
  spec.add_dependency 'builder'
34
- spec.add_dependency 'faraday', '~> 0.15'
35
- spec.add_dependency 'faraday_middleware', '~> 0.13'
36
- spec.add_dependency 'hashie', '~> 3.6'
35
+ spec.add_dependency 'faraday', '~> 1.0'
36
+ spec.add_dependency 'faraday_middleware'
37
+ spec.add_dependency 'kramdown'
38
+ spec.add_dependency 'mime-types', '> 3.0'
37
39
  spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
38
- spec.add_dependency 'reverse_markdown', '~> 1.3'
40
+ spec.add_dependency 'reverse_markdown', '~> 2.0'
39
41
  spec.add_dependency 'sanitize', '~> 5.0'
40
- spec.add_development_dependency 'bundler', '~> 1.16'
42
+ spec.add_dependency 'to_regexp'
43
+ spec.add_dependency 'zeitwerk'
44
+ spec.add_development_dependency 'bundler'
41
45
  spec.add_development_dependency 'byebug'
42
46
  spec.add_development_dependency 'rspec', '~> 3.0'
43
47
  spec.add_development_dependency 'rubocop'
@@ -1,7 +1,8 @@
1
- require 'html2rss/config'
2
- require 'html2rss/feed_builder'
3
- require 'html2rss/version'
4
- require 'html2rss/utils'
1
+ require 'zeitwerk'
2
+
3
+ loader = Zeitwerk::Loader.for_gem
4
+ loader.setup
5
+
5
6
  require 'yaml'
6
7
 
7
8
  ##
@@ -1,19 +1,13 @@
1
- require_relative 'attribute_post_processors/html_to_markdown'
2
- require_relative 'attribute_post_processors/parse_time'
3
- require_relative 'attribute_post_processors/parse_uri'
4
- require_relative 'attribute_post_processors/sanitize_html'
5
- require_relative 'attribute_post_processors/substring'
6
- require_relative 'attribute_post_processors/template'
7
-
8
1
  module Html2rss
9
2
  ##
10
3
  # Provides a namespace for attribute post processors.
11
4
  module AttributePostProcessors
12
5
  def self.get_processor(name)
13
- camel_cased_name = name.split('_').map(&:capitalize).join
14
- class_name = ['Html2rss', 'AttributePostProcessors', camel_cased_name].join('::')
6
+ @get_processor ||= Hash.new do |processors, key|
7
+ processors[key] = Utils.get_class_from_name(key, 'AttributePostProcessors')
8
+ end
15
9
 
16
- Object.const_get(class_name)
10
+ @get_processor[name]
17
11
  end
18
12
  end
19
13
  end
@@ -0,0 +1,42 @@
1
+ require 'to_regexp'
2
+
3
+ module Html2rss
4
+ module AttributePostProcessors
5
+ ##
6
+ #
7
+ # Imagine this HTML:
8
+ # <h1>Foo bar and boo</h1>
9
+ #
10
+ # YAML usage example:
11
+ # selectors:
12
+ # title:
13
+ # selector: h1
14
+ # post_process:
15
+ # name: gsub
16
+ # pattern: boo
17
+ # replacement: baz
18
+ #
19
+ # Would return:
20
+ # 'Foo bar and baz'
21
+ #
22
+ # `pattern` can be a Regexp or a String.
23
+ #
24
+ # `replacement` can be a String or a Hash.
25
+ #
26
+ # See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
27
+ class Gsub
28
+ def initialize(value, env)
29
+ @value = value
30
+ options = env[:options]
31
+ @pattern = options[:pattern].to_regexp || options[:pattern]
32
+ @replacement = options[:replacement]
33
+ end
34
+
35
+ ##
36
+ # @return [String]
37
+ def get
38
+ @value.to_s.gsub(@pattern, @replacement)
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,45 @@
1
+ require 'kramdown'
2
+
3
+ module Html2rss
4
+ module AttributePostProcessors
5
+ ##
6
+ # Generates HTML from Markdown.
7
+ #
8
+ # It's particularly useful in conjunction with the Template post processor
9
+ # to generate a description from other selectors.
10
+ #
11
+ # YAML usage example:
12
+ #
13
+ # selectors:
14
+ # description:
15
+ # selector: section
16
+ # post_process:
17
+ # - name: template
18
+ # string: |
19
+ # # %s
20
+ #
21
+ # Price: %s
22
+ # methods:
23
+ # - self
24
+ # - price
25
+ # - name: markdown_to_html
26
+ #
27
+ # Would e.g. return:
28
+ #
29
+ # <h1>Section</h1>
30
+ #
31
+ # <p>Price: 12.34</p>
32
+ class MarkdownToHtml
33
+ def initialize(value, env)
34
+ @value = value
35
+ @env = env
36
+ end
37
+
38
+ ##
39
+ # @return [String] sanitized HTML generated from the Markdown value
40
+ def get
41
+ SanitizeHtml.new(Kramdown::Document.new(@value).to_html, @env).get
42
+ end
43
+ end
44
+ end
45
+ end
@@ -17,12 +17,12 @@ module Html2rss
17
17
  # selector: span
18
18
  # post_process:
19
19
  # name: 'parse_time'
20
+ # time_zone: 'Europe/Berlin'
20
21
  #
21
22
  # Would return:
22
23
  # "Tue, 02 Jul 2019 00:00:00 +0200"
23
24
  #
24
25
  # It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
25
- # As of now it ignores time zones and always falls back to the UTC time zone.
26
26
  class ParseTime
27
27
  def initialize(value, env)
28
28
  @value = value.to_s
@@ -5,7 +5,7 @@ module Html2rss
5
5
  #
6
6
  # Imagine this HTML structure:
7
7
  #
8
- # <span>http://why-not-use-a-link.uh</span>
8
+ # <span>http://why-not-use-a-link.uh </span>
9
9
  #
10
10
  # YAML usage example:
11
11
  #
@@ -15,6 +15,7 @@ module Html2rss
15
15
  # extractor: text
16
16
  # post_process:
17
17
  # name: parse_uri
18
+ #
18
19
  # Would return:
19
20
  # 'http://why-not-use-a-link.uh'
20
21
  class ParseUri
@@ -25,7 +26,7 @@ module Html2rss
25
26
  ##
26
27
  # @return [String]
27
28
  def get
28
- URI(@value).to_s
29
+ URI(Html2rss::Utils.sanitize_url(@value)).to_s
29
30
  end
30
31
  end
31
32
  end
@@ -4,10 +4,16 @@ module Html2rss
4
4
  module AttributePostProcessors
5
5
  ##
6
6
  # Returns sanitized HTML code as String.
7
- # Adds
8
7
  #
9
- # - rel="nofollow noopener noreferrer" to a elements
10
- # - referrer-policy='no-referrer' to img elements
8
+ # It adds:
9
+ #
10
+ # - `rel="nofollow noopener noreferrer"` to <a> tags
11
+ # - `referrer-policy='no-referrer'` to <img> tags
12
+ #
13
+ # It also:
14
+ #
15
+ # - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
16
+ # linking to the <img>'s `src`.
11
17
  #
12
18
  # Imagine this HTML structure:
13
19
  #
@@ -21,7 +27,7 @@ module Html2rss
21
27
  #
22
28
  # selectors:
23
29
  # description:
24
- # selector: section
30
+ # selector: '.section'
25
31
  # extractor: html
26
32
  # post_process:
27
33
  # name: sanitize_html
@@ -29,6 +35,9 @@ module Html2rss
29
35
  # Would return:
30
36
  # '<p>Lorem <b>ipsum</b> dolor ...</p>'
31
37
  class SanitizeHtml
38
+ URL_ELEMENTS_WITH_URL_ATTRIBUTE = { 'a' => :href, 'img' => :src }.freeze
39
+ private_constant :URL_ELEMENTS_WITH_URL_ATTRIBUTE
40
+
32
41
  def initialize(value, env)
33
42
  @value = value
34
43
  @channel_url = env[:config].url
@@ -41,28 +50,22 @@ module Html2rss
41
50
  # - adds target="_blank" to a elements
42
51
  # @return [String]
43
52
  def get
44
- Sanitize.fragment(@value, Sanitize::Config.merge(
45
- Sanitize::Config::RELAXED,
46
- attributes: { all: %w[dir lang alt title translate] },
47
- add_attributes: {
48
- 'a' => {
49
- 'rel' => 'nofollow noopener noreferrer',
50
- 'target' => '_blank'
51
- },
52
- 'img' => {
53
- 'referrer-policy' => 'no-referrer'
54
- }
55
- },
56
- transformers: [transform_urls_to_absolute_ones]
57
- )).to_s.split.join(' ')
53
+ Sanitize.fragment(@value, sanitize_config).to_s.split.join(' ')
58
54
  end
59
55
 
60
56
  private
61
57
 
62
- URL_ELEMENTS_WITH_URL_ATTRIBUTE = {
63
- 'a' => :href,
64
- 'img' => :src
65
- }.freeze
58
+ def sanitize_config
59
+ Sanitize::Config.merge(
60
+ Sanitize::Config::RELAXED,
61
+ attributes: { all: %w[dir lang alt title translate] },
62
+ add_attributes: {
63
+ 'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
64
+ 'img' => { 'referrer-policy' => 'no-referrer' }
65
+ },
66
+ transformers: [transform_urls_to_absolute_ones, wrap_img_in_a]
67
+ )
68
+ end
66
69
 
67
70
  def transform_urls_to_absolute_ones
68
71
  lambda do |env|
@@ -78,6 +81,23 @@ module Html2rss
78
81
  env[:node][url_attribute] = absolute_url
79
82
  end
80
83
  end
84
+
85
+ def wrap_img_in_a
86
+ lambda do |env|
87
+ return if env[:node_name] != 'img'
88
+
89
+ img = env[:node]
90
+
91
+ return if img.parent.name == 'a'
92
+
93
+ anchor = Nokogiri::XML::Node.new('a', img)
94
+ anchor[:href] = img[:src]
95
+
96
+ anchor.add_child img.dup
97
+
98
+ img.replace(anchor)
99
+ end
100
+ end
81
101
  end
82
102
  end
83
103
  end
@@ -2,9 +2,15 @@ module Html2rss
2
2
  module AttributePostProcessors
3
3
  ## Returns a defined part of a String.
4
4
  #
5
+ # Both parameters must be an Integer and they can be negative.
5
6
  # The +end+ parameter can be omitted, in that case it will not cut the
6
7
  # String at the end.
7
8
  #
9
+ # Passing a Regexp or a match String is not supported.
10
+ #
11
+ # See the [`String#[]`](https://ruby-doc.org/core/String.html#method-i-5B-5D)
12
+ # documentation for more information.
13
+ #
8
14
  # Imagine this HTML:
9
15
  # <h1>Foo bar and baz<h1>
10
16
  #
@@ -13,9 +19,9 @@ module Html2rss
13
19
  # title:
14
20
  # selector: h1
15
21
  # post_process:
16
- # name: substring
17
- # start: 4
18
- # end: 6
22
+ # name: substring
23
+ # start: 4
24
+ # end: 6
19
25
  #
20
26
  # Would return:
21
27
  # 'bar'
@@ -28,8 +34,8 @@ module Html2rss
28
34
  ##
29
35
  # @return [String]
30
36
  def get
31
- ending = @options.fetch('end', @value.length).to_i
32
- @value[@options['start'].to_i..ending]
37
+ ending = @options.fetch(:end, @value.length).to_i
38
+ @value[@options[:start].to_i..ending]
33
39
  end
34
40
  end
35
41
  end
@@ -4,7 +4,8 @@ module Html2rss
4
4
  module AttributePostProcessors
5
5
  ## Returns a formatted String according to the string pattern.
6
6
  #
7
- # If +self+ is given as a method, the extracted value will be used.
7
+ # If +self+ is used, the selector's extracted value will be used.
8
+ # It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format).
8
9
  #
9
10
  # Imagine this HTML:
10
11
  # <li>
@@ -22,11 +23,8 @@ module Html2rss
22
23
  # title:
23
24
  # selector: h1
24
25
  # post_process:
25
- # name: template
26
- # string: '%s (%s)'
27
- # methods:
28
- # - self
29
- # - price
26
+ # name: template
27
+ # string: '%{self} (%{price})'
30
28
  #
31
29
  # Would return:
32
30
  # 'Product (23,42€)'
@@ -35,25 +33,36 @@ module Html2rss
35
33
  @value = value
36
34
  @options = env[:options]
37
35
  @item = env[:item]
36
+ @string = @options[:string]
38
37
  end
39
38
 
40
39
  ##
41
- # - uses {http://ruby-doc.org/core-2.6.3/String.html#method-i-25 String#%}
42
40
  # @return [String]
43
41
  def get
44
- string % methods
42
+ return format_string_with_methods if @options[:methods]
43
+
44
+ names = string.scan(/%[<|{](\w*)[>|}]/)
45
+ names.flatten!
46
+ names.compact!
47
+ names.map!(&:to_sym)
48
+
49
+ format(string, names.map { |name| [name, item_value(name)] }.to_h)
45
50
  end
46
51
 
47
52
  private
48
53
 
49
- def string
50
- @options['string']
51
- end
54
+ attr_reader :string
52
55
 
53
56
  def methods
54
- @methods ||= @options['methods'].map do |method|
55
- method == 'self' ? @value.to_s : @item.public_send(method.to_sym).to_s
56
- end
57
+ @methods ||= @options[:methods].map(&method(:item_value))
58
+ end
59
+
60
+ def format_string_with_methods
61
+ string % methods
62
+ end
63
+
64
+ def item_value(method_name)
65
+ method_name.to_sym == :self ? @value.to_s : @item.public_send(method_name).to_s
57
66
  end
58
67
  end
59
68
  end
@@ -1,71 +1,91 @@
1
+ require 'active_support/core_ext/hash'
2
+
1
3
  module Html2rss
2
4
  ##
3
5
  # The Config class abstracts from the config data structure and
4
6
  # provides default values.
5
7
  class Config
6
8
  def initialize(feed_config, global_config = {})
7
- @global_config = Utils::IndifferentAccessHash.new global_config
8
- @feed_config = Utils::IndifferentAccessHash.new feed_config
9
- @channel_config = Utils::IndifferentAccessHash.new @feed_config.fetch('channel', {})
9
+ @global_config = global_config.deep_symbolize_keys
10
+ @feed_config = feed_config.deep_symbolize_keys
11
+ @channel_config = @feed_config.fetch(:channel, {})
10
12
  end
11
13
 
12
14
  def author
13
- channel_config.fetch 'author', 'html2rss'
15
+ channel_config.fetch :author, 'html2rss'
14
16
  end
15
17
 
16
18
  def ttl
17
- channel_config.fetch 'ttl', 3600
19
+ channel_config.fetch :ttl, 360
18
20
  end
19
21
 
20
22
  def title
21
- channel_config.fetch 'title', 'html2rss generated title'
23
+ channel_config.fetch(:title) { generated_title }
24
+ end
25
+
26
+ def generated_title
27
+ uri = URI(url)
28
+
29
+ nicer_path = uri.path.split('/')
30
+ nicer_path.reject! { |part| part == '' }
31
+
32
+ nicer_path.any? ? "#{uri.host}: #{nicer_path.join(' ').titleize}" : uri.host
22
33
  end
23
34
 
24
35
  def language
25
- channel_config.fetch 'language', 'en'
36
+ channel_config.fetch :language, 'en'
26
37
  end
27
38
 
28
39
  def description
29
- channel_config.fetch 'description', 'A description of my html2rss feed.'
40
+ channel_config.fetch :description, "Latest items from #{url}."
30
41
  end
31
42
 
32
43
  def url
33
- channel_config.dig 'url'
44
+ channel_config.dig :url
34
45
  end
35
46
  alias link url
36
47
 
37
48
  def time_zone
38
- channel_config.fetch 'time_zone', 'UTC'
49
+ channel_config.fetch :time_zone, 'UTC'
39
50
  end
40
51
 
41
52
  def json?
42
- channel_config.fetch 'json', false
53
+ channel_config.fetch :json, false
43
54
  end
44
55
 
45
56
  def headers
46
- global_config.fetch('headers', {}).merge(channel_config.fetch('headers', {}))
57
+ global_config.fetch(:headers, {}).merge(channel_config.fetch(:headers, {}))
47
58
  end
48
59
 
49
60
  def attribute_options(name)
50
- feed_config.dig('selectors').fetch(name, {}).merge('channel' => channel_config)
61
+ feed_config.dig(:selectors).fetch(name, {}).merge(channel: channel_config)
51
62
  end
52
63
 
53
64
  def attribute?(name)
54
- attribute_names.include?(name.to_s)
65
+ attribute_names.include?(name)
55
66
  end
56
67
 
57
- def categories
58
- feed_config.dig('selectors').fetch('categories', []).map(&:to_sym)
68
+ def category_selectors
69
+ categories = feed_config.dig(:selectors, :categories)
70
+ return [] unless categories
71
+
72
+ categories = categories.keep_if { |category| category.to_s != '' }
73
+ categories.map!(&:to_sym)
74
+ categories.uniq!
75
+
76
+ categories
59
77
  end
60
78
 
61
79
  def selector(name)
62
- feed_config.dig('selectors', name, 'selector')
80
+ feed_config.dig(:selectors, name, :selector)
63
81
  end
64
82
 
65
83
  def attribute_names
66
- @attribute_names ||= feed_config.fetch('selectors', {}).keys.map(&:to_s).tap do |attrs|
67
- attrs.delete('items')
68
- end
84
+ @attribute_names ||= feed_config.fetch(:selectors, {}).keys.tap { |attrs| attrs.delete(:items) }
85
+ end
86
+
87
+ def items_order
88
+ feed_config.dig(:selectors, :items, :order)&.to_sym
69
89
  end
70
90
 
71
91
  private