html2rss 0.9.0 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/.mergify.yml +15 -0
- data/.rubocop.yml +11 -145
- data/Gemfile +19 -2
- data/Gemfile.lock +111 -97
- data/README.md +323 -270
- data/bin/console +1 -0
- data/exe/html2rss +6 -0
- data/html2rss.gemspec +15 -20
- data/lib/html2rss/attribute_post_processors/gsub.rb +30 -8
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +7 -2
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +27 -0
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +41 -0
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +11 -2
- data/lib/html2rss/attribute_post_processors/parse_time.rb +11 -4
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +40 -44
- data/lib/html2rss/attribute_post_processors/substring.rb +14 -4
- data/lib/html2rss/attribute_post_processors/template.rb +36 -12
- data/lib/html2rss/attribute_post_processors.rb +28 -5
- data/lib/html2rss/cli.rb +29 -0
- data/lib/html2rss/config/channel.rb +117 -0
- data/lib/html2rss/config/selectors.rb +91 -0
- data/lib/html2rss/config.rb +71 -82
- data/lib/html2rss/item.rb +118 -42
- data/lib/html2rss/item_extractors/attribute.rb +20 -7
- data/lib/html2rss/item_extractors/href.rb +20 -4
- data/lib/html2rss/item_extractors/html.rb +18 -6
- data/lib/html2rss/item_extractors/static.rb +18 -7
- data/lib/html2rss/item_extractors/text.rb +17 -5
- data/lib/html2rss/item_extractors.rb +75 -10
- data/lib/html2rss/object_to_xml_converter.rb +56 -0
- data/lib/html2rss/rss_builder/channel.rb +21 -0
- data/lib/html2rss/rss_builder/item.rb +83 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +37 -0
- data/lib/html2rss/rss_builder.rb +96 -0
- data/lib/html2rss/utils.rb +94 -19
- data/lib/html2rss/version.rb +5 -1
- data/lib/html2rss.rb +51 -20
- data/rakefile.rb +16 -0
- metadata +51 -154
- data/.travis.yml +0 -25
- data/CHANGELOG.md +0 -221
- data/lib/html2rss/feed_builder.rb +0 -81
- data/lib/html2rss/item_extractors/current_time.rb +0 -21
- data/support/logo.png +0 -0
data/bin/console
CHANGED
data/exe/html2rss
ADDED
data/html2rss.gemspec
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
lib = File.expand_path('lib', __dir__)
|
2
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
5
|
require 'html2rss/version'
|
@@ -10,44 +12,37 @@ Gem::Specification.new do |spec|
|
|
10
12
|
|
11
13
|
spec.summary = 'Returns an RSS::Rss object by scraping a URL.'
|
12
14
|
spec.description = 'Give the URL to scrape and some CSS selectors. Get a RSS::Rss instance in return.'
|
13
|
-
spec.homepage = 'https://github.com/
|
15
|
+
spec.homepage = 'https://github.com/html2rss/html2rss'
|
14
16
|
spec.license = 'MIT'
|
15
|
-
spec.required_ruby_version = '>=
|
17
|
+
spec.required_ruby_version = '>= 3.1'
|
16
18
|
|
17
19
|
if spec.respond_to?(:metadata)
|
18
20
|
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
19
|
-
spec.metadata['changelog_uri'] = 'https://github.com/
|
21
|
+
spec.metadata['changelog_uri'] = 'https://github.com/html2rss/html2rss/releases'
|
22
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
20
23
|
else
|
21
24
|
raise 'RubyGems 2.0 or newer is required to protect against ' \
|
22
|
-
|
25
|
+
'public gem pushes.'
|
23
26
|
end
|
24
27
|
|
25
28
|
spec.files = `git ls-files -z`.split("\x0").reject do |f|
|
26
|
-
f.match(%r{^(test|spec|features)/})
|
29
|
+
f.match(%r{^(test|spec|features|support|docs|.github|.yardoc)/})
|
27
30
|
end
|
28
31
|
spec.bindir = 'exe'
|
29
32
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
30
33
|
spec.require_paths = ['lib']
|
31
34
|
|
32
|
-
spec.add_dependency 'activesupport', '>= 5', '< 7'
|
33
35
|
spec.add_dependency 'addressable', '~> 2.7'
|
34
|
-
spec.add_dependency '
|
35
|
-
spec.add_dependency 'faraday'
|
36
|
-
spec.add_dependency 'faraday_middleware'
|
36
|
+
spec.add_dependency 'faraday', '> 2.0.1', '< 3.0'
|
37
|
+
spec.add_dependency 'faraday-follow_redirects'
|
37
38
|
spec.add_dependency 'kramdown'
|
38
39
|
spec.add_dependency 'mime-types', '> 3.0'
|
39
40
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
41
|
+
spec.add_dependency 'regexp_parser'
|
40
42
|
spec.add_dependency 'reverse_markdown', '~> 2.0'
|
41
|
-
spec.add_dependency '
|
42
|
-
spec.add_dependency '
|
43
|
+
spec.add_dependency 'rss'
|
44
|
+
spec.add_dependency 'sanitize', '~> 6.0'
|
45
|
+
spec.add_dependency 'thor'
|
46
|
+
spec.add_dependency 'tzinfo'
|
43
47
|
spec.add_dependency 'zeitwerk'
|
44
|
-
spec.add_development_dependency 'bundler'
|
45
|
-
spec.add_development_dependency 'byebug'
|
46
|
-
spec.add_development_dependency 'rspec', '~> 3.0'
|
47
|
-
spec.add_development_dependency 'rubocop'
|
48
|
-
spec.add_development_dependency 'rubocop-performance'
|
49
|
-
spec.add_development_dependency 'rubocop-rspec'
|
50
|
-
spec.add_development_dependency 'simplecov'
|
51
|
-
spec.add_development_dependency 'vcr'
|
52
|
-
spec.add_development_dependency 'yard'
|
53
48
|
end
|
@@ -1,9 +1,8 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Html2rss
|
4
4
|
module AttributePostProcessors
|
5
5
|
##
|
6
|
-
#
|
7
6
|
# Imagine this HTML:
|
8
7
|
# <h1>Foo bar and boo</h1>
|
9
8
|
#
|
@@ -19,23 +18,46 @@ module Html2rss
|
|
19
18
|
# Would return:
|
20
19
|
# 'Foo bar and baz'
|
21
20
|
#
|
22
|
-
# `pattern` can be a Regexp or a String.
|
21
|
+
# `pattern` can be a Regexp or a String. If it is a String, it will remove
|
22
|
+
# one pair of surrounding slashes ('/') to keep backwards compatibility
|
23
|
+
# and then parse it to build a Regexp.
|
23
24
|
#
|
24
25
|
# `replacement` can be a String or a Hash.
|
25
26
|
#
|
26
27
|
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
27
28
|
class Gsub
|
28
|
-
|
29
|
+
##
|
30
|
+
# @param value [String]
|
31
|
+
# @param context [Item::Context]
|
32
|
+
def initialize(value, context)
|
29
33
|
@value = value
|
30
|
-
options =
|
31
|
-
@pattern = options[:pattern].to_regexp || options[:pattern]
|
32
|
-
@replacement = options[:replacement]
|
34
|
+
@options = context[:options]
|
33
35
|
end
|
34
36
|
|
35
37
|
##
|
36
38
|
# @return [String]
|
37
39
|
def get
|
38
|
-
@value.to_s.gsub(
|
40
|
+
@value.to_s.gsub(pattern, replacement)
|
41
|
+
end
|
42
|
+
|
43
|
+
private
|
44
|
+
|
45
|
+
##
|
46
|
+
# @return [Regexp]
|
47
|
+
def pattern
|
48
|
+
pattern = @options[:pattern]
|
49
|
+
raise ArgumentError, 'The `pattern` option is missing' unless pattern
|
50
|
+
|
51
|
+
pattern.is_a?(String) ? Utils.build_regexp_from_string(pattern) : pattern
|
52
|
+
end
|
53
|
+
|
54
|
+
##
|
55
|
+
# @return [Hash, String]
|
56
|
+
def replacement
|
57
|
+
replacement = @options[:replacement]
|
58
|
+
return replacement if replacement.is_a?(String) || replacement.is_a?(Hash)
|
59
|
+
|
60
|
+
raise ArgumentError, 'The `replacement` option must be a String or Hash'
|
39
61
|
end
|
40
62
|
end
|
41
63
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'reverse_markdown'
|
2
4
|
|
3
5
|
module Html2rss
|
@@ -25,14 +27,17 @@ module Html2rss
|
|
25
27
|
# Would return:
|
26
28
|
# 'Lorem **ipsum** dolor'
|
27
29
|
class HtmlToMarkdown
|
30
|
+
##
|
31
|
+
# @param value [String]
|
32
|
+
# @param env [Item::Context]
|
28
33
|
def initialize(value, env)
|
29
|
-
@
|
34
|
+
@sanitized_value = SanitizeHtml.new(value, env).get
|
30
35
|
end
|
31
36
|
|
32
37
|
##
|
33
38
|
# @return [String] formatted in Markdown
|
34
39
|
def get
|
35
|
-
ReverseMarkdown.convert
|
40
|
+
ReverseMarkdown.convert(@sanitized_value)
|
36
41
|
end
|
37
42
|
end
|
38
43
|
end
|
data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
module HtmlTransformers
|
6
|
+
##
|
7
|
+
# Transformer that converts relative URLs to absolute URLs within specified HTML elements.
|
8
|
+
class TransformUrlsToAbsoluteOnes
|
9
|
+
URL_ELEMENTS_WITH_URL_ATTRIBUTE = { 'a' => :href, 'img' => :src }.freeze
|
10
|
+
|
11
|
+
def initialize(channel_url)
|
12
|
+
@channel_url = channel_url
|
13
|
+
end
|
14
|
+
|
15
|
+
##
|
16
|
+
# Transforms URLs to absolute ones.
|
17
|
+
def call(node_name:, node:, **_env)
|
18
|
+
return unless URL_ELEMENTS_WITH_URL_ATTRIBUTE.key?(node_name)
|
19
|
+
|
20
|
+
url_attribute = URL_ELEMENTS_WITH_URL_ATTRIBUTE[node_name]
|
21
|
+
url = node[url_attribute]
|
22
|
+
node[url_attribute] = Html2rss::Utils.build_absolute_url_from_relative(url, @channel_url)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
module HtmlTransformers
|
6
|
+
##
|
7
|
+
# Transformer that wraps <img> tags into <a> tags linking to `img.src`.
|
8
|
+
class WrapImgInA
|
9
|
+
##
|
10
|
+
# Wraps <img> tags into <a> tags that link to `img.src`.
|
11
|
+
#
|
12
|
+
# @param node_name [String]
|
13
|
+
# @param node [Nokogiri::XML::Node]
|
14
|
+
# @return [nil]
|
15
|
+
def call(node_name:, node:, **_env)
|
16
|
+
return unless already_wrapped?(node_name, node)
|
17
|
+
|
18
|
+
wrap_image_in_anchor(node)
|
19
|
+
end
|
20
|
+
|
21
|
+
def already_wrapped?(node_name, node)
|
22
|
+
node_name == 'img' && node.parent.name != 'a'
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
##
|
28
|
+
# Wraps the <img> node in an <a> tag.
|
29
|
+
#
|
30
|
+
# @param node [Nokogiri::XML::Node]
|
31
|
+
# @return [nil]
|
32
|
+
def wrap_image_in_anchor(node)
|
33
|
+
anchor = Nokogiri::XML::Node.new('a', node.document)
|
34
|
+
anchor['href'] = node['src']
|
35
|
+
node.add_next_sibling(anchor)
|
36
|
+
anchor.add_child(node.remove)
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
@@ -1,4 +1,7 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'kramdown'
|
4
|
+
require_relative 'sanitize_html'
|
2
5
|
|
3
6
|
module Html2rss
|
4
7
|
module AttributePostProcessors
|
@@ -30,15 +33,21 @@ module Html2rss
|
|
30
33
|
#
|
31
34
|
# <p>Price: 12.34</p>
|
32
35
|
class MarkdownToHtml
|
36
|
+
##
|
37
|
+
# @param value [String] Markdown content to convert to HTML
|
38
|
+
# @param env [Item::Context] Context object providing additional environment details
|
33
39
|
def initialize(value, env)
|
34
40
|
@value = value
|
35
41
|
@env = env
|
36
42
|
end
|
37
43
|
|
38
44
|
##
|
39
|
-
#
|
45
|
+
# Converts Markdown to sanitized HTML.
|
46
|
+
#
|
47
|
+
# @return [String] Sanitized HTML content
|
40
48
|
def get
|
41
|
-
|
49
|
+
html_content = Kramdown::Document.new(@value).to_html
|
50
|
+
SanitizeHtml.new(html_content, @env).get
|
42
51
|
end
|
43
52
|
end
|
44
53
|
end
|
@@ -1,5 +1,7 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'time'
|
4
|
+
require_relative '../utils'
|
3
5
|
|
4
6
|
module Html2rss
|
5
7
|
module AttributePostProcessors
|
@@ -24,15 +26,20 @@ module Html2rss
|
|
24
26
|
#
|
25
27
|
# It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
|
26
28
|
class ParseTime
|
29
|
+
##
|
30
|
+
# @param value [String] the time to parse
|
31
|
+
# @param env [Item::Context] Context object providing additional environment details
|
27
32
|
def initialize(value, env)
|
28
33
|
@value = value.to_s
|
29
34
|
@time_zone = env[:config].time_zone
|
30
35
|
end
|
31
36
|
|
32
37
|
##
|
33
|
-
#
|
38
|
+
# Converts the provided time string to RFC822 format, taking into account the configured time zone.
|
39
|
+
#
|
40
|
+
# @return [String] RFC822 formatted time
|
34
41
|
def get
|
35
|
-
|
42
|
+
Utils.use_zone(@time_zone) { Time.parse(@value).rfc822 }
|
36
43
|
end
|
37
44
|
end
|
38
45
|
end
|
@@ -1,7 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module AttributePostProcessors
|
3
5
|
##
|
4
6
|
# Returns the URI as String.
|
7
|
+
# If the URL is relative, it builds an absolute one with the channel's URL as base.
|
5
8
|
#
|
6
9
|
# Imagine this HTML structure:
|
7
10
|
#
|
@@ -19,14 +22,21 @@ module Html2rss
|
|
19
22
|
# Would return:
|
20
23
|
# 'http://why-not-use-a-link.uh'
|
21
24
|
class ParseUri
|
22
|
-
|
25
|
+
##
|
26
|
+
# @param value [String]
|
27
|
+
# @param context [Item::Context]
|
28
|
+
def initialize(value, context)
|
23
29
|
@value = value
|
30
|
+
@config_url = context.config.url
|
24
31
|
end
|
25
32
|
|
26
33
|
##
|
27
34
|
# @return [String]
|
28
35
|
def get
|
29
|
-
|
36
|
+
Html2rss::Utils.build_absolute_url_from_relative(
|
37
|
+
Html2rss::Utils.sanitize_url(@value),
|
38
|
+
@config_url
|
39
|
+
).to_s
|
30
40
|
end
|
31
41
|
end
|
32
42
|
end
|
@@ -1,17 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sanitize'
|
4
|
+
require_relative 'html_transformers/transform_urls_to_absolute_ones'
|
5
|
+
require_relative 'html_transformers/wrap_img_in_a'
|
2
6
|
|
3
7
|
module Html2rss
|
4
8
|
module AttributePostProcessors
|
5
9
|
##
|
6
10
|
# Returns sanitized HTML code as String.
|
7
11
|
#
|
8
|
-
# It
|
12
|
+
# It sanitizes by using the [sanitize gem](https://github.com/rgrove/sanitize) with
|
13
|
+
# [Sanitize::Config::RELAXED](https://github.com/rgrove/sanitize#sanitizeconfigrelaxed).
|
14
|
+
#
|
15
|
+
# Furthermore, it adds:
|
9
16
|
#
|
10
17
|
# - `rel="nofollow noopener noreferrer"` to <a> tags
|
11
18
|
# - `referrer-policy='no-referrer'` to <img> tags
|
12
|
-
#
|
13
|
-
# It also:
|
14
|
-
#
|
15
19
|
# - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
|
16
20
|
# linking to the <img>'s `src`.
|
17
21
|
#
|
@@ -35,68 +39,60 @@ module Html2rss
|
|
35
39
|
# Would return:
|
36
40
|
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
37
41
|
class SanitizeHtml
|
38
|
-
|
39
|
-
|
40
|
-
|
42
|
+
##
|
43
|
+
# @param value [String]
|
44
|
+
# @param env [Item::Context]
|
41
45
|
def initialize(value, env)
|
42
46
|
@value = value
|
43
47
|
@channel_url = env[:config].url
|
44
48
|
end
|
45
49
|
|
46
50
|
##
|
47
|
-
# - uses the {https://github.com/rgrove/sanitize sanitize gem}
|
48
|
-
# - uses the config {https://github.com/rgrove/sanitize#sanitizeconfigrelaxed Sanitize::Config::RELAXED}
|
49
|
-
# - adds rel="nofollow noopener noreferrer" to a elements
|
50
|
-
# - adds target="_blank" to a elements
|
51
51
|
# @return [String]
|
52
52
|
def get
|
53
|
-
Sanitize.fragment(@value, sanitize_config)
|
53
|
+
sanitized_html = Sanitize.fragment(@value, sanitize_config)
|
54
|
+
sanitized_html.to_s.gsub(/\s+/, ' ').strip
|
54
55
|
end
|
55
56
|
|
56
57
|
private
|
57
58
|
|
59
|
+
##
|
60
|
+
# @return [Sanitize::Config]
|
58
61
|
def sanitize_config
|
59
62
|
Sanitize::Config.merge(
|
60
63
|
Sanitize::Config::RELAXED,
|
61
64
|
attributes: { all: %w[dir lang alt title translate] },
|
62
|
-
add_attributes
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
65
|
+
add_attributes:,
|
66
|
+
transformers: [
|
67
|
+
method(:transform_urls_to_absolute_ones),
|
68
|
+
method(:wrap_img_in_a)
|
69
|
+
]
|
67
70
|
)
|
68
71
|
end
|
69
72
|
|
70
|
-
def
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
url = env[:node][url_attribute]
|
76
|
-
|
77
|
-
return if URI(url).absolute?
|
78
|
-
|
79
|
-
absolute_url = Html2rss::Utils.build_absolute_url_from_relative(url, @channel_url)
|
80
|
-
|
81
|
-
env[:node][url_attribute] = absolute_url
|
82
|
-
end
|
73
|
+
def add_attributes
|
74
|
+
{
|
75
|
+
'a' => { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' },
|
76
|
+
'img' => { 'referrer-policy' => 'no-referrer' }
|
77
|
+
}
|
83
78
|
end
|
84
79
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
anchor = Nokogiri::XML::Node.new('a', img)
|
94
|
-
anchor[:href] = img[:src]
|
95
|
-
|
96
|
-
anchor.add_child img.dup
|
80
|
+
##
|
81
|
+
# Wrapper for transform_urls_to_absolute_ones to pass the channel_url.
|
82
|
+
#
|
83
|
+
# @param env [Hash]
|
84
|
+
# @return [nil]
|
85
|
+
def transform_urls_to_absolute_ones(env)
|
86
|
+
HtmlTransformers::TransformUrlsToAbsoluteOnes.new(@channel_url).call(**env)
|
87
|
+
end
|
97
88
|
|
98
|
-
|
99
|
-
|
89
|
+
##
|
90
|
+
# Wrapper for wrap_img_in_a.
|
91
|
+
#
|
92
|
+
# @param env [Hash]
|
93
|
+
# @return [nil]
|
94
|
+
def wrap_img_in_a(env)
|
95
|
+
HtmlTransformers::WrapImgInA.new.call(**env)
|
100
96
|
end
|
101
97
|
end
|
102
98
|
end
|
@@ -1,6 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module AttributePostProcessors
|
3
|
-
##
|
5
|
+
##
|
6
|
+
# Returns a defined part of a String.
|
4
7
|
#
|
5
8
|
# Both parameters must be Integers, and they can be negative.
|
6
9
|
# The +end+ parameter can be omitted, in that case it will not cut the
|
@@ -26,16 +29,23 @@ module Html2rss
|
|
26
29
|
# Would return:
|
27
30
|
# 'bar'
|
28
31
|
class Substring
|
32
|
+
##
|
33
|
+
# @param value [String] The original string to extract a substring from.
|
34
|
+
# @param env [Item::Context] Context object providing additional environment details.
|
29
35
|
def initialize(value, env)
|
30
36
|
@value = value
|
31
37
|
@options = env[:options]
|
32
38
|
end
|
33
39
|
|
34
40
|
##
|
35
|
-
#
|
41
|
+
# Extracts the substring from the original string based on the provided start and end indices.
|
42
|
+
#
|
43
|
+
# @return [String] The extracted substring.
|
36
44
|
def get
|
37
|
-
|
38
|
-
@
|
45
|
+
start_index = @options[:start].to_i
|
46
|
+
end_index = @options[:end]&.to_i || @value.length
|
47
|
+
|
48
|
+
@value[start_index..end_index]
|
39
49
|
end
|
40
50
|
end
|
41
51
|
end
|
@@ -1,25 +1,28 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Html2rss
|
4
4
|
module AttributePostProcessors
|
5
|
-
##
|
5
|
+
##
|
6
|
+
# Returns a formatted String according to the string pattern.
|
6
7
|
#
|
7
8
|
# If +self+ is used, the selector's extracted value will be used.
|
8
9
|
# It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format).
|
9
10
|
#
|
10
11
|
# Imagine this HTML:
|
12
|
+
#
|
11
13
|
# <li>
|
12
14
|
# <h1>Product</h1>
|
13
15
|
# <span class="price">23,42€</span>
|
14
16
|
# </li>
|
15
17
|
#
|
18
|
+
#
|
16
19
|
# YAML usage example:
|
17
20
|
#
|
18
21
|
# selectors:
|
19
22
|
# items:
|
20
23
|
# selector: 'li'
|
21
24
|
# price:
|
22
|
-
#
|
25
|
+
# selector: '.price'
|
23
26
|
# title:
|
24
27
|
# selector: h1
|
25
28
|
# post_process:
|
@@ -29,6 +32,9 @@ module Html2rss
|
|
29
32
|
# Would return:
|
30
33
|
# 'Product (23,42€)'
|
31
34
|
class Template
|
35
|
+
##
|
36
|
+
# @param value [String]
|
37
|
+
# @param env [Item::Context]
|
32
38
|
def initialize(value, env)
|
33
39
|
@value = value
|
34
40
|
@options = env[:options]
|
@@ -39,28 +45,46 @@ module Html2rss
|
|
39
45
|
##
|
40
46
|
# @return [String]
|
41
47
|
def get
|
42
|
-
|
43
|
-
|
44
|
-
names = string.scan(/%[<|{](\w*)[>|}]/)
|
45
|
-
names.flatten!
|
46
|
-
names.compact!
|
47
|
-
names.map!(&:to_sym)
|
48
|
-
|
49
|
-
format(string, names.map { |name| [name, item_value(name)] }.to_h)
|
48
|
+
@options[:methods] ? format_string_with_methods : format_string_with_dynamic_params
|
50
49
|
end
|
51
50
|
|
52
51
|
private
|
53
52
|
|
53
|
+
##
|
54
|
+
# @return [String] the string containing the template
|
54
55
|
attr_reader :string
|
55
56
|
|
57
|
+
##
|
58
|
+
# @return [Array<String>]
|
56
59
|
def methods
|
57
|
-
@methods ||= @options[:methods].map(
|
60
|
+
@methods ||= @options[:methods].map { |method_name| item_value(method_name) }
|
58
61
|
end
|
59
62
|
|
63
|
+
##
|
64
|
+
# Formats a string using methods.
|
65
|
+
#
|
66
|
+
# @return [String]
|
67
|
+
# @deprecated Use %<id>s formatting instead. Will be removed in version 1.0.0. See README / Dynamic parameters.
|
60
68
|
def format_string_with_methods
|
69
|
+
warn '[DEPRECATION] This method of using params is deprecated and \
|
70
|
+
support for it will be removed in version 1.0.0.\
|
71
|
+
Please use dynamic parameters (i.e. %<id>s, see README.md) instead.'
|
72
|
+
|
61
73
|
string % methods
|
62
74
|
end
|
63
75
|
|
76
|
+
##
|
77
|
+
# @return [String]
|
78
|
+
def format_string_with_dynamic_params
|
79
|
+
param_names = string.scan(/%[<|{](\w*)[>|}]/)
|
80
|
+
param_names.flatten!
|
81
|
+
|
82
|
+
format(string, param_names.to_h { |name| [name.to_sym, item_value(name)] })
|
83
|
+
end
|
84
|
+
|
85
|
+
##
|
86
|
+
# @param method_name [String, Symbol]
|
87
|
+
# @return [String]
|
64
88
|
def item_value(method_name)
|
65
89
|
method_name.to_sym == :self ? @value.to_s : @item.public_send(method_name).to_s
|
66
90
|
end
|
@@ -1,13 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
##
|
3
5
|
# Provides a namespace for attribute post processors.
|
4
6
|
module AttributePostProcessors
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
end
|
7
|
+
##
|
8
|
+
# Error raised when an unknown post processor name is requested.
|
9
|
+
class UnknownPostProcessorName < StandardError; end
|
9
10
|
|
10
|
-
|
11
|
+
##
|
12
|
+
# Maps the post processor name to the class implementing the post processor.
|
13
|
+
#
|
14
|
+
# The key is the name to use in the feed config.
|
15
|
+
NAME_TO_CLASS = {
|
16
|
+
gsub: Gsub,
|
17
|
+
html_to_markdown: HtmlToMarkdown,
|
18
|
+
markdown_to_html: MarkdownToHtml,
|
19
|
+
parse_time: ParseTime,
|
20
|
+
parse_uri: ParseUri,
|
21
|
+
sanitize_html: SanitizeHtml,
|
22
|
+
substring: Substring,
|
23
|
+
template: Template
|
24
|
+
}.freeze
|
25
|
+
|
26
|
+
##
|
27
|
+
# Retrieves the attribute post processor class based on the given name.
|
28
|
+
#
|
29
|
+
# @param name [Symbol] The name of the post processor.
|
30
|
+
# @return [Class] The attribute post processor class.
|
31
|
+
# @raise [UnknownPostProcessorName] If the requested name is not found in NAME_TO_CLASS.
|
32
|
+
def self.get_processor(name)
|
33
|
+
NAME_TO_CLASS[name.to_sym] || raise(UnknownPostProcessorName, "Can't find a post processor named '#{name}'")
|
11
34
|
end
|
12
35
|
end
|
13
36
|
end
|
data/lib/html2rss/cli.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../html2rss'
|
4
|
+
require 'thor'
|
5
|
+
|
6
|
+
module Html2rss
|
7
|
+
##
|
8
|
+
# The Html2rss command line interface.
|
9
|
+
class CLI < Thor
|
10
|
+
def self.exit_on_failure?
|
11
|
+
true
|
12
|
+
end
|
13
|
+
|
14
|
+
desc 'feed YAML_FILE [FEED_NAME] [param=value ...]', 'Print RSS built from the YAML_FILE file to stdout'
|
15
|
+
##
|
16
|
+
# Prints the feed to STDOUT.
|
17
|
+
#
|
18
|
+
# @param yaml_file [String] Path to the YAML configuration file.
|
19
|
+
# @param options [Array<String>] Additional options including feed name and parameters.
|
20
|
+
# @return [nil]
|
21
|
+
def feed(yaml_file, *options)
|
22
|
+
raise "File '#{yaml_file}' does not exist" unless File.exist?(yaml_file)
|
23
|
+
|
24
|
+
feed_name = options.shift
|
25
|
+
params = options.to_h { |opt| opt.split('=', 2) }
|
26
|
+
puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|