html2rss 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +323 -270
- data/exe/html2rss +6 -0
- data/html2rss.gemspec +18 -23
- data/lib/html2rss/attribute_post_processors/gsub.rb +30 -8
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +7 -2
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +27 -0
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +41 -0
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +11 -2
- data/lib/html2rss/attribute_post_processors/parse_time.rb +11 -4
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +40 -44
- data/lib/html2rss/attribute_post_processors/substring.rb +14 -4
- data/lib/html2rss/attribute_post_processors/template.rb +36 -12
- data/lib/html2rss/attribute_post_processors.rb +28 -5
- data/lib/html2rss/cli.rb +29 -0
- data/lib/html2rss/config/channel.rb +117 -0
- data/lib/html2rss/config/selectors.rb +91 -0
- data/lib/html2rss/config.rb +71 -82
- data/lib/html2rss/item.rb +122 -46
- data/lib/html2rss/item_extractors/attribute.rb +20 -7
- data/lib/html2rss/item_extractors/href.rb +20 -4
- data/lib/html2rss/item_extractors/html.rb +18 -6
- data/lib/html2rss/item_extractors/static.rb +18 -7
- data/lib/html2rss/item_extractors/text.rb +17 -5
- data/lib/html2rss/item_extractors.rb +75 -10
- data/lib/html2rss/object_to_xml_converter.rb +56 -0
- data/lib/html2rss/rss_builder/channel.rb +21 -0
- data/lib/html2rss/rss_builder/item.rb +83 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +37 -0
- data/lib/html2rss/rss_builder.rb +96 -0
- data/lib/html2rss/utils.rb +94 -19
- data/lib/html2rss/version.rb +5 -1
- data/lib/html2rss.rb +57 -20
- metadata +53 -165
- data/.gitignore +0 -12
- data/.rspec +0 -4
- data/.rubocop.yml +0 -164
- data/.travis.yml +0 -25
- data/.yardopts +0 -6
- data/CHANGELOG.md +0 -221
- data/Gemfile +0 -8
- data/Gemfile.lock +0 -139
- data/bin/console +0 -15
- data/bin/setup +0 -8
- data/lib/html2rss/feed_builder.rb +0 -81
- data/lib/html2rss/item_extractors/current_time.rb +0 -21
- data/support/logo.png +0 -0
data/exe/html2rss
ADDED
data/html2rss.gemspec
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
lib = File.expand_path('lib', __dir__)
|
2
4
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
3
5
|
require 'html2rss/version'
|
@@ -8,46 +10,39 @@ Gem::Specification.new do |spec|
|
|
8
10
|
spec.authors = ['Gil Desmarais']
|
9
11
|
spec.email = ['html2rss@desmarais.de']
|
10
12
|
|
11
|
-
spec.summary = '
|
12
|
-
spec.description = '
|
13
|
-
spec.homepage = 'https://github.com/
|
13
|
+
spec.summary = 'Generates RSS feeds from websites by scraping a URL and using CSS selectors to extract item.'
|
14
|
+
spec.description = 'Supports JSON content, custom HTTP headers, and post-processing of extracted content.'
|
15
|
+
spec.homepage = 'https://github.com/html2rss/html2rss'
|
14
16
|
spec.license = 'MIT'
|
15
|
-
spec.required_ruby_version = '>=
|
17
|
+
spec.required_ruby_version = '>= 3.1'
|
16
18
|
|
17
19
|
if spec.respond_to?(:metadata)
|
18
20
|
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
19
|
-
spec.metadata['changelog_uri'] =
|
21
|
+
spec.metadata['changelog_uri'] = "#{spec.homepage}/releases/tag/v#{spec.version}"
|
22
|
+
spec.metadata['rubygems_mfa_required'] = 'true'
|
20
23
|
else
|
21
24
|
raise 'RubyGems 2.0 or newer is required to protect against ' \
|
22
|
-
|
25
|
+
'public gem pushes.'
|
23
26
|
end
|
24
27
|
|
25
|
-
spec.files = `git ls-files -z`.split("\x0").
|
26
|
-
f.match(%r{^(
|
28
|
+
spec.files = `git ls-files -z`.split("\x0").select do |f|
|
29
|
+
f.match(%r{^(lib/|exe/|README.md|LICENSE|html2rss.gemspec)})
|
27
30
|
end
|
28
31
|
spec.bindir = 'exe'
|
29
32
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
30
33
|
spec.require_paths = ['lib']
|
31
34
|
|
32
|
-
spec.add_dependency 'activesupport', '>= 5', '< 7'
|
33
35
|
spec.add_dependency 'addressable', '~> 2.7'
|
34
|
-
spec.add_dependency '
|
35
|
-
spec.add_dependency 'faraday'
|
36
|
-
spec.add_dependency 'faraday_middleware'
|
36
|
+
spec.add_dependency 'faraday', '> 2.0.1', '< 3.0'
|
37
|
+
spec.add_dependency 'faraday-follow_redirects'
|
37
38
|
spec.add_dependency 'kramdown'
|
38
39
|
spec.add_dependency 'mime-types', '> 3.0'
|
39
40
|
spec.add_dependency 'nokogiri', '>= 1.10', '< 2.0'
|
41
|
+
spec.add_dependency 'regexp_parser'
|
40
42
|
spec.add_dependency 'reverse_markdown', '~> 2.0'
|
41
|
-
spec.add_dependency '
|
42
|
-
spec.add_dependency '
|
43
|
+
spec.add_dependency 'rss'
|
44
|
+
spec.add_dependency 'sanitize', '~> 6.0'
|
45
|
+
spec.add_dependency 'thor'
|
46
|
+
spec.add_dependency 'tzinfo'
|
43
47
|
spec.add_dependency 'zeitwerk'
|
44
|
-
spec.add_development_dependency 'bundler'
|
45
|
-
spec.add_development_dependency 'byebug'
|
46
|
-
spec.add_development_dependency 'rspec', '~> 3.0'
|
47
|
-
spec.add_development_dependency 'rubocop'
|
48
|
-
spec.add_development_dependency 'rubocop-performance'
|
49
|
-
spec.add_development_dependency 'rubocop-rspec'
|
50
|
-
spec.add_development_dependency 'simplecov'
|
51
|
-
spec.add_development_dependency 'vcr'
|
52
|
-
spec.add_development_dependency 'yard'
|
53
48
|
end
|
@@ -1,9 +1,8 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Html2rss
|
4
4
|
module AttributePostProcessors
|
5
5
|
##
|
6
|
-
#
|
7
6
|
# Imagine this HTML:
|
8
7
|
# <h1>Foo bar and boo</h1>
|
9
8
|
#
|
@@ -19,23 +18,46 @@ module Html2rss
|
|
19
18
|
# Would return:
|
20
19
|
# 'Foo bar and baz'
|
21
20
|
#
|
22
|
-
# `pattern` can be a Regexp or a String.
|
21
|
+
# `pattern` can be a Regexp or a String. If it is a String, it will remove
|
22
|
+
# one pair of surrounding slashes ('/') to keep backwards compatibility
|
23
|
+
# and then parse it to build a Regexp.
|
23
24
|
#
|
24
25
|
# `replacement` can be a String or a Hash.
|
25
26
|
#
|
26
27
|
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
27
28
|
class Gsub
  ##
  # @param value [String] the extracted value the substitution runs on
  # @param context [Item::Context] must provide the post processor options under `:options`
  def initialize(value, context)
    @value = value
    @options = context[:options]
  end

  ##
  # Replaces every match of the configured pattern with the configured replacement.
  #
  # @return [String]
  def get
    @value.to_s.gsub(pattern, replacement)
  end

  private

  ##
  # Reads the `pattern` option; a String is turned into a Regexp by the Utils helper.
  #
  # @return [Regexp]
  # @raise [ArgumentError] when the option is missing
  def pattern
    raw_pattern = @options[:pattern]
    raise ArgumentError, 'The `pattern` option is missing' unless raw_pattern
    return raw_pattern unless raw_pattern.is_a?(String)

    Utils.build_regexp_from_string(raw_pattern)
  end

  ##
  # Reads the `replacement` option and checks its type.
  #
  # @return [Hash, String]
  # @raise [ArgumentError] when the option is neither a String nor a Hash
  def replacement
    candidate = @options[:replacement]

    case candidate
    when String, Hash then candidate
    else raise ArgumentError, 'The `replacement` option must be a String or Hash'
    end
  end
end
|
41
63
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'reverse_markdown'
|
2
4
|
|
3
5
|
module Html2rss
|
@@ -25,14 +27,17 @@ module Html2rss
|
|
25
27
|
# Would return:
|
26
28
|
# 'Lorem **ipsum** dolor'
|
27
29
|
class HtmlToMarkdown
  ##
  # Sanitizes the given HTML right away; the result is kept for {#get}.
  #
  # @param value [String]
  # @param env [Item::Context]
  def initialize(value, env)
    sanitizer = SanitizeHtml.new(value, env)
    @sanitized_value = sanitizer.get
  end

  ##
  # @return [String] formatted in Markdown
  def get
    markdown = ReverseMarkdown.convert(@sanitized_value)
    markdown
  end
end
|
38
43
|
end
|
data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
module HtmlTransformers
|
6
|
+
##
|
7
|
+
# Transformer that converts relative URLs to absolute URLs within specified HTML elements.
|
8
|
+
class TransformUrlsToAbsoluteOnes
  # Element name => attribute of that element which holds a URL.
  URL_ELEMENTS_WITH_URL_ATTRIBUTE = { 'a' => :href, 'img' => :src }.freeze

  ##
  # @param channel_url [Object] base URL handed to the Utils helper for every rewritten attribute
  def initialize(channel_url)
    @channel_url = channel_url
  end

  ##
  # Rewrites the URL attribute of <a> and <img> nodes through
  # Html2rss::Utils.build_absolute_url_from_relative; every other node is left untouched.
  #
  # @param node_name [String]
  # @param node [#[], #[]=] the node whose attribute gets rewritten
  # @return [Object, nil] nil when the node is not handled
  def call(node_name:, node:, **_env)
    attribute = URL_ELEMENTS_WITH_URL_ATTRIBUTE[node_name]
    return unless attribute

    current_url = node[attribute]
    node[attribute] = Html2rss::Utils.build_absolute_url_from_relative(current_url, @channel_url)
  end
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
module AttributePostProcessors
|
5
|
+
module HtmlTransformers
|
6
|
+
##
|
7
|
+
# Transformer that wraps <img> tags into <a> tags linking to `img.src`.
|
8
|
+
class WrapImgInA
  ##
  # Wraps an <img> node into an <a> tag that links to `img.src`, unless the
  # image's direct parent already is an <a>.
  #
  # @param node_name [String]
  # @param node [Nokogiri::XML::Node]
  # @return [Object, nil] nil when the node is skipped; otherwise the result of
  #   re-parenting the image, which callers should not rely on
  def call(node_name:, node:, **_env)
    return unless wrappable_img?(node_name, node)

    wrap_image_in_anchor(node)
  end

  ##
  # Despite its name, this answers true when the node is an <img> that is NOT
  # yet wrapped in an <a>. It is kept with its original semantics so existing
  # callers keep working.
  #
  # @deprecated The name is inverted; the logic lives in the private #wrappable_img?.
  # @param node_name [String]
  # @param node [Nokogiri::XML::Node]
  # @return [Boolean]
  def already_wrapped?(node_name, node)
    wrappable_img?(node_name, node)
  end

  private

  ##
  # @param node_name [String]
  # @param node [Nokogiri::XML::Node]
  # @return [Boolean] true for an <img> whose direct parent is not an <a>
  def wrappable_img?(node_name, node)
    node_name == 'img' && node.parent.name != 'a'
  end

  ##
  # Wraps the <img> node in an <a> tag.
  #
  # @param node [Nokogiri::XML::Node]
  # @return [Object] result of adding the image to the new anchor
  def wrap_image_in_anchor(node)
    anchor = Nokogiri::XML::Node.new('a', node.document)
    anchor['href'] = node['src']
    # Insert the anchor at the image's position first, then move the image into it.
    node.add_next_sibling(anchor)
    anchor.add_child(node.remove)
  end
end
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
@@ -1,4 +1,7 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'kramdown'
|
4
|
+
require_relative 'sanitize_html'
|
2
5
|
|
3
6
|
module Html2rss
|
4
7
|
module AttributePostProcessors
|
@@ -30,15 +33,21 @@ module Html2rss
|
|
30
33
|
#
|
31
34
|
# <p>Price: 12.34</p>
|
32
35
|
class MarkdownToHtml
  ##
  # @param value [String] Markdown content to convert to HTML
  # @param env [Item::Context] passed on unchanged to the HTML sanitizer
  def initialize(value, env)
    @value = value
    @env = env
  end

  ##
  # Renders the Markdown with Kramdown and runs the result through SanitizeHtml.
  #
  # @return [String] Sanitized HTML content
  def get
    document = Kramdown::Document.new(@value)
    sanitizer = SanitizeHtml.new(document.to_html, @env)
    sanitizer.get
  end
end
|
44
53
|
end
|
@@ -1,5 +1,7 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'time'
|
4
|
+
require_relative '../utils'
|
3
5
|
|
4
6
|
module Html2rss
|
5
7
|
module AttributePostProcessors
|
@@ -24,15 +26,20 @@ module Html2rss
|
|
24
26
|
#
|
25
27
|
# It uses {https://ruby-doc.org/stdlib-2.5.3/libdoc/time/rdoc/Time.html#method-c-parse Time.parse}.
|
26
28
|
class ParseTime
  ##
  # @param value [String] the time to parse
  # @param env [Item::Context] its `:config` entry supplies the time zone
  def initialize(value, env)
    @value = value.to_s
    @time_zone = env[:config].time_zone
  end

  ##
  # Parses the value inside Utils.use_zone for the configured time zone and
  # formats the result as RFC822.
  #
  # @return [String] RFC822 formatted time
  def get
    Utils.use_zone(@time_zone) do
      parsed_time = Time.parse(@value)
      parsed_time.rfc822
    end
  end
end
|
38
45
|
end
|
@@ -1,7 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module AttributePostProcessors
|
3
5
|
##
|
4
6
|
# Returns the URI as String.
|
7
|
+
# If the URL is relative, it builds an absolute one with the channel's URL as base.
|
5
8
|
#
|
6
9
|
# Imagine this HTML structure:
|
7
10
|
#
|
@@ -19,14 +22,21 @@ module Html2rss
|
|
19
22
|
# Would return:
|
20
23
|
# 'http://why-not-use-a-link.uh'
|
21
24
|
class ParseUri
  ##
  # @param value [String] the extracted URL, possibly relative
  # @param context [Item::Context] its config supplies the base URL
  def initialize(value, context)
    @value = value
    @config_url = context.config.url
  end

  ##
  # Sanitizes the value and resolves it against the configured URL.
  #
  # @return [String]
  def get
    sanitized_url = Html2rss::Utils.sanitize_url(@value)
    absolute_url = Html2rss::Utils.build_absolute_url_from_relative(sanitized_url, @config_url)
    absolute_url.to_s
  end
end
|
32
42
|
end
|
@@ -1,17 +1,21 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'sanitize'
|
4
|
+
require_relative 'html_transformers/transform_urls_to_absolute_ones'
|
5
|
+
require_relative 'html_transformers/wrap_img_in_a'
|
2
6
|
|
3
7
|
module Html2rss
|
4
8
|
module AttributePostProcessors
|
5
9
|
##
|
6
10
|
# Returns sanitized HTML code as String.
|
7
11
|
#
|
8
|
-
# It
|
12
|
+
# It sanitizes by using the [sanitize gem](https://github.com/rgrove/sanitize) with
|
13
|
+
# [Sanitize::Config::RELAXED](https://github.com/rgrove/sanitize#sanitizeconfigrelaxed).
|
14
|
+
#
|
15
|
+
# Furthermore, it adds:
|
9
16
|
#
|
10
17
|
# - `rel="nofollow noopener noreferrer"` to <a> tags
|
11
18
|
# - `referrer-policy='no-referrer'` to <img> tags
|
12
|
-
#
|
13
|
-
# It also:
|
14
|
-
#
|
15
19
|
# - wraps all <img> tags, whose direct parent is not an <a>, into an <a>
|
16
20
|
# linking to the <img>'s `src`.
|
17
21
|
#
|
@@ -35,68 +39,60 @@ module Html2rss
|
|
35
39
|
# Would return:
|
36
40
|
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
37
41
|
class SanitizeHtml
  ##
  # @param value [String] the HTML to sanitize
  # @param env [Item::Context] its `:config` entry supplies the channel URL
  def initialize(value, env)
    @value = value
    @channel_url = env[:config].url
  end

  ##
  # Sanitizes the value, then collapses every run of whitespace into one space
  # and strips the ends.
  #
  # @return [String]
  def get
    Sanitize.fragment(@value, sanitize_config).to_s.gsub(/\s+/, ' ').strip
  end

  private

  ##
  # Builds the configuration on top of Sanitize::Config::RELAXED.
  #
  # @return [Sanitize::Config]
  def sanitize_config
    transformers = %i[transform_urls_to_absolute_ones wrap_img_in_a].map { |name| method(name) }

    Sanitize::Config.merge(
      Sanitize::Config::RELAXED,
      attributes: { all: %w[dir lang alt title translate] },
      add_attributes: add_attributes,
      transformers: transformers
    )
  end

  ##
  # @return [Hash] attributes added to <a> and <img> elements
  def add_attributes
    anchor_attributes = { 'rel' => 'nofollow noopener noreferrer', 'target' => '_blank' }
    image_attributes = { 'referrer-policy' => 'no-referrer' }

    { 'a' => anchor_attributes, 'img' => image_attributes }
  end

  ##
  # Adapter that supplies the channel URL to the URL transformer.
  #
  # @param env [Hash]
  # @return [nil]
  def transform_urls_to_absolute_ones(env)
    transformer = HtmlTransformers::TransformUrlsToAbsoluteOnes.new(@channel_url)
    transformer.call(**env)
  end

  ##
  # Adapter for the image wrapping transformer.
  #
  # @param env [Hash]
  # @return [nil]
  def wrap_img_in_a(env)
    transformer = HtmlTransformers::WrapImgInA.new
    transformer.call(**env)
  end
end
|
102
98
|
end
|
@@ -1,6 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module AttributePostProcessors
|
3
|
-
##
|
5
|
+
##
|
6
|
+
# Returns a defined part of a String.
|
4
7
|
#
|
5
8
|
# Both parameters must be an Integer and they can be negative.
|
6
9
|
# The +end+ parameter can be omitted, in that case it will not cut the
|
@@ -26,16 +29,23 @@ module Html2rss
|
|
26
29
|
# Would return:
|
27
30
|
# 'bar'
|
28
31
|
class Substring
  ##
  # @param value [String] The original string to extract a substring from.
  # @param env [Item::Context] Provides the `:start` and optional `:end` options under `:options`.
  def initialize(value, env)
    @value = value
    @options = env[:options]
  end

  ##
  # Cuts the value from the start index up to and including the end index.
  # Without an `:end` option the value's length is used as the end index.
  #
  # @return [String] The extracted substring.
  def get
    first_index = @options[:start].to_i
    configured_end = @options[:end]
    last_index = configured_end.nil? ? @value.length : configured_end.to_i

    @value.slice(first_index..last_index)
  end
end
|
41
51
|
end
|
@@ -1,25 +1,28 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Html2rss
|
4
4
|
module AttributePostProcessors
|
5
|
-
##
|
5
|
+
##
|
6
|
+
# Returns a formatted String according to the string pattern.
|
6
7
|
#
|
7
8
|
# If +self+ is used, the selectors extracted value will be used.
|
8
9
|
# It uses [Kernel#format](https://ruby-doc.org/core/Kernel.html#method-i-format)
|
9
10
|
#
|
10
11
|
# Imagine this HTML:
|
12
|
+
#
|
11
13
|
# <li>
|
12
14
|
# <h1>Product</h1>
|
13
15
|
# <span class="price">23,42€</span>
|
14
16
|
# </li>
|
15
17
|
#
|
18
|
+
#
|
16
19
|
# YAML usage example:
|
17
20
|
#
|
18
21
|
# selectors:
|
19
22
|
# items:
|
20
23
|
# selector: 'li'
|
21
24
|
# price:
|
22
|
-
#
|
25
|
+
# selector: '.price'
|
23
26
|
# title:
|
24
27
|
# selector: h1
|
25
28
|
# post_process:
|
@@ -29,6 +32,9 @@ module Html2rss
|
|
29
32
|
# Would return:
|
30
33
|
# 'Product (23,42€)'
|
31
34
|
class Template
|
35
|
+
##
|
36
|
+
# @param value [String]
|
37
|
+
# @param env [Item::Context]
|
32
38
|
def initialize(value, env)
|
33
39
|
@value = value
|
34
40
|
@options = env[:options]
|
@@ -39,28 +45,46 @@ module Html2rss
|
|
39
45
|
##
|
40
46
|
# @return [String]
|
41
47
|
def get
  # The legacy `methods` option takes precedence over dynamic parameters.
  return format_string_with_methods if @options[:methods]

  format_string_with_dynamic_params
end
|
51
50
|
|
52
51
|
private
|
53
52
|
|
53
|
+
##
|
54
|
+
# @return [String] the string containing the template
|
54
55
|
attr_reader :string
|
55
56
|
|
57
|
+
##
|
58
|
+
# @return [Array<String>]
|
56
59
|
def methods
  # Resolved once per instance; each configured name is looked up through #item_value.
  @methods ||= @options[:methods].map(&method(:item_value))
end
|
59
62
|
|
63
|
+
##
|
64
|
+
# Formats a string using methods.
|
65
|
+
#
|
66
|
+
# @return [String]
|
67
|
+
# @deprecated Use %<id>s formatting instead. Will be removed in version 1.0.0. See README / Dynamic parameters.
|
60
68
|
def format_string_with_methods
  # Adjacent literals joined by a backslash placed outside the quotes form one
  # single-line message. Inside a single-quoted literal a backslash-newline is
  # not a continuation: it is emitted verbatim together with the indentation.
  warn '[DEPRECATION] This method of using params is deprecated and ' \
       'support for it will be removed in version 1.0.0. ' \
       'Please use dynamic parameters (i.e. %<id>s, see README.md) instead.'

  # Positional formatting: the resolved method values fill the template in order.
  string % methods
end
|
63
75
|
|
76
|
+
##
|
77
|
+
# @return [String]
|
78
|
+
def format_string_with_dynamic_params
  # Kernel#format references named values as %<name> or %{name}. Each delimiter
  # pair gets its own alternative: a character class such as [<|{] would also
  # accept a literal '|' and mixed pairs like %<name}.
  param_names = string.scan(/%(?:<(\w*)>|\{(\w*)\})/).flatten.compact.uniq

  # Only names that occur in the template are resolved, each of them once.
  format(string, param_names.to_h { |name| [name.to_sym, item_value(name)] })
end
|
84
|
+
|
85
|
+
##
|
86
|
+
# @param method_name [String, Symbol]
|
87
|
+
# @return [String]
|
64
88
|
def item_value(method_name)
  # `self` refers to the value this post processor received.
  return @value.to_s if method_name.to_sym == :self

  # Every other name is sent to the item as a public method call.
  @item.public_send(method_name).to_s
end
|
@@ -1,13 +1,36 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
##
|
3
5
|
# Provides a namespace for attribute post processors.
|
4
6
|
module AttributePostProcessors
  ##
  # Raised by {.get_processor} when no post processor is registered under the requested name.
  class UnknownPostProcessorName < Html2rss::Error; end

  ##
  # Registry of the available post processors.
  #
  # Keys are the names used in the feed config, values the implementing classes.
  NAME_TO_CLASS = {
    gsub: Gsub,
    html_to_markdown: HtmlToMarkdown,
    markdown_to_html: MarkdownToHtml,
    parse_time: ParseTime,
    parse_uri: ParseUri,
    sanitize_html: SanitizeHtml,
    substring: Substring,
    template: Template
  }.freeze

  ##
  # Looks up the post processor class registered under the given name.
  #
  # @param name [Symbol] The name of the post processor.
  # @return [Class] The attribute post processor class.
  # @raise [UnknownPostProcessorName] If the name is not a key of NAME_TO_CLASS.
  def self.get_processor(name)
    NAME_TO_CLASS.fetch(name.to_sym) do
      raise UnknownPostProcessorName, "Can't find a post processor named '#{name}'"
    end
  end
end
|
13
36
|
end
|
data/lib/html2rss/cli.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../html2rss'
|
4
|
+
require 'thor'
|
5
|
+
|
6
|
+
module Html2rss
|
7
|
+
##
|
8
|
+
# The Html2rss command line interface.
|
9
|
+
class CLI < Thor
  ##
  # Tells Thor to exit with a failure status when a command fails.
  #
  # @return [true]
  def self.exit_on_failure?
    true
  end

  desc 'feed YAML_FILE [FEED_NAME] [param=value ...]', 'Print RSS built from the YAML_FILE file to stdout'
  ##
  # Prints the feed to STDOUT.
  #
  # @param yaml_file [String] Path to the YAML configuration file.
  # @param options [Array<String>] Optional feed name followed by `param=value` pairs.
  # @return [nil]
  # @raise [RuntimeError] when the YAML file does not exist
  # @raise [Thor::Error] when a parameter is not given as `param=value`
  def feed(yaml_file, *options)
    raise "File '#{yaml_file}' does not exist" unless File.exist?(yaml_file)

    # FEED_NAME is optional: a first argument containing '=' already is a
    # parameter and must not be consumed as the feed name.
    feed_name = options.shift unless options.first&.include?('=')

    params = options.to_h do |opt|
      key, value = opt.split('=', 2)
      # Without '=' there is no value and to_h would fail with an opaque TypeError.
      raise Thor::Error, "Invalid parameter '#{opt}', expected param=value" if key.nil? || value.nil?

      [key, value]
    end

    puts Html2rss.feed_from_yaml_config(yaml_file, feed_name, params:)
  end
end
|
29
|
+
end
|