html2rss 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/html2rss.gemspec +5 -5
- data/lib/html2rss/attribute_post_processors/base.rb +71 -0
- data/lib/html2rss/attribute_post_processors/gsub.rb +17 -17
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +6 -7
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +5 -9
- data/lib/html2rss/attribute_post_processors/parse_time.rb +10 -10
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -9
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +7 -9
- data/lib/html2rss/attribute_post_processors/substring.rb +30 -10
- data/lib/html2rss/attribute_post_processors/template.rb +16 -8
- data/lib/html2rss/attribute_post_processors.rb +9 -1
- data/lib/html2rss/config/channel.rb +9 -9
- data/lib/html2rss/config/selectors.rb +13 -2
- data/lib/html2rss/config.rb +2 -2
- data/lib/html2rss/item.rb +15 -15
- data/lib/html2rss/item_extractors.rb +1 -1
- data/lib/html2rss/version.rb +1 -1
- data/lib/html2rss.rb +6 -0
- metadata +8 -16
- data/.gitignore +0 -12
- data/.mergify.yml +0 -15
- data/.rspec +0 -4
- data/.rubocop.yml +0 -30
- data/.yardopts +0 -6
- data/Gemfile +0 -25
- data/Gemfile.lock +0 -153
- data/bin/console +0 -16
- data/bin/setup +0 -8
- data/rakefile.rb +0 -16
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ebe536d8051a64c6e2adf9fa8e1d9d1f9fa3743541c44ca85022d0603f9032b2
|
4
|
+
data.tar.gz: 7b3aaa213aaf6a37fb6e94fa72c9936ffd2391322297553b253b097edea300cc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '03985002d050b996c1dc315cbe8e3fc79b6619447a048ad3d2dca86f792eab5c2356716cf6198a24efc61de7e7ddceba2780da49c3e68a3c9efe895eb7cf0cf1'
|
7
|
+
data.tar.gz: 8315473528f46a5ba28297af296b879a66ac00f86ba9eb117b4e6c9ec61c285e4090cfd999ff712368f5b988b1cbda460e268aa3ea8928912bcdb1960ae25a4a
|
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|

|
2
2
|
|
3
|
-
[](http://rubygems.org/gems/html2rss/) [](https://www.rubydoc.info/gems/html2rss) 
|
3
|
+
[](http://rubygems.org/gems/html2rss/) [](https://www.rubydoc.info/gems/html2rss) 
|
4
4
|
|
5
5
|
`html2rss` is a Ruby gem that generates RSS 2.0 feeds from a _feed config_.
|
6
6
|
|
data/html2rss.gemspec
CHANGED
@@ -10,23 +10,23 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.authors = ['Gil Desmarais']
|
11
11
|
spec.email = ['html2rss@desmarais.de']
|
12
12
|
|
13
|
-
spec.summary = '
|
14
|
-
spec.description = '
|
13
|
+
spec.summary = 'Generates RSS feeds from websites by scraping a URL and using CSS selectors to extract item.'
|
14
|
+
spec.description = 'Supports JSON content, custom HTTP headers, and post-processing of extracted content.'
|
15
15
|
spec.homepage = 'https://github.com/html2rss/html2rss'
|
16
16
|
spec.license = 'MIT'
|
17
17
|
spec.required_ruby_version = '>= 3.1'
|
18
18
|
|
19
19
|
if spec.respond_to?(:metadata)
|
20
20
|
spec.metadata['allowed_push_host'] = 'https://rubygems.org'
|
21
|
-
spec.metadata['changelog_uri'] =
|
21
|
+
spec.metadata['changelog_uri'] = "#{spec.homepage}/releases/tag/v#{spec.version}"
|
22
22
|
spec.metadata['rubygems_mfa_required'] = 'true'
|
23
23
|
else
|
24
24
|
raise 'RubyGems 2.0 or newer is required to protect against ' \
|
25
25
|
'public gem pushes.'
|
26
26
|
end
|
27
27
|
|
28
|
-
spec.files = `git ls-files -z`.split("\x0").
|
29
|
-
f.match(%r{^(
|
28
|
+
spec.files = `git ls-files -z`.split("\x0").select do |f|
|
29
|
+
f.match(%r{^(lib/|exe/|README.md|LICENSE|html2rss.gemspec)})
|
30
30
|
end
|
31
31
|
spec.bindir = 'exe'
|
32
32
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
##
|
5
|
+
# Provides a namespace for attribute post processors.
|
6
|
+
module AttributePostProcessors
|
7
|
+
##
|
8
|
+
# All post processors must inherit from this base class and implement `self.validate_args!` and `#get`.
|
9
|
+
class Base
|
10
|
+
# Validates the presence of required options in the context
|
11
|
+
#
|
12
|
+
# @param keys [Array<Symbol>] the keys to check for presence
|
13
|
+
# @param context [Hash] the context containing options
|
14
|
+
# @raise [MissingOption] if any key is missing
|
15
|
+
def self.expect_options(keys, context)
|
16
|
+
keys.each do |key|
|
17
|
+
unless (options = context[:options]).key?(key)
|
18
|
+
raise MissingOption, "The `#{key}` option is missing in: #{options.inspect}", [],
|
19
|
+
cause: nil
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# Asserts that the value is of the expected type(s)
|
25
|
+
#
|
26
|
+
# @param value [Object] the value to check
|
27
|
+
# @param types [Array<Class>, Class] the expected type(s)
|
28
|
+
# @param name [String] the name of the option being checked
|
29
|
+
# @raise [InvalidType] if the value is not of the expected type(s)
|
30
|
+
def self.assert_type(value, types = [], name)
|
31
|
+
types = [types] unless types.is_a?(Array)
|
32
|
+
|
33
|
+
return if types.any? { |type| value.is_a?(type) }
|
34
|
+
|
35
|
+
error_message_template = 'The type of `%s` must be %s, but is: %s'
|
36
|
+
raise InvalidType, format(error_message_template, name, types.join(' or '), value.class), [], cause: nil
|
37
|
+
end
|
38
|
+
|
39
|
+
# private_class_method :expect_options, :assert_type
|
40
|
+
|
41
|
+
##
|
42
|
+
# This method validates the arguments passed to the post processor. Must be implemented by subclasses.
|
43
|
+
def self.validate_args!(_value, _context)
|
44
|
+
raise NotImplementedError, 'You must implement the `validate_args!` method in the post processor'
|
45
|
+
end
|
46
|
+
|
47
|
+
# Initializes the post processor
|
48
|
+
#
|
49
|
+
# @param value [Object] the value to be processed
|
50
|
+
# @param context [Item::Context] the context
|
51
|
+
def initialize(value, context)
|
52
|
+
klass = self.class
|
53
|
+
# TODO: get rid of Hash
|
54
|
+
klass.assert_type(context, [Item::Context, Hash], 'context')
|
55
|
+
klass.validate_args!(value, context)
|
56
|
+
|
57
|
+
@value = value
|
58
|
+
@context = context
|
59
|
+
end
|
60
|
+
|
61
|
+
attr_reader :value, :context
|
62
|
+
|
63
|
+
# Abstract method to be implemented by subclasses
|
64
|
+
#
|
65
|
+
# @raise [NotImplementedError] if not implemented in subclass
|
66
|
+
def get
|
67
|
+
raise NotImplementedError, 'You must implement the `get` method in the post processor'
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
@@ -25,39 +25,39 @@ module Html2rss
|
|
25
25
|
# `replacement` can be a String or a Hash.
|
26
26
|
#
|
27
27
|
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
28
|
-
class Gsub
|
28
|
+
class Gsub < Base
|
29
|
+
def self.validate_args!(value, context)
|
30
|
+
assert_type value, String, :value
|
31
|
+
expect_options(%i[replacement pattern], context)
|
32
|
+
assert_type context.dig(:options, :replacement), [String, Hash], :replacement
|
33
|
+
end
|
34
|
+
|
29
35
|
##
|
30
36
|
# @param value [String]
|
31
37
|
# @param context [Item::Context]
|
32
38
|
def initialize(value, context)
|
33
|
-
|
34
|
-
|
39
|
+
super
|
40
|
+
|
41
|
+
options = context[:options]
|
42
|
+
|
43
|
+
@replacement = options[:replacement]
|
44
|
+
@pattern = options[:pattern]
|
35
45
|
end
|
36
46
|
|
37
47
|
##
|
38
48
|
# @return [String]
|
39
49
|
def get
|
40
|
-
|
50
|
+
value.to_s.gsub(pattern, replacement)
|
41
51
|
end
|
42
52
|
|
43
53
|
private
|
44
54
|
|
55
|
+
attr_accessor :replacement
|
56
|
+
|
45
57
|
##
|
46
58
|
# @return [Regexp]
|
47
59
|
def pattern
|
48
|
-
pattern
|
49
|
-
raise ArgumentError, 'The `pattern` option is missing' unless pattern
|
50
|
-
|
51
|
-
pattern.is_a?(String) ? Utils.build_regexp_from_string(pattern) : pattern
|
52
|
-
end
|
53
|
-
|
54
|
-
##
|
55
|
-
# @return [Hash, String]
|
56
|
-
def replacement
|
57
|
-
replacement = @options[:replacement]
|
58
|
-
return replacement if replacement.is_a?(String) || replacement.is_a?(Hash)
|
59
|
-
|
60
|
-
raise ArgumentError, 'The `replacement` option must be a String or Hash'
|
60
|
+
@pattern.is_a?(String) ? Utils.build_regexp_from_string(@pattern) : @pattern
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
@@ -26,18 +26,17 @@ module Html2rss
|
|
26
26
|
#
|
27
27
|
# Would return:
|
28
28
|
# 'Lorem **ipsum** dolor'
|
29
|
-
class HtmlToMarkdown
|
30
|
-
|
31
|
-
|
32
|
-
# @param env [Item::Context]
|
33
|
-
def initialize(value, env)
|
34
|
-
@sanitized_value = SanitizeHtml.new(value, env).get
|
29
|
+
class HtmlToMarkdown < Base
|
30
|
+
def self.validate_args!(value, _context)
|
31
|
+
assert_type value, String, :value
|
35
32
|
end
|
36
33
|
|
37
34
|
##
|
38
35
|
# @return [String] formatted in Markdown
|
39
36
|
def get
|
40
|
-
|
37
|
+
sanitized_value = SanitizeHtml.new(value, context).get
|
38
|
+
|
39
|
+
ReverseMarkdown.convert(sanitized_value)
|
41
40
|
end
|
42
41
|
end
|
43
42
|
end
|
@@ -32,13 +32,9 @@ module Html2rss
|
|
32
32
|
# <h1>Section</h1>
|
33
33
|
#
|
34
34
|
# <p>Price: 12.34</p>
|
35
|
-
class MarkdownToHtml
|
36
|
-
|
37
|
-
|
38
|
-
# @param env [Item::Context] Context object providing additional environment details
|
39
|
-
def initialize(value, env)
|
40
|
-
@value = value
|
41
|
-
@env = env
|
35
|
+
class MarkdownToHtml < Base
|
36
|
+
def self.validate_args!(value, _context)
|
37
|
+
assert_type value, String, :value
|
42
38
|
end
|
43
39
|
|
44
40
|
##
|
@@ -46,8 +42,8 @@ module Html2rss
|
|
46
42
|
#
|
47
43
|
# @return [String] Sanitized HTML content
|
48
44
|
def get
|
49
|
-
html_content = Kramdown::Document.new(
|
50
|
-
SanitizeHtml.new(html_content,
|
45
|
+
html_content = Kramdown::Document.new(value).to_html
|
46
|
+
SanitizeHtml.new(html_content, context).get
|
51
47
|
end
|
52
48
|
end
|
53
49
|
end
|
@@ -24,22 +24,22 @@ module Html2rss
|
|
24
24
|
# Would return:
|
25
25
|
# "Tue, 02 Jul 2019 00:00:00 +0200"
|
26
26
|
#
|
27
|
-
# It uses
|
28
|
-
class ParseTime
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
def initialize(value, env)
|
33
|
-
@value = value.to_s
|
34
|
-
@time_zone = env[:config].time_zone
|
27
|
+
# It uses `Time.parse`.
|
28
|
+
class ParseTime < Base
|
29
|
+
def self.validate_args!(value, context)
|
30
|
+
assert_type value, String, :value
|
31
|
+
assert_type context[:config].time_zone, String, :time_zone
|
35
32
|
end
|
36
33
|
|
37
34
|
##
|
38
|
-
# Converts the provided time string to RFC822 format, taking into account the
|
35
|
+
# Converts the provided time string to RFC822 format, taking into account the time_zone.
|
39
36
|
#
|
40
37
|
# @return [String] RFC822 formatted time
|
38
|
+
# @raise [TZInfo::InvalidTimezoneIdentifier] if the configured time zone is invalid
|
41
39
|
def get
|
42
|
-
|
40
|
+
time_zone = context[:config].time_zone
|
41
|
+
|
42
|
+
Utils.use_zone(time_zone) { Time.parse(value).rfc822 }
|
43
43
|
end
|
44
44
|
end
|
45
45
|
end
|
@@ -21,21 +21,24 @@ module Html2rss
|
|
21
21
|
#
|
22
22
|
# Would return:
|
23
23
|
# 'http://why-not-use-a-link.uh'
|
24
|
-
class ParseUri
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
24
|
+
class ParseUri < Base
|
25
|
+
def self.validate_args!(value, context)
|
26
|
+
url_types = [String, URI::HTTP, Addressable::URI].freeze
|
27
|
+
|
28
|
+
assert_type(value, url_types, :value)
|
29
|
+
assert_type(context.config.url, url_types, :url)
|
30
|
+
|
31
|
+
raise ArgumentError, 'The `value` option is missing or empty.' if value.to_s.empty?
|
31
32
|
end
|
32
33
|
|
33
34
|
##
|
34
35
|
# @return [String]
|
35
36
|
def get
|
37
|
+
config_url = context.config.url
|
38
|
+
|
36
39
|
Html2rss::Utils.build_absolute_url_from_relative(
|
37
|
-
Html2rss::Utils.sanitize_url(
|
38
|
-
|
40
|
+
Html2rss::Utils.sanitize_url(value),
|
41
|
+
config_url
|
39
42
|
).to_s
|
40
43
|
end
|
41
44
|
end
|
@@ -38,19 +38,15 @@ module Html2rss
|
|
38
38
|
#
|
39
39
|
# Would return:
|
40
40
|
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
41
|
-
class SanitizeHtml
|
42
|
-
|
43
|
-
|
44
|
-
# @param env [Item::Context]
|
45
|
-
def initialize(value, env)
|
46
|
-
@value = value
|
47
|
-
@channel_url = env[:config].url
|
41
|
+
class SanitizeHtml < Base
|
42
|
+
def self.validate_args!(value, _context)
|
43
|
+
assert_type value, String, :value
|
48
44
|
end
|
49
45
|
|
50
46
|
##
|
51
47
|
# @return [String]
|
52
48
|
def get
|
53
|
-
sanitized_html = Sanitize.fragment(
|
49
|
+
sanitized_html = Sanitize.fragment(value, sanitize_config)
|
54
50
|
sanitized_html.to_s.gsub(/\s+/, ' ').strip
|
55
51
|
end
|
56
52
|
|
@@ -77,13 +73,15 @@ module Html2rss
|
|
77
73
|
}
|
78
74
|
end
|
79
75
|
|
76
|
+
def channel_url = context[:config].url
|
77
|
+
|
80
78
|
##
|
81
79
|
# Wrapper for transform_urls_to_absolute_ones to pass the channel_url.
|
82
80
|
#
|
83
81
|
# @param env [Hash]
|
84
82
|
# @return [nil]
|
85
83
|
def transform_urls_to_absolute_ones(env)
|
86
|
-
HtmlTransformers::TransformUrlsToAbsoluteOnes.new(
|
84
|
+
HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
|
87
85
|
end
|
88
86
|
|
89
87
|
##
|
@@ -28,13 +28,15 @@ module Html2rss
|
|
28
28
|
#
|
29
29
|
# Would return:
|
30
30
|
# 'bar'
|
31
|
-
class Substring
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
31
|
+
class Substring < Base
|
32
|
+
def self.validate_args!(value, context)
|
33
|
+
assert_type value, String, :value
|
34
|
+
|
35
|
+
options = context[:options]
|
36
|
+
assert_type options[:start], Integer, :start
|
37
|
+
|
38
|
+
end_index = options[:end]
|
39
|
+
assert_type end_index, Integer, :end if end_index
|
38
40
|
end
|
39
41
|
|
40
42
|
##
|
@@ -42,11 +44,29 @@ module Html2rss
|
|
42
44
|
#
|
43
45
|
# @return [String] The extracted substring.
|
44
46
|
def get
|
45
|
-
|
46
|
-
|
47
|
+
value[range]
|
48
|
+
end
|
49
|
+
|
50
|
+
##
|
51
|
+
# Determines the range for the substring extraction based on the provided start and end indices.
|
52
|
+
#
|
53
|
+
# @return [Range] The range object representing the start and end/Infinity (integers).
|
54
|
+
def range
|
55
|
+
return (start_index..) unless end_index?
|
56
|
+
|
57
|
+
if start_index == end_index
|
58
|
+
raise ArgumentError,
|
59
|
+
'The `start` value must be unequal to the `end` value.'
|
60
|
+
end
|
47
61
|
|
48
|
-
|
62
|
+
(start_index..end_index)
|
49
63
|
end
|
64
|
+
|
65
|
+
private
|
66
|
+
|
67
|
+
def end_index? = !context[:options][:end].to_s.empty?
|
68
|
+
def end_index = context[:options][:end].to_i
|
69
|
+
def start_index = context[:options][:start].to_i
|
50
70
|
end
|
51
71
|
end
|
52
72
|
end
|
@@ -31,15 +31,23 @@ module Html2rss
|
|
31
31
|
#
|
32
32
|
# Would return:
|
33
33
|
# 'Product (23,42€)'
|
34
|
-
class Template
|
34
|
+
class Template < Base
|
35
|
+
def self.validate_args!(value, context)
|
36
|
+
assert_type value, String, :value
|
37
|
+
|
38
|
+
string = context[:options]&.dig(:string).to_s
|
39
|
+
raise InvalidType, 'The `string` template is absent.' if string.empty?
|
40
|
+
end
|
41
|
+
|
35
42
|
##
|
36
43
|
# @param value [String]
|
37
|
-
# @param
|
38
|
-
def initialize(value,
|
39
|
-
|
40
|
-
|
41
|
-
@
|
42
|
-
@
|
44
|
+
# @param context [Item::Context]
|
45
|
+
def initialize(value, context)
|
46
|
+
super
|
47
|
+
|
48
|
+
@options = context[:options] || {}
|
49
|
+
@item = context[:item]
|
50
|
+
@string = @options[:string].to_s
|
43
51
|
end
|
44
52
|
|
45
53
|
##
|
@@ -86,7 +94,7 @@ module Html2rss
|
|
86
94
|
# @param method_name [String, Symbol]
|
87
95
|
# @return [String]
|
88
96
|
def item_value(method_name)
|
89
|
-
method_name.to_sym == :self ?
|
97
|
+
method_name.to_sym == :self ? value : @item.public_send(method_name).to_s
|
90
98
|
end
|
91
99
|
end
|
92
100
|
end
|
@@ -6,7 +6,15 @@ module Html2rss
|
|
6
6
|
module AttributePostProcessors
|
7
7
|
##
|
8
8
|
# Error raised when an unknown post processor name is requested.
|
9
|
-
class UnknownPostProcessorName <
|
9
|
+
class UnknownPostProcessorName < Html2rss::Error; end
|
10
|
+
|
11
|
+
##
|
12
|
+
# Error raised when a required option is missing.
|
13
|
+
class MissingOption < Html2rss::Error; end
|
14
|
+
|
15
|
+
##
|
16
|
+
# Error raised when an invalid type is provided.
|
17
|
+
class InvalidType < Html2rss::Error; end
|
10
18
|
|
11
19
|
##
|
12
20
|
# Maps the post processor name to the class implementing the post processor.
|
@@ -11,6 +11,15 @@ module Html2rss
|
|
11
11
|
# 1. the RSS channel attributes
|
12
12
|
# 2. html2rss options like json or custom HTTP-headers for the request
|
13
13
|
class Channel
|
14
|
+
##
|
15
|
+
# @param config [Hash<Symbol, Object>]
|
16
|
+
# @return [Set<String>] the required parameter names
|
17
|
+
def self.required_params_for_config(config)
|
18
|
+
config.each_with_object(Set.new) do |(_, value), required_params|
|
19
|
+
required_params.merge(value.scan(/%<([\w_\d]+)>/).flatten) if value.is_a?(String)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
14
23
|
##
|
15
24
|
# @param channel [Hash<Symbol, Object>]
|
16
25
|
# @param params [Hash]
|
@@ -77,15 +86,6 @@ module Html2rss
|
|
77
86
|
config.fetch(:json, false)
|
78
87
|
end
|
79
88
|
|
80
|
-
##
|
81
|
-
# @param config [Hash<Symbol, Object>]
|
82
|
-
# @return [Set<String>] the required parameter names
|
83
|
-
def self.required_params_for_config(config)
|
84
|
-
config.each_with_object(Set.new) do |(_, value), required_params|
|
85
|
-
required_params.merge(value.scan(/%<([\w_\d]+)>/).flatten) if value.is_a?(String)
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
89
|
private
|
90
90
|
|
91
91
|
# @return [Hash<Symbol, Object>]
|
@@ -10,6 +10,9 @@ module Html2rss
|
|
10
10
|
# Struct to represent a selector with associated attributes for extraction and processing.
|
11
11
|
Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)
|
12
12
|
|
13
|
+
# raised when an invalid selector name is used
|
14
|
+
class InvalidSelectorName < Html2rss::Error; end
|
15
|
+
|
13
16
|
##
|
14
17
|
# @param config [Hash<Symbol, Object>]
|
15
18
|
def initialize(config)
|
@@ -28,9 +31,15 @@ module Html2rss
|
|
28
31
|
# @param name [Symbol]
|
29
32
|
# @return [Selector]
|
30
33
|
def selector(name)
|
31
|
-
raise
|
34
|
+
raise InvalidSelectorName, "invalid selector name: #{name}" unless selector?(name)
|
35
|
+
|
36
|
+
keywords = config[name].slice(*available_keys)
|
32
37
|
|
33
|
-
|
38
|
+
if (additional_keys = available_keys - keywords.keys).any?
|
39
|
+
warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
|
40
|
+
end
|
41
|
+
|
42
|
+
Selector.new(keywords)
|
34
43
|
end
|
35
44
|
|
36
45
|
##
|
@@ -86,6 +95,8 @@ module Html2rss
|
|
86
95
|
array.map!(&:to_sym)
|
87
96
|
end.to_set
|
88
97
|
end
|
98
|
+
|
99
|
+
def available_keys = @available_keys ||= Selector.members
|
89
100
|
end
|
90
101
|
end
|
91
102
|
end
|
data/lib/html2rss/config.rb
CHANGED
@@ -12,11 +12,11 @@ module Html2rss
|
|
12
12
|
##
|
13
13
|
# The Error class to be thrown when a feed config requires params, but none
|
14
14
|
# were passed to Config.
|
15
|
-
class ParamsMissing <
|
15
|
+
class ParamsMissing < Html2rss::Error; end
|
16
16
|
|
17
17
|
##
|
18
18
|
# Thrown when the feed config does not contain a value at `:channel`.
|
19
|
-
class ChannelMissing <
|
19
|
+
class ChannelMissing < Html2rss::Error; end
|
20
20
|
|
21
21
|
# Struct to store XML Stylesheet attributes
|
22
22
|
Stylesheet = Struct.new(:href, :type, :media, keyword_init: true)
|
data/lib/html2rss/item.rb
CHANGED
@@ -16,6 +16,21 @@ module Html2rss
|
|
16
16
|
# Class to keep an Item's <enclosure>.
|
17
17
|
Enclosure = Struct.new('Enclosure', :type, :bits_length, :url, keyword_init: true)
|
18
18
|
|
19
|
+
##
|
20
|
+
# Fetches items from a given URL using configuration settings.
|
21
|
+
#
|
22
|
+
# @param url [String] URL to fetch items from.
|
23
|
+
# @param config [Html2rss::Config] Configuration object.
|
24
|
+
# @return [Array<Html2rss::Item>] list of items fetched.
|
25
|
+
def self.from_url(url, config)
|
26
|
+
body = Utils.request_body_from_url(url, convert_json_to_xml: config.json?, headers: config.headers)
|
27
|
+
|
28
|
+
Nokogiri.HTML(body)
|
29
|
+
.css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
|
30
|
+
.map { |xml| new(xml, config) }
|
31
|
+
.select(&:valid?)
|
32
|
+
end
|
33
|
+
|
19
34
|
##
|
20
35
|
# @param xml [Nokogiri::XML::Element]
|
21
36
|
# @param config [Html2rss::Config]
|
@@ -122,21 +137,6 @@ module Html2rss
|
|
122
137
|
)
|
123
138
|
end
|
124
139
|
|
125
|
-
##
|
126
|
-
# Fetches items from a given URL using configuration settings.
|
127
|
-
#
|
128
|
-
# @param url [String] URL to fetch items from.
|
129
|
-
# @param config [Html2rss::Config] Configuration object.
|
130
|
-
# @return [Array<Html2rss::Item>] list of items fetched.
|
131
|
-
def self.from_url(url, config)
|
132
|
-
body = Utils.request_body_from_url(url, convert_json_to_xml: config.json?, headers: config.headers)
|
133
|
-
|
134
|
-
Nokogiri.HTML(body)
|
135
|
-
.css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
|
136
|
-
.map { |xml| new(xml, config) }
|
137
|
-
.select(&:valid?)
|
138
|
-
end
|
139
|
-
|
140
140
|
private
|
141
141
|
|
142
142
|
# @return [Nokogiri::XML::Element] XML element representing the item.
|
@@ -6,7 +6,7 @@ module Html2rss
|
|
6
6
|
module ItemExtractors
|
7
7
|
##
|
8
8
|
# The Error class to be thrown when an unknown extractor name is requested.
|
9
|
-
class UnknownExtractorName <
|
9
|
+
class UnknownExtractorName < Html2rss::Error; end
|
10
10
|
|
11
11
|
##
|
12
12
|
# Maps the extractor name to the class implementing the extractor.
|
data/lib/html2rss/version.rb
CHANGED
data/lib/html2rss.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-08-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -204,8 +204,8 @@ dependencies:
|
|
204
204
|
- - ">="
|
205
205
|
- !ruby/object:Gem::Version
|
206
206
|
version: '0'
|
207
|
-
description:
|
208
|
-
|
207
|
+
description: Supports JSON content, custom HTTP headers, and post-processing of extracted
|
208
|
+
content.
|
209
209
|
email:
|
210
210
|
- html2rss@desmarais.de
|
211
211
|
executables:
|
@@ -213,21 +213,13 @@ executables:
|
|
213
213
|
extensions: []
|
214
214
|
extra_rdoc_files: []
|
215
215
|
files:
|
216
|
-
- ".gitignore"
|
217
|
-
- ".mergify.yml"
|
218
|
-
- ".rspec"
|
219
|
-
- ".rubocop.yml"
|
220
|
-
- ".yardopts"
|
221
|
-
- Gemfile
|
222
|
-
- Gemfile.lock
|
223
216
|
- LICENSE
|
224
217
|
- README.md
|
225
|
-
- bin/console
|
226
|
-
- bin/setup
|
227
218
|
- exe/html2rss
|
228
219
|
- html2rss.gemspec
|
229
220
|
- lib/html2rss.rb
|
230
221
|
- lib/html2rss/attribute_post_processors.rb
|
222
|
+
- lib/html2rss/attribute_post_processors/base.rb
|
231
223
|
- lib/html2rss/attribute_post_processors/gsub.rb
|
232
224
|
- lib/html2rss/attribute_post_processors/html_to_markdown.rb
|
233
225
|
- lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb
|
@@ -256,13 +248,12 @@ files:
|
|
256
248
|
- lib/html2rss/rss_builder/stylesheet.rb
|
257
249
|
- lib/html2rss/utils.rb
|
258
250
|
- lib/html2rss/version.rb
|
259
|
-
- rakefile.rb
|
260
251
|
homepage: https://github.com/html2rss/html2rss
|
261
252
|
licenses:
|
262
253
|
- MIT
|
263
254
|
metadata:
|
264
255
|
allowed_push_host: https://rubygems.org
|
265
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases
|
256
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.12.0
|
266
257
|
rubygems_mfa_required: 'true'
|
267
258
|
post_install_message:
|
268
259
|
rdoc_options: []
|
@@ -282,5 +273,6 @@ requirements: []
|
|
282
273
|
rubygems_version: 3.5.11
|
283
274
|
signing_key:
|
284
275
|
specification_version: 4
|
285
|
-
summary:
|
276
|
+
summary: Generates RSS feeds from websites by scraping a URL and using CSS selectors
|
277
|
+
to extract item.
|
286
278
|
test_files: []
|
data/.gitignore
DELETED
data/.mergify.yml
DELETED
@@ -1,15 +0,0 @@
|
|
1
|
-
queue_rules:
|
2
|
-
- name: dependabot
|
3
|
-
conditions:
|
4
|
-
- author=dependabot[bot]
|
5
|
-
- status-success=test
|
6
|
-
- base=master
|
7
|
-
|
8
|
-
pull_request_rules:
|
9
|
-
- name: automatic merge for Dependabot pull requests
|
10
|
-
conditions:
|
11
|
-
- author=dependabot[bot]
|
12
|
-
actions:
|
13
|
-
queue:
|
14
|
-
method: squash
|
15
|
-
name: dependabot
|
data/.rspec
DELETED
data/.rubocop.yml
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
require:
|
2
|
-
- rubocop-performance
|
3
|
-
- rubocop-rspec
|
4
|
-
- rubocop-md
|
5
|
-
- rubocop-rake
|
6
|
-
|
7
|
-
AllCops:
|
8
|
-
DisplayCopNames: true
|
9
|
-
NewCops: enable
|
10
|
-
Exclude:
|
11
|
-
- vendor/**/*
|
12
|
-
|
13
|
-
Metrics/BlockLength:
|
14
|
-
Exclude:
|
15
|
-
- "spec/**/*_spec.rb"
|
16
|
-
- html2rss.gemspec
|
17
|
-
|
18
|
-
RSpec/NestedGroups:
|
19
|
-
Exclude:
|
20
|
-
- spec/html2rss_spec.rb
|
21
|
-
|
22
|
-
RSpec/DescribeClass:
|
23
|
-
Exclude:
|
24
|
-
- spec/exe/**/*_spec.rb
|
25
|
-
|
26
|
-
RSpec/NamedSubject:
|
27
|
-
Enabled: false
|
28
|
-
|
29
|
-
Naming/RescuedExceptionsVariableName:
|
30
|
-
PreferredName: error
|
data/.yardopts
DELETED
data/Gemfile
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
source 'https://rubygems.org'
|
4
|
-
|
5
|
-
git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
|
6
|
-
|
7
|
-
# Specify your gem's dependencies in html2rss.gemspec
|
8
|
-
gemspec
|
9
|
-
|
10
|
-
group :development, :test do
|
11
|
-
gem 'byebug'
|
12
|
-
gem 'rake'
|
13
|
-
gem 'rspec', '~> 3.0'
|
14
|
-
gem 'rubocop'
|
15
|
-
gem 'rubocop-md'
|
16
|
-
gem 'rubocop-performance'
|
17
|
-
gem 'rubocop-rake'
|
18
|
-
gem 'rubocop-rspec'
|
19
|
-
gem 'vcr'
|
20
|
-
gem 'yard'
|
21
|
-
end
|
22
|
-
|
23
|
-
group :test do
|
24
|
-
gem 'simplecov', require: false
|
25
|
-
end
|
data/Gemfile.lock
DELETED
@@ -1,153 +0,0 @@
|
|
1
|
-
PATH
|
2
|
-
remote: .
|
3
|
-
specs:
|
4
|
-
html2rss (0.10.0)
|
5
|
-
addressable (~> 2.7)
|
6
|
-
faraday (> 2.0.1, < 3.0)
|
7
|
-
faraday-follow_redirects
|
8
|
-
kramdown
|
9
|
-
mime-types (> 3.0)
|
10
|
-
nokogiri (>= 1.10, < 2.0)
|
11
|
-
regexp_parser
|
12
|
-
reverse_markdown (~> 2.0)
|
13
|
-
rss
|
14
|
-
sanitize (~> 6.0)
|
15
|
-
thor
|
16
|
-
tzinfo
|
17
|
-
zeitwerk
|
18
|
-
|
19
|
-
GEM
|
20
|
-
remote: https://rubygems.org/
|
21
|
-
specs:
|
22
|
-
addressable (2.8.6)
|
23
|
-
public_suffix (>= 2.0.2, < 6.0)
|
24
|
-
ast (2.4.2)
|
25
|
-
byebug (11.1.3)
|
26
|
-
concurrent-ruby (1.2.3)
|
27
|
-
crass (1.0.6)
|
28
|
-
diff-lcs (1.5.1)
|
29
|
-
docile (1.4.0)
|
30
|
-
faraday (2.9.0)
|
31
|
-
faraday-net_http (>= 2.0, < 3.2)
|
32
|
-
faraday-follow_redirects (0.3.0)
|
33
|
-
faraday (>= 1, < 3)
|
34
|
-
faraday-net_http (3.1.0)
|
35
|
-
net-http
|
36
|
-
json (2.7.2)
|
37
|
-
kramdown (2.4.0)
|
38
|
-
rexml
|
39
|
-
language_server-protocol (3.17.0.3)
|
40
|
-
mime-types (3.5.2)
|
41
|
-
mime-types-data (~> 3.2015)
|
42
|
-
mime-types-data (3.2024.0305)
|
43
|
-
mini_portile2 (2.8.6)
|
44
|
-
net-http (0.4.1)
|
45
|
-
uri
|
46
|
-
nokogiri (1.16.5)
|
47
|
-
mini_portile2 (~> 2.8.2)
|
48
|
-
racc (~> 1.4)
|
49
|
-
nokogiri (1.16.5-x86_64-darwin)
|
50
|
-
racc (~> 1.4)
|
51
|
-
nokogiri (1.16.5-x86_64-linux)
|
52
|
-
racc (~> 1.4)
|
53
|
-
parallel (1.24.0)
|
54
|
-
parser (3.3.1.0)
|
55
|
-
ast (~> 2.4.1)
|
56
|
-
racc
|
57
|
-
public_suffix (5.0.5)
|
58
|
-
racc (1.7.3)
|
59
|
-
rainbow (3.1.1)
|
60
|
-
rake (13.2.1)
|
61
|
-
regexp_parser (2.9.0)
|
62
|
-
reverse_markdown (2.1.1)
|
63
|
-
nokogiri
|
64
|
-
rexml (3.3.2)
|
65
|
-
strscan
|
66
|
-
rspec (3.13.0)
|
67
|
-
rspec-core (~> 3.13.0)
|
68
|
-
rspec-expectations (~> 3.13.0)
|
69
|
-
rspec-mocks (~> 3.13.0)
|
70
|
-
rspec-core (3.13.0)
|
71
|
-
rspec-support (~> 3.13.0)
|
72
|
-
rspec-expectations (3.13.0)
|
73
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
74
|
-
rspec-support (~> 3.13.0)
|
75
|
-
rspec-mocks (3.13.0)
|
76
|
-
diff-lcs (>= 1.2.0, < 2.0)
|
77
|
-
rspec-support (~> 3.13.0)
|
78
|
-
rspec-support (3.13.1)
|
79
|
-
rss (0.3.0)
|
80
|
-
rexml
|
81
|
-
rubocop (1.63.4)
|
82
|
-
json (~> 2.3)
|
83
|
-
language_server-protocol (>= 3.17.0)
|
84
|
-
parallel (~> 1.10)
|
85
|
-
parser (>= 3.3.0.2)
|
86
|
-
rainbow (>= 2.2.2, < 4.0)
|
87
|
-
regexp_parser (>= 1.8, < 3.0)
|
88
|
-
rexml (>= 3.2.5, < 4.0)
|
89
|
-
rubocop-ast (>= 1.31.1, < 2.0)
|
90
|
-
ruby-progressbar (~> 1.7)
|
91
|
-
unicode-display_width (>= 2.4.0, < 3.0)
|
92
|
-
rubocop-ast (1.31.3)
|
93
|
-
parser (>= 3.3.1.0)
|
94
|
-
rubocop-capybara (2.20.0)
|
95
|
-
rubocop (~> 1.41)
|
96
|
-
rubocop-factory_bot (2.25.1)
|
97
|
-
rubocop (~> 1.41)
|
98
|
-
rubocop-md (1.2.2)
|
99
|
-
rubocop (>= 1.0)
|
100
|
-
rubocop-performance (1.21.0)
|
101
|
-
rubocop (>= 1.48.1, < 2.0)
|
102
|
-
rubocop-ast (>= 1.31.1, < 2.0)
|
103
|
-
rubocop-rake (0.6.0)
|
104
|
-
rubocop (~> 1.0)
|
105
|
-
rubocop-rspec (2.29.1)
|
106
|
-
rubocop (~> 1.40)
|
107
|
-
rubocop-capybara (~> 2.17)
|
108
|
-
rubocop-factory_bot (~> 2.22)
|
109
|
-
rubocop-rspec_rails (~> 2.28)
|
110
|
-
rubocop-rspec_rails (2.28.3)
|
111
|
-
rubocop (~> 1.40)
|
112
|
-
ruby-progressbar (1.13.0)
|
113
|
-
sanitize (6.1.0)
|
114
|
-
crass (~> 1.0.2)
|
115
|
-
nokogiri (>= 1.12.0)
|
116
|
-
simplecov (0.22.0)
|
117
|
-
docile (~> 1.1)
|
118
|
-
simplecov-html (~> 0.11)
|
119
|
-
simplecov_json_formatter (~> 0.1)
|
120
|
-
simplecov-html (0.12.3)
|
121
|
-
simplecov_json_formatter (0.1.4)
|
122
|
-
strscan (3.1.0)
|
123
|
-
thor (1.3.1)
|
124
|
-
tzinfo (2.0.6)
|
125
|
-
concurrent-ruby (~> 1.0)
|
126
|
-
unicode-display_width (2.5.0)
|
127
|
-
uri (0.13.0)
|
128
|
-
vcr (6.2.0)
|
129
|
-
yard (0.9.36)
|
130
|
-
zeitwerk (2.6.13)
|
131
|
-
|
132
|
-
PLATFORMS
|
133
|
-
ruby
|
134
|
-
x86_64-darwin
|
135
|
-
x86_64-darwin-20
|
136
|
-
x86_64-linux
|
137
|
-
|
138
|
-
DEPENDENCIES
|
139
|
-
byebug
|
140
|
-
html2rss!
|
141
|
-
rake
|
142
|
-
rspec (~> 3.0)
|
143
|
-
rubocop
|
144
|
-
rubocop-md
|
145
|
-
rubocop-performance
|
146
|
-
rubocop-rake
|
147
|
-
rubocop-rspec
|
148
|
-
simplecov
|
149
|
-
vcr
|
150
|
-
yard
|
151
|
-
|
152
|
-
BUNDLED WITH
|
153
|
-
2.4.1
|
data/bin/console
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
# frozen_string_literal: true
|
3
|
-
|
4
|
-
require 'bundler/setup'
|
5
|
-
require 'html2rss'
|
6
|
-
require 'byebug'
|
7
|
-
|
8
|
-
# You can add fixtures and/or initialization code here to make experimenting
|
9
|
-
# with your gem easier. You can also use a different console, if you like.
|
10
|
-
|
11
|
-
# (If you use this, don't forget to add pry to your Gemfile!)
|
12
|
-
# require "pry"
|
13
|
-
# Pry.start
|
14
|
-
|
15
|
-
require 'irb'
|
16
|
-
IRB.start(__FILE__)
|
data/bin/setup
DELETED
data/rakefile.rb
DELETED
@@ -1,16 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'bundler'
|
4
|
-
require 'rake'
|
5
|
-
require 'rspec'
|
6
|
-
require 'rspec/core/rake_task'
|
7
|
-
|
8
|
-
Bundler.setup
|
9
|
-
Bundler::GemHelper.install_tasks
|
10
|
-
|
11
|
-
task default: [:spec]
|
12
|
-
|
13
|
-
desc 'Run all examples'
|
14
|
-
RSpec::Core::RakeTask.new(:spec) do |t|
|
15
|
-
t.ruby_opts = %w[-w]
|
16
|
-
end
|