html2rss 0.11.0 → 0.12.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/html2rss/attribute_post_processors/base.rb +71 -0
- data/lib/html2rss/attribute_post_processors/gsub.rb +17 -17
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +6 -7
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +5 -9
- data/lib/html2rss/attribute_post_processors/parse_time.rb +10 -10
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -9
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +7 -9
- data/lib/html2rss/attribute_post_processors/substring.rb +30 -10
- data/lib/html2rss/attribute_post_processors/template.rb +16 -8
- data/lib/html2rss/attribute_post_processors.rb +8 -0
- data/lib/html2rss/config/selectors.rb +13 -2
- data/lib/html2rss/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ebe536d8051a64c6e2adf9fa8e1d9d1f9fa3743541c44ca85022d0603f9032b2
|
4
|
+
data.tar.gz: 7b3aaa213aaf6a37fb6e94fa72c9936ffd2391322297553b253b097edea300cc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '03985002d050b996c1dc315cbe8e3fc79b6619447a048ad3d2dca86f792eab5c2356716cf6198a24efc61de7e7ddceba2780da49c3e68a3c9efe895eb7cf0cf1'
|
7
|
+
data.tar.gz: 8315473528f46a5ba28297af296b879a66ac00f86ba9eb117b4e6c9ec61c285e4090cfd999ff712368f5b988b1cbda460e268aa3ea8928912bcdb1960ae25a4a
|
data/lib/html2rss/attribute_post_processors/base.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
##
|
5
|
+
# Provides a namespace for attribute post processors.
|
6
|
+
module AttributePostProcessors
|
7
|
+
##
|
8
|
+
# All post processors must inherit from this base class and implement `self.validate_args!` and `#get`.
|
9
|
+
class Base
|
10
|
+
# Validates the presence of required options in the context
|
11
|
+
#
|
12
|
+
# @param keys [Array<Symbol>] the keys to check for presence
|
13
|
+
# @param context [Hash] the context containing options
|
14
|
+
# @raise [MissingOption] if any key is missing
|
15
|
+
def self.expect_options(keys, context)
|
16
|
+
keys.each do |key|
|
17
|
+
unless (options = context[:options]).key?(key)
|
18
|
+
raise MissingOption, "The `#{key}` option is missing in: #{options.inspect}", [],
|
19
|
+
cause: nil
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# Asserts that the value is of the expected type(s)
|
25
|
+
#
|
26
|
+
# @param value [Object] the value to check
|
27
|
+
# @param types [Array<Class>, Class] the expected type(s)
|
28
|
+
# @param name [String] the name of the option being checked
|
29
|
+
# @raise [InvalidType] if the value is not of the expected type(s)
|
30
|
+
def self.assert_type(value, types = [], name)
|
31
|
+
types = [types] unless types.is_a?(Array)
|
32
|
+
|
33
|
+
return if types.any? { |type| value.is_a?(type) }
|
34
|
+
|
35
|
+
error_message_template = 'The type of `%s` must be %s, but is: %s'
|
36
|
+
raise InvalidType, format(error_message_template, name, types.join(' or '), value.class), [], cause: nil
|
37
|
+
end
|
38
|
+
|
39
|
+
# private_class_method :expect_options, :assert_type
|
40
|
+
|
41
|
+
##
|
42
|
+
# This method validates the arguments passed to the post processor. Must be implemented by subclasses.
|
43
|
+
def self.validate_args!(_value, _context)
|
44
|
+
raise NotImplementedError, 'You must implement the `validate_args!` method in the post processor'
|
45
|
+
end
|
46
|
+
|
47
|
+
# Initializes the post processor
|
48
|
+
#
|
49
|
+
# @param value [Object] the value to be processed
|
50
|
+
# @param context [Item::Context] the context
|
51
|
+
def initialize(value, context)
|
52
|
+
klass = self.class
|
53
|
+
# TODO: get rid of Hash
|
54
|
+
klass.assert_type(context, [Item::Context, Hash], 'context')
|
55
|
+
klass.validate_args!(value, context)
|
56
|
+
|
57
|
+
@value = value
|
58
|
+
@context = context
|
59
|
+
end
|
60
|
+
|
61
|
+
attr_reader :value, :context
|
62
|
+
|
63
|
+
# Abstract method to be implemented by subclasses
|
64
|
+
#
|
65
|
+
# @raise [NotImplementedError] if not implemented in subclass
|
66
|
+
def get
|
67
|
+
raise NotImplementedError, 'You must implement the `get` method in the post processor'
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
data/lib/html2rss/attribute_post_processors/gsub.rb
CHANGED
@@ -25,39 +25,39 @@ module Html2rss
|
|
25
25
|
# `replacement` can be a String or a Hash.
|
26
26
|
#
|
27
27
|
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
28
|
-
class Gsub
|
28
|
+
class Gsub < Base
|
29
|
+
def self.validate_args!(value, context)
|
30
|
+
assert_type value, String, :value
|
31
|
+
expect_options(%i[replacement pattern], context)
|
32
|
+
assert_type context.dig(:options, :replacement), [String, Hash], :replacement
|
33
|
+
end
|
34
|
+
|
29
35
|
##
|
30
36
|
# @param value [String]
|
31
37
|
# @param context [Item::Context]
|
32
38
|
def initialize(value, context)
|
33
|
-
|
34
|
-
|
39
|
+
super
|
40
|
+
|
41
|
+
options = context[:options]
|
42
|
+
|
43
|
+
@replacement = options[:replacement]
|
44
|
+
@pattern = options[:pattern]
|
35
45
|
end
|
36
46
|
|
37
47
|
##
|
38
48
|
# @return [String]
|
39
49
|
def get
|
40
|
-
|
50
|
+
value.to_s.gsub(pattern, replacement)
|
41
51
|
end
|
42
52
|
|
43
53
|
private
|
44
54
|
|
55
|
+
attr_accessor :replacement
|
56
|
+
|
45
57
|
##
|
46
58
|
# @return [Regexp]
|
47
59
|
def pattern
|
48
|
-
pattern
|
49
|
-
raise ArgumentError, 'The `pattern` option is missing' unless pattern
|
50
|
-
|
51
|
-
pattern.is_a?(String) ? Utils.build_regexp_from_string(pattern) : pattern
|
52
|
-
end
|
53
|
-
|
54
|
-
##
|
55
|
-
# @return [Hash, String]
|
56
|
-
def replacement
|
57
|
-
replacement = @options[:replacement]
|
58
|
-
return replacement if replacement.is_a?(String) || replacement.is_a?(Hash)
|
59
|
-
|
60
|
-
raise ArgumentError, 'The `replacement` option must be a String or Hash'
|
60
|
+
@pattern.is_a?(String) ? Utils.build_regexp_from_string(@pattern) : @pattern
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
data/lib/html2rss/attribute_post_processors/html_to_markdown.rb
CHANGED
@@ -26,18 +26,17 @@ module Html2rss
|
|
26
26
|
#
|
27
27
|
# Would return:
|
28
28
|
# 'Lorem **ipsum** dolor'
|
29
|
-
class HtmlToMarkdown
|
30
|
-
|
31
|
-
|
32
|
-
# @param env [Item::Context]
|
33
|
-
def initialize(value, env)
|
34
|
-
@sanitized_value = SanitizeHtml.new(value, env).get
|
29
|
+
class HtmlToMarkdown < Base
|
30
|
+
def self.validate_args!(value, _context)
|
31
|
+
assert_type value, String, :value
|
35
32
|
end
|
36
33
|
|
37
34
|
##
|
38
35
|
# @return [String] formatted in Markdown
|
39
36
|
def get
|
40
|
-
|
37
|
+
sanitized_value = SanitizeHtml.new(value, context).get
|
38
|
+
|
39
|
+
ReverseMarkdown.convert(sanitized_value)
|
41
40
|
end
|
42
41
|
end
|
43
42
|
end
|
data/lib/html2rss/attribute_post_processors/markdown_to_html.rb
CHANGED
@@ -32,13 +32,9 @@ module Html2rss
|
|
32
32
|
# <h1>Section</h1>
|
33
33
|
#
|
34
34
|
# <p>Price: 12.34</p>
|
35
|
-
class MarkdownToHtml
|
36
|
-
|
37
|
-
|
38
|
-
# @param env [Item::Context] Context object providing additional environment details
|
39
|
-
def initialize(value, env)
|
40
|
-
@value = value
|
41
|
-
@env = env
|
35
|
+
class MarkdownToHtml < Base
|
36
|
+
def self.validate_args!(value, _context)
|
37
|
+
assert_type value, String, :value
|
42
38
|
end
|
43
39
|
|
44
40
|
##
|
@@ -46,8 +42,8 @@ module Html2rss
|
|
46
42
|
#
|
47
43
|
# @return [String] Sanitized HTML content
|
48
44
|
def get
|
49
|
-
html_content = Kramdown::Document.new(
|
50
|
-
SanitizeHtml.new(html_content,
|
45
|
+
html_content = Kramdown::Document.new(value).to_html
|
46
|
+
SanitizeHtml.new(html_content, context).get
|
51
47
|
end
|
52
48
|
end
|
53
49
|
end
|
data/lib/html2rss/attribute_post_processors/parse_time.rb
CHANGED
@@ -24,22 +24,22 @@ module Html2rss
|
|
24
24
|
# Would return:
|
25
25
|
# "Tue, 02 Jul 2019 00:00:00 +0200"
|
26
26
|
#
|
27
|
-
# It uses
|
28
|
-
class ParseTime
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
def initialize(value, env)
|
33
|
-
@value = value.to_s
|
34
|
-
@time_zone = env[:config].time_zone
|
27
|
+
# It uses `Time.parse`.
|
28
|
+
class ParseTime < Base
|
29
|
+
def self.validate_args!(value, context)
|
30
|
+
assert_type value, String, :value
|
31
|
+
assert_type context[:config].time_zone, String, :time_zone
|
35
32
|
end
|
36
33
|
|
37
34
|
##
|
38
|
-
# Converts the provided time string to RFC822 format, taking into account the
|
35
|
+
# Converts the provided time string to RFC822 format, taking into account the time_zone.
|
39
36
|
#
|
40
37
|
# @return [String] RFC822 formatted time
|
38
|
+
# @raise [TZInfo::InvalidTimezoneIdentifier] if the configured time zone is invalid
|
41
39
|
def get
|
42
|
-
|
40
|
+
time_zone = context[:config].time_zone
|
41
|
+
|
42
|
+
Utils.use_zone(time_zone) { Time.parse(value).rfc822 }
|
43
43
|
end
|
44
44
|
end
|
45
45
|
end
|
data/lib/html2rss/attribute_post_processors/parse_uri.rb
CHANGED
@@ -21,21 +21,24 @@ module Html2rss
|
|
21
21
|
#
|
22
22
|
# Would return:
|
23
23
|
# 'http://why-not-use-a-link.uh'
|
24
|
-
class ParseUri
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
24
|
+
class ParseUri < Base
|
25
|
+
def self.validate_args!(value, context)
|
26
|
+
url_types = [String, URI::HTTP, Addressable::URI].freeze
|
27
|
+
|
28
|
+
assert_type(value, url_types, :value)
|
29
|
+
assert_type(context.config.url, url_types, :url)
|
30
|
+
|
31
|
+
raise ArgumentError, 'The `value` option is missing or empty.' if value.to_s.empty?
|
31
32
|
end
|
32
33
|
|
33
34
|
##
|
34
35
|
# @return [String]
|
35
36
|
def get
|
37
|
+
config_url = context.config.url
|
38
|
+
|
36
39
|
Html2rss::Utils.build_absolute_url_from_relative(
|
37
|
-
Html2rss::Utils.sanitize_url(
|
38
|
-
|
40
|
+
Html2rss::Utils.sanitize_url(value),
|
41
|
+
config_url
|
39
42
|
).to_s
|
40
43
|
end
|
41
44
|
end
|
data/lib/html2rss/attribute_post_processors/sanitize_html.rb
CHANGED
@@ -38,19 +38,15 @@ module Html2rss
|
|
38
38
|
#
|
39
39
|
# Would return:
|
40
40
|
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
41
|
-
class SanitizeHtml
|
42
|
-
|
43
|
-
|
44
|
-
# @param env [Item::Context]
|
45
|
-
def initialize(value, env)
|
46
|
-
@value = value
|
47
|
-
@channel_url = env[:config].url
|
41
|
+
class SanitizeHtml < Base
|
42
|
+
def self.validate_args!(value, _context)
|
43
|
+
assert_type value, String, :value
|
48
44
|
end
|
49
45
|
|
50
46
|
##
|
51
47
|
# @return [String]
|
52
48
|
def get
|
53
|
-
sanitized_html = Sanitize.fragment(
|
49
|
+
sanitized_html = Sanitize.fragment(value, sanitize_config)
|
54
50
|
sanitized_html.to_s.gsub(/\s+/, ' ').strip
|
55
51
|
end
|
56
52
|
|
@@ -77,13 +73,15 @@ module Html2rss
|
|
77
73
|
}
|
78
74
|
end
|
79
75
|
|
76
|
+
def channel_url = context[:config].url
|
77
|
+
|
80
78
|
##
|
81
79
|
# Wrapper for transform_urls_to_absolute_ones to pass the channel_url.
|
82
80
|
#
|
83
81
|
# @param env [Hash]
|
84
82
|
# @return [nil]
|
85
83
|
def transform_urls_to_absolute_ones(env)
|
86
|
-
HtmlTransformers::TransformUrlsToAbsoluteOnes.new(
|
84
|
+
HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
|
87
85
|
end
|
88
86
|
|
89
87
|
##
|
data/lib/html2rss/attribute_post_processors/substring.rb
CHANGED
@@ -28,13 +28,15 @@ module Html2rss
|
|
28
28
|
#
|
29
29
|
# Would return:
|
30
30
|
# 'bar'
|
31
|
-
class Substring
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
31
|
+
class Substring < Base
|
32
|
+
def self.validate_args!(value, context)
|
33
|
+
assert_type value, String, :value
|
34
|
+
|
35
|
+
options = context[:options]
|
36
|
+
assert_type options[:start], Integer, :start
|
37
|
+
|
38
|
+
end_index = options[:end]
|
39
|
+
assert_type end_index, Integer, :end if end_index
|
38
40
|
end
|
39
41
|
|
40
42
|
##
|
@@ -42,11 +44,29 @@ module Html2rss
|
|
42
44
|
#
|
43
45
|
# @return [String] The extracted substring.
|
44
46
|
def get
|
45
|
-
|
46
|
-
|
47
|
+
value[range]
|
48
|
+
end
|
49
|
+
|
50
|
+
##
|
51
|
+
# Determines the range for the substring extraction based on the provided start and end indices.
|
52
|
+
#
|
53
|
+
# @return [Range] The range object representing the start and end/Infinity (integers).
|
54
|
+
def range
|
55
|
+
return (start_index..) unless end_index?
|
56
|
+
|
57
|
+
if start_index == end_index
|
58
|
+
raise ArgumentError,
|
59
|
+
'The `start` value must be unequal to the `end` value.'
|
60
|
+
end
|
47
61
|
|
48
|
-
|
62
|
+
(start_index..end_index)
|
49
63
|
end
|
64
|
+
|
65
|
+
private
|
66
|
+
|
67
|
+
def end_index? = !context[:options][:end].to_s.empty?
|
68
|
+
def end_index = context[:options][:end].to_i
|
69
|
+
def start_index = context[:options][:start].to_i
|
50
70
|
end
|
51
71
|
end
|
52
72
|
end
|
data/lib/html2rss/attribute_post_processors/template.rb
CHANGED
@@ -31,15 +31,23 @@ module Html2rss
|
|
31
31
|
#
|
32
32
|
# Would return:
|
33
33
|
# 'Product (23,42€)'
|
34
|
-
class Template
|
34
|
+
class Template < Base
|
35
|
+
def self.validate_args!(value, context)
|
36
|
+
assert_type value, String, :value
|
37
|
+
|
38
|
+
string = context[:options]&.dig(:string).to_s
|
39
|
+
raise InvalidType, 'The `string` template is absent.' if string.empty?
|
40
|
+
end
|
41
|
+
|
35
42
|
##
|
36
43
|
# @param value [String]
|
37
|
-
# @param
|
38
|
-
def initialize(value,
|
39
|
-
|
40
|
-
|
41
|
-
@
|
42
|
-
@
|
44
|
+
# @param context [Item::Context]
|
45
|
+
def initialize(value, context)
|
46
|
+
super
|
47
|
+
|
48
|
+
@options = context[:options] || {}
|
49
|
+
@item = context[:item]
|
50
|
+
@string = @options[:string].to_s
|
43
51
|
end
|
44
52
|
|
45
53
|
##
|
@@ -86,7 +94,7 @@ module Html2rss
|
|
86
94
|
# @param method_name [String, Symbol]
|
87
95
|
# @return [String]
|
88
96
|
def item_value(method_name)
|
89
|
-
method_name.to_sym == :self ?
|
97
|
+
method_name.to_sym == :self ? value : @item.public_send(method_name).to_s
|
90
98
|
end
|
91
99
|
end
|
92
100
|
end
|
data/lib/html2rss/attribute_post_processors.rb
CHANGED
@@ -8,6 +8,14 @@ module Html2rss
|
|
8
8
|
# Error raised when an unknown post processor name is requested.
|
9
9
|
class UnknownPostProcessorName < Html2rss::Error; end
|
10
10
|
|
11
|
+
##
|
12
|
+
# Error raised when a required option is missing.
|
13
|
+
class MissingOption < Html2rss::Error; end
|
14
|
+
|
15
|
+
##
|
16
|
+
# Error raised when an invalid type is provided.
|
17
|
+
class InvalidType < Html2rss::Error; end
|
18
|
+
|
11
19
|
##
|
12
20
|
# Maps the post processor name to the class implementing the post processor.
|
13
21
|
#
|
data/lib/html2rss/config/selectors.rb
CHANGED
@@ -10,6 +10,9 @@ module Html2rss
|
|
10
10
|
# Struct to represent a selector with associated attributes for extraction and processing.
|
11
11
|
Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)
|
12
12
|
|
13
|
+
# raised when an invalid selector name is used
|
14
|
+
class InvalidSelectorName < Html2rss::Error; end
|
15
|
+
|
13
16
|
##
|
14
17
|
# @param config [Hash<Symbol, Object>]
|
15
18
|
def initialize(config)
|
@@ -28,9 +31,15 @@ module Html2rss
|
|
28
31
|
# @param name [Symbol]
|
29
32
|
# @return [Selector]
|
30
33
|
def selector(name)
|
31
|
-
raise
|
34
|
+
raise InvalidSelectorName, "invalid selector name: #{name}" unless selector?(name)
|
35
|
+
|
36
|
+
keywords = config[name].slice(*available_keys)
|
32
37
|
|
33
|
-
|
38
|
+
if (additional_keys = available_keys - keywords.keys).any?
|
39
|
+
warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
|
40
|
+
end
|
41
|
+
|
42
|
+
Selector.new(keywords)
|
34
43
|
end
|
35
44
|
|
36
45
|
##
|
@@ -86,6 +95,8 @@ module Html2rss
|
|
86
95
|
array.map!(&:to_sym)
|
87
96
|
end.to_set
|
88
97
|
end
|
98
|
+
|
99
|
+
def available_keys = @available_keys ||= Selector.members
|
89
100
|
end
|
90
101
|
end
|
91
102
|
end
|
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -219,6 +219,7 @@ files:
|
|
219
219
|
- html2rss.gemspec
|
220
220
|
- lib/html2rss.rb
|
221
221
|
- lib/html2rss/attribute_post_processors.rb
|
222
|
+
- lib/html2rss/attribute_post_processors/base.rb
|
222
223
|
- lib/html2rss/attribute_post_processors/gsub.rb
|
223
224
|
- lib/html2rss/attribute_post_processors/html_to_markdown.rb
|
224
225
|
- lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb
|
@@ -252,7 +253,7 @@ licenses:
|
|
252
253
|
- MIT
|
253
254
|
metadata:
|
254
255
|
allowed_push_host: https://rubygems.org
|
255
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.
|
256
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.12.0
|
256
257
|
rubygems_mfa_required: 'true'
|
257
258
|
post_install_message:
|
258
259
|
rdoc_options: []
|