html2rss 0.11.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/html2rss/attribute_post_processors/base.rb +71 -0
- data/lib/html2rss/attribute_post_processors/gsub.rb +17 -17
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +6 -7
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +5 -9
- data/lib/html2rss/attribute_post_processors/parse_time.rb +10 -10
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -9
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +7 -9
- data/lib/html2rss/attribute_post_processors/substring.rb +30 -10
- data/lib/html2rss/attribute_post_processors/template.rb +16 -8
- data/lib/html2rss/attribute_post_processors.rb +8 -0
- data/lib/html2rss/config/selectors.rb +13 -2
- data/lib/html2rss/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ebe536d8051a64c6e2adf9fa8e1d9d1f9fa3743541c44ca85022d0603f9032b2
|
4
|
+
data.tar.gz: 7b3aaa213aaf6a37fb6e94fa72c9936ffd2391322297553b253b097edea300cc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '03985002d050b996c1dc315cbe8e3fc79b6619447a048ad3d2dca86f792eab5c2356716cf6198a24efc61de7e7ddceba2780da49c3e68a3c9efe895eb7cf0cf1'
|
7
|
+
data.tar.gz: 8315473528f46a5ba28297af296b879a66ac00f86ba9eb117b4e6c9ec61c285e4090cfd999ff712368f5b988b1cbda460e268aa3ea8928912bcdb1960ae25a4a
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
##
|
5
|
+
# Provides a namespace for attribute post processors.
|
6
|
+
module AttributePostProcessors
|
7
|
+
##
|
8
|
+
# All post processors must inherit from this base class and implement `self.validate_args!` and `#get`.
|
9
|
+
class Base
|
10
|
+
# Validates the presence of required options in the context
|
11
|
+
#
|
12
|
+
# @param keys [Array<Symbol>] the keys to check for presence
|
13
|
+
# @param context [Hash] the context containing options
|
14
|
+
# @raise [MissingOption] if any key is missing
|
15
|
+
def self.expect_options(keys, context)
|
16
|
+
keys.each do |key|
|
17
|
+
unless (options = context[:options]).key?(key)
|
18
|
+
raise MissingOption, "The `#{key}` option is missing in: #{options.inspect}", [],
|
19
|
+
cause: nil
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
# Asserts that the value is of the expected type(s)
|
25
|
+
#
|
26
|
+
# @param value [Object] the value to check
|
27
|
+
# @param types [Array<Class>, Class] the expected type(s)
|
28
|
+
# @param name [String] the name of the option being checked
|
29
|
+
# @raise [InvalidType] if the value is not of the expected type(s)
|
30
|
+
def self.assert_type(value, types = [], name)
|
31
|
+
types = [types] unless types.is_a?(Array)
|
32
|
+
|
33
|
+
return if types.any? { |type| value.is_a?(type) }
|
34
|
+
|
35
|
+
error_message_template = 'The type of `%s` must be %s, but is: %s'
|
36
|
+
raise InvalidType, format(error_message_template, name, types.join(' or '), value.class), [], cause: nil
|
37
|
+
end
|
38
|
+
|
39
|
+
# private_class_method :expect_options, :assert_type
|
40
|
+
|
41
|
+
##
|
42
|
+
# This method validates the arguments passed to the post processor. Must be implemented by subclasses.
|
43
|
+
def self.validate_args!(_value, _context)
|
44
|
+
raise NotImplementedError, 'You must implement the `validate_args!` method in the post processor'
|
45
|
+
end
|
46
|
+
|
47
|
+
# Initializes the post processor
|
48
|
+
#
|
49
|
+
# @param value [Object] the value to be processed
|
50
|
+
# @param context [Item::Context] the context
|
51
|
+
def initialize(value, context)
|
52
|
+
klass = self.class
|
53
|
+
# TODO: get rid of Hash
|
54
|
+
klass.assert_type(context, [Item::Context, Hash], 'context')
|
55
|
+
klass.validate_args!(value, context)
|
56
|
+
|
57
|
+
@value = value
|
58
|
+
@context = context
|
59
|
+
end
|
60
|
+
|
61
|
+
attr_reader :value, :context
|
62
|
+
|
63
|
+
# Abstract method to be implemented by subclasses
|
64
|
+
#
|
65
|
+
# @raise [NotImplementedError] if not implemented in subclass
|
66
|
+
def get
|
67
|
+
raise NotImplementedError, 'You must implement the `get` method in the post processor'
|
68
|
+
end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
@@ -25,39 +25,39 @@ module Html2rss
|
|
25
25
|
# `replacement` can be a String or a Hash.
|
26
26
|
#
|
27
27
|
# See the doc on [String#gsub](https://ruby-doc.org/core/String.html#method-i-gsub) for more info.
|
28
|
-
class Gsub
|
28
|
+
class Gsub < Base
|
29
|
+
def self.validate_args!(value, context)
|
30
|
+
assert_type value, String, :value
|
31
|
+
expect_options(%i[replacement pattern], context)
|
32
|
+
assert_type context.dig(:options, :replacement), [String, Hash], :replacement
|
33
|
+
end
|
34
|
+
|
29
35
|
##
|
30
36
|
# @param value [String]
|
31
37
|
# @param context [Item::Context]
|
32
38
|
def initialize(value, context)
|
33
|
-
|
34
|
-
|
39
|
+
super
|
40
|
+
|
41
|
+
options = context[:options]
|
42
|
+
|
43
|
+
@replacement = options[:replacement]
|
44
|
+
@pattern = options[:pattern]
|
35
45
|
end
|
36
46
|
|
37
47
|
##
|
38
48
|
# @return [String]
|
39
49
|
def get
|
40
|
-
|
50
|
+
value.to_s.gsub(pattern, replacement)
|
41
51
|
end
|
42
52
|
|
43
53
|
private
|
44
54
|
|
55
|
+
attr_accessor :replacement
|
56
|
+
|
45
57
|
##
|
46
58
|
# @return [Regexp]
|
47
59
|
def pattern
|
48
|
-
pattern = @options[:pattern]
|
49
|
-
raise ArgumentError, 'The `pattern` option is missing' unless pattern
|
50
|
-
|
51
|
-
pattern.is_a?(String) ? Utils.build_regexp_from_string(pattern) : pattern
|
52
|
-
end
|
53
|
-
|
54
|
-
##
|
55
|
-
# @return [Hash, String]
|
56
|
-
def replacement
|
57
|
-
replacement = @options[:replacement]
|
58
|
-
return replacement if replacement.is_a?(String) || replacement.is_a?(Hash)
|
59
|
-
|
60
|
-
raise ArgumentError, 'The `replacement` option must be a String or Hash'
|
60
|
+
@pattern.is_a?(String) ? Utils.build_regexp_from_string(@pattern) : @pattern
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
@@ -26,18 +26,17 @@ module Html2rss
|
|
26
26
|
#
|
27
27
|
# Would return:
|
28
28
|
# 'Lorem **ipsum** dolor'
|
29
|
-
class HtmlToMarkdown
|
30
|
-
|
31
|
-
|
32
|
-
# @param env [Item::Context]
|
33
|
-
def initialize(value, env)
|
34
|
-
@sanitized_value = SanitizeHtml.new(value, env).get
|
29
|
+
class HtmlToMarkdown < Base
|
30
|
+
def self.validate_args!(value, _context)
|
31
|
+
assert_type value, String, :value
|
35
32
|
end
|
36
33
|
|
37
34
|
##
|
38
35
|
# @return [String] formatted in Markdown
|
39
36
|
def get
|
40
|
-
|
37
|
+
sanitized_value = SanitizeHtml.new(value, context).get
|
38
|
+
|
39
|
+
ReverseMarkdown.convert(sanitized_value)
|
41
40
|
end
|
42
41
|
end
|
43
42
|
end
|
@@ -32,13 +32,9 @@ module Html2rss
|
|
32
32
|
# <h1>Section</h1>
|
33
33
|
#
|
34
34
|
# <p>Price: 12.34</p>
|
35
|
-
class MarkdownToHtml
|
36
|
-
|
37
|
-
|
38
|
-
# @param env [Item::Context] Context object providing additional environment details
|
39
|
-
def initialize(value, env)
|
40
|
-
@value = value
|
41
|
-
@env = env
|
35
|
+
class MarkdownToHtml < Base
|
36
|
+
def self.validate_args!(value, _context)
|
37
|
+
assert_type value, String, :value
|
42
38
|
end
|
43
39
|
|
44
40
|
##
|
@@ -46,8 +42,8 @@ module Html2rss
|
|
46
42
|
#
|
47
43
|
# @return [String] Sanitized HTML content
|
48
44
|
def get
|
49
|
-
html_content = Kramdown::Document.new(
|
50
|
-
SanitizeHtml.new(html_content,
|
45
|
+
html_content = Kramdown::Document.new(value).to_html
|
46
|
+
SanitizeHtml.new(html_content, context).get
|
51
47
|
end
|
52
48
|
end
|
53
49
|
end
|
@@ -24,22 +24,22 @@ module Html2rss
|
|
24
24
|
# Would return:
|
25
25
|
# "Tue, 02 Jul 2019 00:00:00 +0200"
|
26
26
|
#
|
27
|
-
# It uses
|
28
|
-
class ParseTime
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
def initialize(value, env)
|
33
|
-
@value = value.to_s
|
34
|
-
@time_zone = env[:config].time_zone
|
27
|
+
# It uses `Time.parse`.
|
28
|
+
class ParseTime < Base
|
29
|
+
def self.validate_args!(value, context)
|
30
|
+
assert_type value, String, :value
|
31
|
+
assert_type context[:config].time_zone, String, :time_zone
|
35
32
|
end
|
36
33
|
|
37
34
|
##
|
38
|
-
# Converts the provided time string to RFC822 format, taking into account the
|
35
|
+
# Converts the provided time string to RFC822 format, taking into account the time_zone.
|
39
36
|
#
|
40
37
|
# @return [String] RFC822 formatted time
|
38
|
+
# @raise [TZInfo::InvalidTimezoneIdentifier] if the configured time zone is invalid
|
41
39
|
def get
|
42
|
-
|
40
|
+
time_zone = context[:config].time_zone
|
41
|
+
|
42
|
+
Utils.use_zone(time_zone) { Time.parse(value).rfc822 }
|
43
43
|
end
|
44
44
|
end
|
45
45
|
end
|
@@ -21,21 +21,24 @@ module Html2rss
|
|
21
21
|
#
|
22
22
|
# Would return:
|
23
23
|
# 'http://why-not-use-a-link.uh'
|
24
|
-
class ParseUri
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
24
|
+
class ParseUri < Base
|
25
|
+
def self.validate_args!(value, context)
|
26
|
+
url_types = [String, URI::HTTP, Addressable::URI].freeze
|
27
|
+
|
28
|
+
assert_type(value, url_types, :value)
|
29
|
+
assert_type(context.config.url, url_types, :url)
|
30
|
+
|
31
|
+
raise ArgumentError, 'The `value` option is missing or empty.' if value.to_s.empty?
|
31
32
|
end
|
32
33
|
|
33
34
|
##
|
34
35
|
# @return [String]
|
35
36
|
def get
|
37
|
+
config_url = context.config.url
|
38
|
+
|
36
39
|
Html2rss::Utils.build_absolute_url_from_relative(
|
37
|
-
Html2rss::Utils.sanitize_url(
|
38
|
-
|
40
|
+
Html2rss::Utils.sanitize_url(value),
|
41
|
+
config_url
|
39
42
|
).to_s
|
40
43
|
end
|
41
44
|
end
|
@@ -38,19 +38,15 @@ module Html2rss
|
|
38
38
|
#
|
39
39
|
# Would return:
|
40
40
|
# '<p>Lorem <b>ipsum</b> dolor ...</p>'
|
41
|
-
class SanitizeHtml
|
42
|
-
|
43
|
-
|
44
|
-
# @param env [Item::Context]
|
45
|
-
def initialize(value, env)
|
46
|
-
@value = value
|
47
|
-
@channel_url = env[:config].url
|
41
|
+
class SanitizeHtml < Base
|
42
|
+
def self.validate_args!(value, _context)
|
43
|
+
assert_type value, String, :value
|
48
44
|
end
|
49
45
|
|
50
46
|
##
|
51
47
|
# @return [String]
|
52
48
|
def get
|
53
|
-
sanitized_html = Sanitize.fragment(
|
49
|
+
sanitized_html = Sanitize.fragment(value, sanitize_config)
|
54
50
|
sanitized_html.to_s.gsub(/\s+/, ' ').strip
|
55
51
|
end
|
56
52
|
|
@@ -77,13 +73,15 @@ module Html2rss
|
|
77
73
|
}
|
78
74
|
end
|
79
75
|
|
76
|
+
def channel_url = context[:config].url
|
77
|
+
|
80
78
|
##
|
81
79
|
# Wrapper for transform_urls_to_absolute_ones to pass the channel_url.
|
82
80
|
#
|
83
81
|
# @param env [Hash]
|
84
82
|
# @return [nil]
|
85
83
|
def transform_urls_to_absolute_ones(env)
|
86
|
-
HtmlTransformers::TransformUrlsToAbsoluteOnes.new(
|
84
|
+
HtmlTransformers::TransformUrlsToAbsoluteOnes.new(channel_url).call(**env)
|
87
85
|
end
|
88
86
|
|
89
87
|
##
|
@@ -28,13 +28,15 @@ module Html2rss
|
|
28
28
|
#
|
29
29
|
# Would return:
|
30
30
|
# 'bar'
|
31
|
-
class Substring
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
31
|
+
class Substring < Base
|
32
|
+
def self.validate_args!(value, context)
|
33
|
+
assert_type value, String, :value
|
34
|
+
|
35
|
+
options = context[:options]
|
36
|
+
assert_type options[:start], Integer, :start
|
37
|
+
|
38
|
+
end_index = options[:end]
|
39
|
+
assert_type end_index, Integer, :end if end_index
|
38
40
|
end
|
39
41
|
|
40
42
|
##
|
@@ -42,11 +44,29 @@ module Html2rss
|
|
42
44
|
#
|
43
45
|
# @return [String] The extracted substring.
|
44
46
|
def get
|
45
|
-
|
46
|
-
|
47
|
+
value[range]
|
48
|
+
end
|
49
|
+
|
50
|
+
##
|
51
|
+
# Determines the range for the substring extraction based on the provided start and end indices.
|
52
|
+
#
|
53
|
+
# @return [Range] The range object representing the start and end/Infinity (integers).
|
54
|
+
def range
|
55
|
+
return (start_index..) unless end_index?
|
56
|
+
|
57
|
+
if start_index == end_index
|
58
|
+
raise ArgumentError,
|
59
|
+
'The `start` value must be unequal to the `end` value.'
|
60
|
+
end
|
47
61
|
|
48
|
-
|
62
|
+
(start_index..end_index)
|
49
63
|
end
|
64
|
+
|
65
|
+
private
|
66
|
+
|
67
|
+
def end_index? = !context[:options][:end].to_s.empty?
|
68
|
+
def end_index = context[:options][:end].to_i
|
69
|
+
def start_index = context[:options][:start].to_i
|
50
70
|
end
|
51
71
|
end
|
52
72
|
end
|
@@ -31,15 +31,23 @@ module Html2rss
|
|
31
31
|
#
|
32
32
|
# Would return:
|
33
33
|
# 'Product (23,42€)'
|
34
|
-
class Template
|
34
|
+
class Template < Base
|
35
|
+
def self.validate_args!(value, context)
|
36
|
+
assert_type value, String, :value
|
37
|
+
|
38
|
+
string = context[:options]&.dig(:string).to_s
|
39
|
+
raise InvalidType, 'The `string` template is absent.' if string.empty?
|
40
|
+
end
|
41
|
+
|
35
42
|
##
|
36
43
|
# @param value [String]
|
37
|
-
# @param
|
38
|
-
def initialize(value,
|
39
|
-
|
40
|
-
|
41
|
-
@
|
42
|
-
@
|
44
|
+
# @param context [Item::Context]
|
45
|
+
def initialize(value, context)
|
46
|
+
super
|
47
|
+
|
48
|
+
@options = context[:options] || {}
|
49
|
+
@item = context[:item]
|
50
|
+
@string = @options[:string].to_s
|
43
51
|
end
|
44
52
|
|
45
53
|
##
|
@@ -86,7 +94,7 @@ module Html2rss
|
|
86
94
|
# @param method_name [String, Symbol]
|
87
95
|
# @return [String]
|
88
96
|
def item_value(method_name)
|
89
|
-
method_name.to_sym == :self ?
|
97
|
+
method_name.to_sym == :self ? value : @item.public_send(method_name).to_s
|
90
98
|
end
|
91
99
|
end
|
92
100
|
end
|
@@ -8,6 +8,14 @@ module Html2rss
|
|
8
8
|
# Error raised when an unknown post processor name is requested.
|
9
9
|
class UnknownPostProcessorName < Html2rss::Error; end
|
10
10
|
|
11
|
+
##
|
12
|
+
# Error raised when a required option is missing.
|
13
|
+
class MissingOption < Html2rss::Error; end
|
14
|
+
|
15
|
+
##
|
16
|
+
# Error raised when an invalid type is provided.
|
17
|
+
class InvalidType < Html2rss::Error; end
|
18
|
+
|
11
19
|
##
|
12
20
|
# Maps the post processor name to the class implementing the post processor.
|
13
21
|
#
|
@@ -10,6 +10,9 @@ module Html2rss
|
|
10
10
|
# Struct to represent a selector with associated attributes for extraction and processing.
|
11
11
|
Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)
|
12
12
|
|
13
|
+
# raised when an invalid selector name is used
|
14
|
+
class InvalidSelectorName < Html2rss::Error; end
|
15
|
+
|
13
16
|
##
|
14
17
|
# @param config [Hash<Symbol, Object>]
|
15
18
|
def initialize(config)
|
@@ -28,9 +31,15 @@ module Html2rss
|
|
28
31
|
# @param name [Symbol]
|
29
32
|
# @return [Selector]
|
30
33
|
def selector(name)
|
31
|
-
raise
|
34
|
+
raise InvalidSelectorName, "invalid selector name: #{name}" unless selector?(name)
|
35
|
+
|
36
|
+
keywords = config[name].slice(*available_keys)
|
32
37
|
|
33
|
-
|
38
|
+
if (additional_keys = available_keys - keywords.keys).any?
|
39
|
+
warn "additional keys (#{additional_keys.join(', ')}) present in selector #{name}"
|
40
|
+
end
|
41
|
+
|
42
|
+
Selector.new(keywords)
|
34
43
|
end
|
35
44
|
|
36
45
|
##
|
@@ -86,6 +95,8 @@ module Html2rss
|
|
86
95
|
array.map!(&:to_sym)
|
87
96
|
end.to_set
|
88
97
|
end
|
98
|
+
|
99
|
+
def available_keys = @available_keys ||= Selector.members
|
89
100
|
end
|
90
101
|
end
|
91
102
|
end
|
data/lib/html2rss/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2rss
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.0
|
4
|
+
version: 0.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gil Desmarais
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-
|
11
|
+
date: 2024-08-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: addressable
|
@@ -219,6 +219,7 @@ files:
|
|
219
219
|
- html2rss.gemspec
|
220
220
|
- lib/html2rss.rb
|
221
221
|
- lib/html2rss/attribute_post_processors.rb
|
222
|
+
- lib/html2rss/attribute_post_processors/base.rb
|
222
223
|
- lib/html2rss/attribute_post_processors/gsub.rb
|
223
224
|
- lib/html2rss/attribute_post_processors/html_to_markdown.rb
|
224
225
|
- lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb
|
@@ -252,7 +253,7 @@ licenses:
|
|
252
253
|
- MIT
|
253
254
|
metadata:
|
254
255
|
allowed_push_host: https://rubygems.org
|
255
|
-
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.11.0
|
256
|
+
changelog_uri: https://github.com/html2rss/html2rss/releases/tag/v0.12.0
|
256
257
|
rubygems_mfa_required: 'true'
|
257
258
|
post_install_message:
|
258
259
|
rdoc_options: []
|