html2rss 0.9.0 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -1
- data/.mergify.yml +15 -0
- data/.rubocop.yml +11 -145
- data/Gemfile +19 -2
- data/Gemfile.lock +111 -97
- data/README.md +323 -270
- data/bin/console +1 -0
- data/exe/html2rss +6 -0
- data/html2rss.gemspec +15 -20
- data/lib/html2rss/attribute_post_processors/gsub.rb +30 -8
- data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +7 -2
- data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +27 -0
- data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +41 -0
- data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +11 -2
- data/lib/html2rss/attribute_post_processors/parse_time.rb +11 -4
- data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -2
- data/lib/html2rss/attribute_post_processors/sanitize_html.rb +40 -44
- data/lib/html2rss/attribute_post_processors/substring.rb +14 -4
- data/lib/html2rss/attribute_post_processors/template.rb +36 -12
- data/lib/html2rss/attribute_post_processors.rb +28 -5
- data/lib/html2rss/cli.rb +29 -0
- data/lib/html2rss/config/channel.rb +117 -0
- data/lib/html2rss/config/selectors.rb +91 -0
- data/lib/html2rss/config.rb +71 -82
- data/lib/html2rss/item.rb +118 -42
- data/lib/html2rss/item_extractors/attribute.rb +20 -7
- data/lib/html2rss/item_extractors/href.rb +20 -4
- data/lib/html2rss/item_extractors/html.rb +18 -6
- data/lib/html2rss/item_extractors/static.rb +18 -7
- data/lib/html2rss/item_extractors/text.rb +17 -5
- data/lib/html2rss/item_extractors.rb +75 -10
- data/lib/html2rss/object_to_xml_converter.rb +56 -0
- data/lib/html2rss/rss_builder/channel.rb +21 -0
- data/lib/html2rss/rss_builder/item.rb +83 -0
- data/lib/html2rss/rss_builder/stylesheet.rb +37 -0
- data/lib/html2rss/rss_builder.rb +96 -0
- data/lib/html2rss/utils.rb +94 -19
- data/lib/html2rss/version.rb +5 -1
- data/lib/html2rss.rb +51 -20
- data/rakefile.rb +16 -0
- metadata +51 -154
- data/.travis.yml +0 -25
- data/CHANGELOG.md +0 -221
- data/lib/html2rss/feed_builder.rb +0 -81
- data/lib/html2rss/item_extractors/current_time.rb +0 -21
- data/support/logo.png +0 -0
@@ -0,0 +1,117 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'addressable'
|
4
|
+
|
5
|
+
module Html2rss
|
6
|
+
class Config
|
7
|
+
##
|
8
|
+
# Holds the configuration for the feed's channel options.
|
9
|
+
# This contains:
|
10
|
+
#
|
11
|
+
# 1. the RSS channel attributes
|
12
|
+
# 2. html2rss options like json or custom HTTP-headers for the request
|
13
|
+
class Channel
  ##
  # Validates the channel hash and interpolates +params+ into its String values.
  #
  # @param channel [Hash<Symbol, Object>] must hold a String at +:url+
  # @param params [Hash] values for the +%<name>+ placeholders; keys are symbolized
  # @raise [ArgumentError] if +channel+ is not a Hash or +channel[:url]+ is not a String
  # @raise [ParamsMissing] if a placeholder has no matching param
  def initialize(channel, params: {})
    raise ArgumentError, 'channel must be a hash' unless channel.is_a?(Hash)
    raise ArgumentError, 'missing key :url' unless channel[:url].is_a?(String)

    symbolized_params = params.transform_keys(&:to_sym)
    @config = process_params(channel, symbolized_params)
  end

  ##
  # The HTTP headers to use for the request; an empty Hash when none are configured.
  #
  # @return [Hash<Symbol, String>]
  def headers
    config.fetch(:headers) { {} }
  end

  ##
  # @return [String] the configured author, 'html2rss' when absent
  def author
    config.fetch(:author) { 'html2rss' }
  end

  ##
  # @return [Integer] the configured ttl, 360 when absent
  def ttl
    config.fetch(:ttl) { 360 }
  end

  ##
  # @return [String] the configured title; falls back to Utils.titleized_url(url)
  def title
    return config[:title] if config.key?(:title)

    Utils.titleized_url(url)
  end

  ##
  # @return [String] language code, 'en' when absent
  def language
    config.fetch(:language) { 'en' }
  end

  ##
  # @return [String] the configured description; falls back to a sentence naming the url
  def description
    return config[:description] if config.key?(:description)

    "Latest items from #{url}."
  end

  ##
  # Parses and normalizes the configured +:url+ on every call.
  #
  # @return [Addressable::URI]
  def url
    parsed = Addressable::URI.parse(config[:url])
    parsed.normalize
  end

  ##
  # @return [String] time_zone name, 'UTC' when absent
  def time_zone
    config.fetch(:time_zone) { 'UTC' }
  end

  ##
  # @return [true, false] the configured +:json+ value, false when absent
  def json?
    config.fetch(:json) { false }
  end

  ##
  # Collects the names of all +%<name>+ placeholders found in the String values.
  #
  # @param config [Hash<Symbol, Object>]
  # @return [Set<String>] the required parameter names
  def self.required_params_for_config(config)
    names = config.flat_map do |_key, value|
      value.is_a?(String) ? value.scan(/%<([\w_\d]+)>/).flatten : []
    end

    names.to_set
  end

  private

  # @return [Hash<Symbol, Object>]
  attr_reader :config

  ##
  # Raises when the config references a placeholder for which no param was given.
  #
  # @param config [Hash<Symbol, Object>]
  # @param params [Hash<Symbol, String>]
  # @return [nil]
  # @raise [ParamsMissing] message lists the missing names, comma separated
  def assert_required_params_presence(config, params)
    provided_names = params.keys.map(&:to_s)
    missing_params = self.class.required_params_for_config(config) - provided_names
    return if missing_params.empty?

    raise ParamsMissing, missing_params.to_a.join(', ')
  end

  ##
  # Sets the variables used in the feed config's channel.
  # Only top-level String values are formatted; all other values pass through.
  #
  # @param config [Hash<Symbol, Object>]
  # @param params [Hash<Symbol, Object>]
  # @return [Hash<Symbol, Object>]
  def process_params(config, params)
    assert_required_params_presence(config, params)

    config.transform_values do |value|
      if value.is_a?(String)
        format(value, params)
      else
        value
      end
    end
  end
end
|
116
|
+
end
|
117
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Html2rss
|
4
|
+
class Config
|
5
|
+
##
|
6
|
+
# Holds the configurations of the selectors.
|
7
|
+
class Selectors
  # Name of the mandatory selector; it is never reported as an item selector.
  ITEMS_SELECTOR_NAME = :items

  # Struct to represent a selector with associated attributes for extraction and processing.
  Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)

  ##
  # @param config [Hash<Symbol, Object>] must hold a Hash at +:items+
  # @raise [ArgumentError] if +config+ is not a Hash or lacks a Hash at +:items+
  def initialize(config)
    validate_config(config)
    @config = config
  end

  ##
  # Tells whether +name+ is a configured selector other than +:items+.
  #
  # @param name [Symbol]
  # @return [true, false]
  def selector?(name)
    name != ITEMS_SELECTOR_NAME && item_selector_names.include?(name)
  end

  ##
  # @param name [Symbol]
  # @return [Selector]
  # @raise [ArgumentError] if +name+ is +:items+ or not configured
  def selector(name)
    raise ArgumentError, "invalid item's selector name: #{name}" unless selector?(name)

    Selector.new(config[name])
  end

  ##
  # @return [Set<Symbol>] the names listed under +:categories+, empty when absent
  def category_selector_names
    selector_keys_for(:categories)
  end

  ##
  # @return [Set<Symbol>] the names listed under +:guid+, +:title_or_description+ when absent
  def guid_selector_names
    selector_keys_for(:guid, default: :title_or_description)
  end

  ##
  # Returns the CSS/XPath selector.
  # Unlike #selector, this performs no name check, so it also serves +:items+.
  #
  # @param name [Symbol]
  # @return [String]
  def selector_string(name)
    Selector.new(config[name]).selector
  end

  ##
  # @return [Set<Symbol>] every configured key except +:items+ (memoized)
  def item_selector_names
    @item_selector_names ||= config.keys.reject { |key| key == ITEMS_SELECTOR_NAME }.to_set
  end

  ##
  # @return [Symbol, nil] the +:order+ configured on the items selector, if any
  def items_order
    config.dig(ITEMS_SELECTOR_NAME, :order)&.to_sym
  end

  private

  # @return [Hash<Symbol, Object>]
  attr_reader :config

  ##
  # Guards against a missing or malformed selectors section.
  # Checking the container first turns a nil config into the intended
  # ArgumentError instead of a NoMethodError.
  #
  # @param config [Object]
  # @return [nil]
  # @raise [ArgumentError]
  def validate_config(config)
    return if config.is_a?(Hash) && config[ITEMS_SELECTOR_NAME].is_a?(Hash)

    raise ArgumentError, 'selector for items is required'
  end

  ##
  # Returns the selector keys for the selector named `name`. If none, returns [default].
  # Blank entries are dropped and the rest symbolized. A new collection is
  # built so the array stored in the config is never mutated (it may be frozen
  # or shared by the caller).
  #
  # @param name [Symbol]
  # @param default [String, Symbol]
  # @return [Set<Symbol>]
  def selector_keys_for(name, default: nil)
    Array(config.fetch(name) { default })
      .reject { |entry| entry.to_s == '' }
      .map(&:to_sym)
      .to_set
  end
end
|
90
|
+
end
|
91
|
+
end
|
data/lib/html2rss/config.rb
CHANGED
@@ -1,95 +1,84 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'forwardable'
|
2
4
|
|
3
5
|
module Html2rss
|
4
6
|
##
|
5
7
|
# The Config class abstracts from the config data structure and
|
6
8
|
# provides default values.
|
7
9
|
class Config
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
channel_config
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
10
|
+
extend Forwardable
|
11
|
+
|
12
|
+
##
|
13
|
+
# The Error class to be thrown when a feed config requires params, but none
|
14
|
+
# were passed to Config.
|
15
|
+
class ParamsMissing < StandardError; end
|
16
|
+
|
17
|
+
##
|
18
|
+
# Thrown when the feed config does not contain a value at `:channel`.
|
19
|
+
class ChannelMissing < StandardError; end
|
20
|
+
|
21
|
+
# Struct to store XML Stylesheet attributes
|
22
|
+
Stylesheet = Struct.new(:href, :type, :media, keyword_init: true)
|
23
|
+
|
24
|
+
def_delegator :@channel, :author
|
25
|
+
def_delegator :@channel, :ttl
|
26
|
+
def_delegator :@channel, :title
|
27
|
+
def_delegator :@channel, :language
|
28
|
+
def_delegator :@channel, :description
|
29
|
+
def_delegator :@channel, :url
|
30
|
+
def_delegator :@channel, :url, :link
|
31
|
+
def_delegator :@channel, :time_zone
|
32
|
+
def_delegator :@channel, :json?
|
33
|
+
|
34
|
+
def_delegator :@selectors, :item_selector_names
|
35
|
+
def_delegator :@selectors, :selector?
|
36
|
+
def_delegator :@selectors, :category_selector_names
|
37
|
+
def_delegator :@selectors, :guid_selector_names
|
38
|
+
def_delegator :@selectors, :items_order
|
39
|
+
def_delegator :@selectors, :selector_string
|
40
|
+
|
41
|
+
##
|
42
|
+
# Initializes the Config object with feed configuration, global settings, and parameters.
|
43
|
+
#
|
44
|
+
# @param feed_config [Hash<Symbol, Object>] The configuration hash containing `:channel` and `:selectors`.
|
45
|
+
# @param global [Hash<Symbol, Object>] Global settings hash.
|
46
|
+
# @param params [Hash<Symbol, String>] Parameters hash.
|
47
|
+
def initialize(feed_config, global = {}, params = {})
|
48
|
+
channel_config = feed_config[:channel]
|
49
|
+
raise ChannelMissing, 'Channel configuration is missing in feed_config' unless channel_config
|
50
|
+
|
51
|
+
@channel = Channel.new(channel_config, params:)
|
52
|
+
@selectors = Selectors.new(feed_config[:selectors])
|
53
|
+
@global = global
|
54
|
+
end
|
55
|
+
|
56
|
+
##
|
57
|
+
# Retrieves selector attributes merged with channel attributes.
|
58
|
+
#
|
59
|
+
# @param name [Symbol] Selector name.
|
60
|
+
# @return [Hash<Symbol, Object>] Merged attributes hash.
|
61
|
+
def selector_attributes_with_channel(name)
|
62
|
+
@selectors.selector(name).to_h.merge(channel: @channel)
|
63
|
+
end
|
64
|
+
|
65
|
+
##
|
66
|
+
# Retrieves headers merged from global settings and channel headers.
|
67
|
+
#
|
68
|
+
# @return [Hash] Merged headers hash.
|
56
69
|
def headers
|
57
|
-
|
58
|
-
end
|
59
|
-
|
60
|
-
def attribute_options(name)
|
61
|
-
feed_config.dig(:selectors).fetch(name, {}).merge(channel: channel_config)
|
62
|
-
end
|
63
|
-
|
64
|
-
def attribute?(name)
|
65
|
-
attribute_names.include?(name)
|
70
|
+
@global.fetch(:headers, {}).merge(@channel.headers)
|
66
71
|
end
|
67
72
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
categories.uniq!
|
75
|
-
|
76
|
-
categories
|
73
|
+
##
|
74
|
+
# Retrieves stylesheets from global settings.
|
75
|
+
#
|
76
|
+
# @return [Array<Stylesheet>] Array of Stylesheet structs.
|
77
|
+
def stylesheets
|
78
|
+
@global.fetch(:stylesheets, []).map { |attributes| Stylesheet.new(attributes) }
|
77
79
|
end
|
78
80
|
|
79
|
-
|
80
|
-
|
81
|
-
end
|
82
|
-
|
83
|
-
def attribute_names
|
84
|
-
@attribute_names ||= feed_config.fetch(:selectors, {}).keys.tap { |attrs| attrs.delete(:items) }
|
85
|
-
end
|
86
|
-
|
87
|
-
def items_order
|
88
|
-
feed_config.dig(:selectors, :items, :order)&.to_sym
|
89
|
-
end
|
90
|
-
|
91
|
-
private
|
92
|
-
|
93
|
-
attr_reader :feed_config, :channel_config, :global_config
|
81
|
+
# Provides read-only access to the channel object.
|
82
|
+
attr_reader :channel
|
94
83
|
end
|
95
84
|
end
|
data/lib/html2rss/item.rb
CHANGED
@@ -1,12 +1,24 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
3
|
require 'nokogiri'
|
4
4
|
|
5
5
|
module Html2rss
|
6
6
|
##
|
7
|
-
# Takes the selected Nokogiri::HTML and responds to
|
7
|
+
# Takes the selected Nokogiri::HTML and responds to accessor names
|
8
8
|
# defined in the feed config.
|
9
|
+
#
|
10
|
+
# Instances can only be created via `.from_url` and
|
11
|
+
# each represents an internally used "RSS item".
|
12
|
+
# Such an item provides dynamically defined attributes as methods.
|
9
13
|
class Item
|
14
|
+
# A context instance is passed to Item Extractors.
|
15
|
+
Context = Struct.new('Context', :options, :item, :config, keyword_init: true)
|
16
|
+
# Class to keep an Item's <enclosure>.
|
17
|
+
Enclosure = Struct.new('Enclosure', :type, :bits_length, :url, keyword_init: true)
|
18
|
+
|
19
|
+
##
|
20
|
+
# @param xml [Nokogiri::XML::Element]
|
21
|
+
# @param config [Html2rss::Config]
|
10
22
|
def initialize(xml, config)
|
11
23
|
@xml = xml
|
12
24
|
@config = config
|
@@ -14,86 +26,150 @@ module Html2rss
|
|
14
26
|
|
15
27
|
private_class_method :new
|
16
28
|
|
29
|
+
##
|
30
|
+
# Checks if the object responds to a method dynamically based on the configuration.
|
31
|
+
#
|
32
|
+
# @param method_name [Symbol]
|
33
|
+
# @param _include_private [true, false]
|
34
|
+
# @return [true, false]
|
17
35
|
def respond_to_missing?(method_name, _include_private = false)
|
18
|
-
config.
|
36
|
+
config.selector?(method_name) || super
|
19
37
|
end
|
20
38
|
|
39
|
+
##
|
40
|
+
# Dynamically extracts data based on the method name.
|
41
|
+
#
|
42
|
+
# @param method_name [Symbol]
|
43
|
+
# @param _args [Array]
|
44
|
+
# @return [String] extracted value for the selector.
|
21
45
|
def method_missing(method_name, *_args)
|
22
46
|
return super unless respond_to_missing?(method_name)
|
23
47
|
|
24
|
-
|
48
|
+
extract(method_name)
|
49
|
+
end
|
25
50
|
|
26
|
-
|
27
|
-
|
51
|
+
##
|
52
|
+
# Selects and processes data according to the selector name.
|
53
|
+
#
|
54
|
+
# @param tag [Symbol]
|
55
|
+
# @return [String] the extracted value for the selector.
|
56
|
+
def extract(tag)
|
57
|
+
attribute_options = config.selector_attributes_with_channel(tag.to_sym)
|
58
|
+
|
59
|
+
post_process(
|
60
|
+
ItemExtractors.item_extractor_factory(attribute_options, xml).get,
|
61
|
+
attribute_options.fetch(:post_process, false)
|
62
|
+
)
|
63
|
+
end
|
28
64
|
|
29
|
-
|
65
|
+
##
|
66
|
+
# Checks if the item is valid according to the RSS 2.0 spec,
|
67
|
+
# by ensuring it has at least a title or a description.
|
68
|
+
#
|
69
|
+
# @return [true, false]
|
70
|
+
def valid?
|
71
|
+
title_or_description.to_s != ''
|
30
72
|
end
|
31
73
|
|
32
|
-
|
33
|
-
|
34
|
-
|
74
|
+
##
|
75
|
+
# Returns either the title or the description, preferring title if available.
|
76
|
+
#
|
77
|
+
# @return [String, nil]
|
78
|
+
def title_or_description
|
79
|
+
return title if config.selector?(:title)
|
80
|
+
|
81
|
+
description if config.selector?(:description)
|
35
82
|
end
|
36
83
|
|
37
84
|
##
|
38
|
-
#
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
85
|
+
#
|
86
|
+
# @return [String] SHA1 hashed GUID.
|
87
|
+
def guid
|
88
|
+
content = config.guid_selector_names.flat_map { |method_name| public_send(method_name) }.join
|
89
|
+
|
90
|
+
Digest::SHA1.hexdigest(content)
|
43
91
|
end
|
44
92
|
|
45
93
|
##
|
46
|
-
#
|
94
|
+
# Retrieves categories for the item based on configured category selectors.
|
95
|
+
#
|
96
|
+
# @return [Array<String>] list of categories.
|
47
97
|
def categories
|
48
|
-
config.
|
98
|
+
config.category_selector_names.map { |method_name| public_send(method_name) }
|
49
99
|
end
|
50
100
|
|
101
|
+
##
|
102
|
+
# Checks if the item has an enclosure based on configuration.
|
103
|
+
#
|
104
|
+
# @return [true, false]
|
51
105
|
def enclosure?
|
52
|
-
config.
|
106
|
+
config.selector?(:enclosure)
|
53
107
|
end
|
54
108
|
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
109
|
+
##
|
110
|
+
# Retrieves enclosure details for the item.
|
111
|
+
#
|
112
|
+
# @return [Enclosure] enclosure details.
|
113
|
+
def enclosure
|
114
|
+
url = enclosure_url
|
115
|
+
|
116
|
+
raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
|
117
|
+
|
118
|
+
Enclosure.new(
|
119
|
+
type: Html2rss::Utils.guess_content_type_from_url(url),
|
120
|
+
bits_length: 0,
|
121
|
+
url: url.to_s
|
122
|
+
)
|
59
123
|
end
|
60
124
|
|
61
125
|
##
|
62
|
-
#
|
126
|
+
# Fetches items from a given URL using configuration settings.
|
127
|
+
#
|
128
|
+
# @param url [String] URL to fetch items from.
|
129
|
+
# @param config [Html2rss::Config] Configuration object.
|
130
|
+
# @return [Array<Html2rss::Item>] list of items fetched.
|
63
131
|
def self.from_url(url, config)
|
64
|
-
body =
|
132
|
+
body = Utils.request_body_from_url(url, convert_json_to_xml: config.json?, headers: config.headers)
|
65
133
|
|
66
|
-
Nokogiri.HTML(body)
|
67
|
-
.
|
68
|
-
.
|
134
|
+
Nokogiri.HTML(body)
|
135
|
+
.css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
|
136
|
+
.map { |xml| new(xml, config) }
|
137
|
+
.select(&:valid?)
|
69
138
|
end
|
70
139
|
|
71
140
|
private
|
72
141
|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
end
|
78
|
-
|
79
|
-
body = request.get.body
|
80
|
-
|
81
|
-
config.json? ? Html2rss::Utils.object_to_xml(JSON.parse(body)) : body
|
82
|
-
end
|
83
|
-
private_class_method :get_body_from_url
|
84
|
-
|
85
|
-
attr_reader :xml, :config
|
142
|
+
# @return [Nokogiri::XML::Element] XML element representing the item.
|
143
|
+
attr_reader :xml
|
144
|
+
# @return [Html2rss::Config] Configuration object for the item.
|
145
|
+
attr_reader :config
|
86
146
|
|
147
|
+
##
|
148
|
+
# Processes the extracted value according to post-processing options.
|
149
|
+
#
|
150
|
+
# @param value [String] extracted value.
|
151
|
+
# @param post_process_options [Hash<Symbol, Object>] post-processing options.
|
152
|
+
# @return [String] processed value.
|
87
153
|
def post_process(value, post_process_options)
|
88
154
|
return value unless post_process_options
|
89
155
|
|
90
156
|
[post_process_options].flatten.each do |options|
|
91
157
|
value = AttributePostProcessors.get_processor(options[:name])
|
92
|
-
.new(value, options
|
158
|
+
.new(value, Context.new(options:, item: self, config:))
|
93
159
|
.get
|
94
160
|
end
|
95
161
|
|
96
162
|
value
|
97
163
|
end
|
164
|
+
|
165
|
+
##
|
166
|
+
# Retrieves the URL for the enclosure, sanitizing and ensuring it's absolute.
|
167
|
+
#
|
168
|
+
# @return [Addressable::URI, nil] absolute URL of the enclosure.
|
169
|
+
def enclosure_url
|
170
|
+
enclosure = Html2rss::Utils.sanitize_url(extract(:enclosure))
|
171
|
+
|
172
|
+
Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url) if enclosure
|
173
|
+
end
|
98
174
|
end
|
99
175
|
end
|
@@ -1,9 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module ItemExtractors
|
3
5
|
##
|
4
6
|
# Returns the value of the attribute.
|
5
7
|
#
|
6
|
-
# Imagine this +time+ HTML
|
8
|
+
# Imagine this +time+ HTML tag with a +datetime+ attribute:
|
7
9
|
#
|
8
10
|
# <time datetime="2019-07-01">...</time>
|
9
11
|
#
|
@@ -18,19 +20,30 @@ module Html2rss
|
|
18
20
|
# Would return:
|
19
21
|
# '2019-07-01'
|
20
22
|
#
|
21
|
-
# In case you're extracting a date or a time,
|
22
|
-
# during post processing with
|
23
|
-
# {AttributePostProcessors::ParseTime}[rdoc-ref:Html2rss::AttributePostProcessors::ParseTime].
|
23
|
+
# In case you're extracting a date or a time, consider parsing it
|
24
|
+
# during post processing with {AttributePostProcessors::ParseTime}.
|
24
25
|
class Attribute
|
26
|
+
# The available options for the attribute extractor.
|
27
|
+
Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true)
|
28
|
+
|
29
|
+
##
|
30
|
+
# Initializes the Attribute extractor.
|
31
|
+
#
|
32
|
+
# @param xml [Nokogiri::XML::Element]
|
33
|
+
# @param options [Options]
|
25
34
|
def initialize(xml, options)
|
26
35
|
@options = options
|
27
|
-
@element = ItemExtractors.element(xml, options)
|
36
|
+
@element = ItemExtractors.element(xml, options.selector)
|
28
37
|
end
|
29
38
|
|
30
39
|
##
|
31
|
-
#
|
40
|
+
# Retrieves and returns the attribute's value as a string.
|
41
|
+
#
|
42
|
+
# @return [String] The value of the attribute.
|
32
43
|
def get
|
33
|
-
@element.attr(@options
|
44
|
+
@element.attr(@options.attribute).to_s.freeze
|
45
|
+
rescue NoMethodError => error
|
46
|
+
raise "Failed to extract attribute: #{error.message}"
|
34
47
|
end
|
35
48
|
end
|
36
49
|
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Html2rss
|
2
4
|
module ItemExtractors
|
3
5
|
##
|
@@ -21,15 +23,29 @@ module Html2rss
|
|
21
23
|
# Would return:
|
22
24
|
# 'http://blog-without-a-feed.example.com/posts/latest-findings'
|
23
25
|
class Href
|
26
|
+
# The available options for the href (attribute) extractor.
|
27
|
+
Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true)
|
28
|
+
|
29
|
+
##
|
30
|
+
# Initializes the Href extractor.
|
31
|
+
#
|
32
|
+
# @param xml [Nokogiri::XML::Element]
|
33
|
+
# @param options [Options]
|
24
34
|
def initialize(xml, options)
|
25
35
|
@options = options
|
26
|
-
element = ItemExtractors.element(xml, options)
|
27
|
-
@href =
|
36
|
+
@element = ItemExtractors.element(xml, options.selector)
|
37
|
+
@href = @element.attr('href').to_s
|
28
38
|
end
|
29
39
|
|
30
|
-
|
40
|
+
##
|
41
|
+
# Retrieves and returns the normalized absolute URL.
|
42
|
+
#
|
43
|
+
# @return [String] The absolute URL.
|
31
44
|
def get
|
32
|
-
|
45
|
+
return nil unless @href
|
46
|
+
|
47
|
+
sanitized_href = Html2rss::Utils.sanitize_url(@href)
|
48
|
+
Html2rss::Utils.build_absolute_url_from_relative(sanitized_href, @options.channel.url)
|
33
49
|
end
|
34
50
|
end
|
35
51
|
end
|