html2rss 0.9.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/.mergify.yml +15 -0
  4. data/.rubocop.yml +11 -145
  5. data/Gemfile +19 -2
  6. data/Gemfile.lock +111 -97
  7. data/README.md +323 -270
  8. data/bin/console +1 -0
  9. data/exe/html2rss +6 -0
  10. data/html2rss.gemspec +15 -20
  11. data/lib/html2rss/attribute_post_processors/gsub.rb +30 -8
  12. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +7 -2
  13. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +27 -0
  14. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +41 -0
  15. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +11 -2
  16. data/lib/html2rss/attribute_post_processors/parse_time.rb +11 -4
  17. data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -2
  18. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +40 -44
  19. data/lib/html2rss/attribute_post_processors/substring.rb +14 -4
  20. data/lib/html2rss/attribute_post_processors/template.rb +36 -12
  21. data/lib/html2rss/attribute_post_processors.rb +28 -5
  22. data/lib/html2rss/cli.rb +29 -0
  23. data/lib/html2rss/config/channel.rb +117 -0
  24. data/lib/html2rss/config/selectors.rb +91 -0
  25. data/lib/html2rss/config.rb +71 -82
  26. data/lib/html2rss/item.rb +118 -42
  27. data/lib/html2rss/item_extractors/attribute.rb +20 -7
  28. data/lib/html2rss/item_extractors/href.rb +20 -4
  29. data/lib/html2rss/item_extractors/html.rb +18 -6
  30. data/lib/html2rss/item_extractors/static.rb +18 -7
  31. data/lib/html2rss/item_extractors/text.rb +17 -5
  32. data/lib/html2rss/item_extractors.rb +75 -10
  33. data/lib/html2rss/object_to_xml_converter.rb +56 -0
  34. data/lib/html2rss/rss_builder/channel.rb +21 -0
  35. data/lib/html2rss/rss_builder/item.rb +83 -0
  36. data/lib/html2rss/rss_builder/stylesheet.rb +37 -0
  37. data/lib/html2rss/rss_builder.rb +96 -0
  38. data/lib/html2rss/utils.rb +94 -19
  39. data/lib/html2rss/version.rb +5 -1
  40. data/lib/html2rss.rb +51 -20
  41. data/rakefile.rb +16 -0
  42. metadata +51 -154
  43. data/.travis.yml +0 -25
  44. data/CHANGELOG.md +0 -221
  45. data/lib/html2rss/feed_builder.rb +0 -81
  46. data/lib/html2rss/item_extractors/current_time.rb +0 -21
  47. data/support/logo.png +0 -0
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'addressable'
4
+
5
+ module Html2rss
6
+ class Config
7
+ ##
8
+ # Holds the configuration for the feed's channel options.
9
+ # This contains:
10
+ #
11
+ # 1. the RSS channel attributes
12
+ # 2. html2rss options like json or custom HTTP-headers for the request
13
+ class Channel
14
+ ##
15
+ # @param channel [Hash<Symbol, Object>]
16
+ # @param params [Hash]
17
+ def initialize(channel, params: {})
18
+ raise ArgumentError, 'channel must be a hash' unless channel.is_a?(Hash)
19
+ raise ArgumentError, 'missing key :url' unless channel[:url].is_a?(String)
20
+
21
+ @config = process_params(channel, params.transform_keys(&:to_sym))
22
+ end
23
+
24
+ ##
25
+ # The HTTP headers to use for the request.
26
+ #
27
+ # @return [Hash<Symbol, String>]
28
+ def headers
29
+ config.fetch(:headers, {})
30
+ end
31
+
32
+ ##
33
+ # @return [String]
34
+ def author
35
+ config.fetch(:author, 'html2rss')
36
+ end
37
+
38
+ ##
39
+ # @return [Integer]
40
+ def ttl
41
+ config.fetch(:ttl, 360)
42
+ end
43
+
44
+ ##
45
+ # @return [String]
46
+ def title
47
+ config.fetch(:title) { Utils.titleized_url(url) }
48
+ end
49
+
50
+ ##
51
+ # @return [String] language code
52
+ def language
53
+ config.fetch(:language, 'en')
54
+ end
55
+
56
+ ##
57
+ # @return [String]
58
+ def description
59
+ config.fetch(:description) { "Latest items from #{url}." }
60
+ end
61
+
62
+ ##
63
+ # @return [Addressable::URI]
64
+ def url
65
+ Addressable::URI.parse(config[:url]).normalize
66
+ end
67
+
68
+ ##
69
+ # @return [String] time_zone name
70
+ def time_zone
71
+ config.fetch(:time_zone, 'UTC')
72
+ end
73
+
74
+ ##
75
+ # @return [true, false]
76
+ def json?
77
+ config.fetch(:json, false)
78
+ end
79
+
80
+ ##
81
+ # @param config [Hash<Symbol, Object>]
82
+ # @return [Set<String>] the required parameter names
83
+ def self.required_params_for_config(config)
84
+ config.each_with_object(Set.new) do |(_, value), required_params|
85
+ required_params.merge(value.scan(/%<([\w_\d]+)>/).flatten) if value.is_a?(String)
86
+ end
87
+ end
88
+
89
+ private
90
+
91
+ # @return [Hash<Symbol, Object>]
92
+ attr_reader :config
93
+
94
+ ##
95
+ # @param config [Hash<Symbol, Object>]
96
+ # @param params [Hash<Symbol, String>]
97
+ # @return [nil]
98
+ def assert_required_params_presence(config, params)
99
+ missing_params = self.class.required_params_for_config(config) - params.keys.map(&:to_s)
100
+ raise ParamsMissing, missing_params.to_a.join(', ') unless missing_params.empty?
101
+ end
102
+
103
+ ##
104
+ # Sets the variables used in the feed config's channel.
105
+ #
106
+ # @param config [Hash<Symbol, Object>]
107
+ # @param params [Hash<Symbol, Object>]
108
+ # @return [Hash<Symbol, Object>]
109
+ def process_params(config, params)
110
+ assert_required_params_presence(config, params)
111
+ config.transform_values do |value|
112
+ value.is_a?(String) ? format(value, params) : value
113
+ end
114
+ end
115
+ end
116
+ end
117
+ end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Config
5
+ ##
6
+ # Holds the configurations of the selectors.
7
+ class Selectors
8
+ ITEMS_SELECTOR_NAME = :items
9
+
10
+ # Struct to represent a selector with associated attributes for extraction and processing.
11
+ Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)
12
+
13
+ ##
14
+ # @param config [Hash<Symbol, Object>]
15
+ def initialize(config)
16
+ validate_config(config)
17
+ @config = config
18
+ end
19
+
20
+ ##
21
+ # @param name [Symbol]
22
+ # @return [true, false]
23
+ def selector?(name)
24
+ name != ITEMS_SELECTOR_NAME && item_selector_names.include?(name)
25
+ end
26
+
27
+ ##
28
+ # @param name [Symbol]
29
+ # @return [Selector]
30
+ def selector(name)
31
+ raise ArgumentError, "invalid item's selector name: #{name}" unless selector?(name)
32
+
33
+ Selector.new(config[name])
34
+ end
35
+
36
+ ##
37
+ # @return [Set<Symbol>]
38
+ def category_selector_names
39
+ selector_keys_for(:categories)
40
+ end
41
+
42
+ ##
43
+ # @return [Set<Symbol>]
44
+ def guid_selector_names
45
+ selector_keys_for(:guid, default: :title_or_description)
46
+ end
47
+
48
+ ##
49
+ # Returns the CSS/XPath selector.
50
+ #
51
+ # @param name [Symbol]
52
+ # @return [String]
53
+ def selector_string(name)
54
+ Selector.new(config[name]).selector
55
+ end
56
+
57
+ ##
58
+ # @return [Set<Symbol>]
59
+ def item_selector_names
60
+ @item_selector_names ||= config.keys.reject { |key| key == ITEMS_SELECTOR_NAME }.to_set
61
+ end
62
+
63
+ ##
64
+ # @return [Symbol, nil]
65
+ def items_order
66
+ config.dig(ITEMS_SELECTOR_NAME, :order)&.to_sym
67
+ end
68
+
69
+ private
70
+
71
+ attr_reader :config
72
+
73
+ def validate_config(config)
74
+ raise ArgumentError, 'selector for items is required' unless config[ITEMS_SELECTOR_NAME].is_a?(Hash)
75
+ end
76
+
77
+ ##
78
+ # Returns the selector keys for the selector named `name`. If none, returns [default].
79
+ #
80
+ # @param name [Symbol]
81
+ # @param default [String, Symbol]
82
+ # @return [Set<Symbol>]
83
+ def selector_keys_for(name, default: nil)
84
+ config.fetch(name) { Array(default) }.tap do |array|
85
+ array.reject! { |entry| entry.to_s == '' }
86
+ array.map!(&:to_sym)
87
+ end.to_set
88
+ end
89
+ end
90
+ end
91
+ end
@@ -1,95 +1,84 @@
1
- require 'active_support/core_ext/hash'
1
+ # frozen_string_literal: true
2
+
3
+ require 'forwardable'
2
4
 
3
5
  module Html2rss
4
6
  ##
5
7
  # The Config class abstracts from the config data structure and
6
8
  # provides default values.
7
9
  class Config
8
- def initialize(feed_config, global_config = {})
9
- @global_config = global_config.deep_symbolize_keys
10
- @feed_config = feed_config.deep_symbolize_keys
11
- @channel_config = @feed_config.fetch(:channel, {})
12
- end
13
-
14
- def author
15
- channel_config.fetch :author, 'html2rss'
16
- end
17
-
18
- def ttl
19
- channel_config.fetch :ttl, 360
20
- end
21
-
22
- def title
23
- channel_config.fetch(:title) { generated_title }
24
- end
25
-
26
- def generated_title
27
- uri = URI(url)
28
-
29
- nicer_path = uri.path.split('/')
30
- nicer_path.reject! { |part| part == '' }
31
-
32
- nicer_path.any? ? "#{uri.host}: #{nicer_path.join(' ').titleize}" : uri.host
33
- end
34
-
35
- def language
36
- channel_config.fetch :language, 'en'
37
- end
38
-
39
- def description
40
- channel_config.fetch :description, "Latest items from #{url}."
41
- end
42
-
43
- def url
44
- channel_config.dig :url
45
- end
46
- alias link url
47
-
48
- def time_zone
49
- channel_config.fetch :time_zone, 'UTC'
50
- end
51
-
52
- def json?
53
- channel_config.fetch :json, false
54
- end
55
-
10
+ extend Forwardable
11
+
12
+ ##
13
+ # The error raised when a feed config requires params which
14
+ # were not (all) passed to Config.
15
+ class ParamsMissing < StandardError; end
16
+
17
+ ##
18
+ # Thrown when the feed config does not contain a value at `:channel`.
19
+ class ChannelMissing < StandardError; end
20
+
21
+ # Struct to store XML Stylesheet attributes
22
+ Stylesheet = Struct.new(:href, :type, :media, keyword_init: true)
23
+
24
+ def_delegator :@channel, :author
25
+ def_delegator :@channel, :ttl
26
+ def_delegator :@channel, :title
27
+ def_delegator :@channel, :language
28
+ def_delegator :@channel, :description
29
+ def_delegator :@channel, :url
30
+ def_delegator :@channel, :url, :link
31
+ def_delegator :@channel, :time_zone
32
+ def_delegator :@channel, :json?
33
+
34
+ def_delegator :@selectors, :item_selector_names
35
+ def_delegator :@selectors, :selector?
36
+ def_delegator :@selectors, :category_selector_names
37
+ def_delegator :@selectors, :guid_selector_names
38
+ def_delegator :@selectors, :items_order
39
+ def_delegator :@selectors, :selector_string
40
+
41
+ ##
42
+ # Initializes the Config object with feed configuration, global settings, and parameters.
43
+ #
44
+ # @param feed_config [Hash<Symbol, Object>] The configuration hash containing `:channel` and `:selectors`.
45
+ # @param global [Hash<Symbol, Object>] Global settings hash.
46
+ # @param params [Hash<Symbol, String>] Parameters hash.
47
+ def initialize(feed_config, global = {}, params = {})
48
+ channel_config = feed_config[:channel]
49
+ raise ChannelMissing, 'Channel configuration is missing in feed_config' unless channel_config
50
+
51
+ @channel = Channel.new(channel_config, params:)
52
+ @selectors = Selectors.new(feed_config[:selectors])
53
+ @global = global
54
+ end
55
+
56
+ ##
57
+ # Retrieves selector attributes merged with channel attributes.
58
+ #
59
+ # @param name [Symbol] Selector name.
60
+ # @return [Hash<Symbol, Object>] Merged attributes hash.
61
+ def selector_attributes_with_channel(name)
62
+ @selectors.selector(name).to_h.merge(channel: @channel)
63
+ end
64
+
65
+ ##
66
+ # Retrieves headers merged from global settings and channel headers.
67
+ #
68
+ # @return [Hash] Merged headers hash.
56
69
  def headers
57
- global_config.fetch(:headers, {}).merge(channel_config.fetch(:headers, {}))
58
- end
59
-
60
- def attribute_options(name)
61
- feed_config.dig(:selectors).fetch(name, {}).merge(channel: channel_config)
62
- end
63
-
64
- def attribute?(name)
65
- attribute_names.include?(name)
70
+ @global.fetch(:headers, {}).merge(@channel.headers)
66
71
  end
67
72
 
68
- def category_selectors
69
- categories = feed_config.dig(:selectors, :categories)
70
- return [] unless categories
71
-
72
- categories = categories.keep_if { |category| category.to_s != '' }
73
- categories.map!(&:to_sym)
74
- categories.uniq!
75
-
76
- categories
73
+ ##
74
+ # Retrieves stylesheets from global settings.
75
+ #
76
+ # @return [Array<Stylesheet>] Array of Stylesheet structs.
77
+ def stylesheets
78
+ @global.fetch(:stylesheets, []).map { |attributes| Stylesheet.new(attributes) }
77
79
  end
78
80
 
79
- def selector(name)
80
- feed_config.dig(:selectors, name, :selector)
81
- end
82
-
83
- def attribute_names
84
- @attribute_names ||= feed_config.fetch(:selectors, {}).keys.tap { |attrs| attrs.delete(:items) }
85
- end
86
-
87
- def items_order
88
- feed_config.dig(:selectors, :items, :order)&.to_sym
89
- end
90
-
91
- private
92
-
93
- attr_reader :feed_config, :channel_config, :global_config
81
+ # Provides read-only access to the channel object.
82
+ attr_reader :channel
94
83
  end
95
84
  end
data/lib/html2rss/item.rb CHANGED
@@ -1,12 +1,24 @@
1
- require 'faraday'
2
- require 'faraday_middleware'
1
+ # frozen_string_literal: true
2
+
3
3
  require 'nokogiri'
4
4
 
5
5
  module Html2rss
6
6
  ##
7
- # Takes the selected Nokogiri::HTML and responds to accessors names
7
+ # Takes the selected Nokogiri::HTML and responds to accessor names
8
8
  # defined in the feed config.
9
+ #
10
+ # Instances can only be created via `.from_url` and
11
+ # each represents an internally used "RSS item".
12
+ # Such an item provides dynamically defined attributes as methods.
9
13
  class Item
14
+ # A context instance is passed to Item Extractors.
15
+ Context = Struct.new('Context', :options, :item, :config, keyword_init: true)
16
+ # Class to keep an Item's <enclosure>.
17
+ Enclosure = Struct.new('Enclosure', :type, :bits_length, :url, keyword_init: true)
18
+
19
+ ##
20
+ # @param xml [Nokogiri::XML::Element]
21
+ # @param config [Html2rss::Config]
10
22
  def initialize(xml, config)
11
23
  @xml = xml
12
24
  @config = config
@@ -14,86 +26,150 @@ module Html2rss
14
26
 
15
27
  private_class_method :new
16
28
 
29
+ ##
30
+ # Checks if the object responds to a method dynamically based on the configuration.
31
+ #
32
+ # @param method_name [Symbol]
33
+ # @param _include_private [true, false]
34
+ # @return [true, false]
17
35
  def respond_to_missing?(method_name, _include_private = false)
18
- config.attribute?(method_name) || super
36
+ config.selector?(method_name) || super
19
37
  end
20
38
 
39
+ ##
40
+ # Dynamically extracts data based on the method name.
41
+ #
42
+ # @param method_name [Symbol]
43
+ # @param _args [Array]
44
+ # @return [String] extracted value for the selector.
21
45
  def method_missing(method_name, *_args)
22
46
  return super unless respond_to_missing?(method_name)
23
47
 
24
- attribute_options = config.attribute_options(method_name)
48
+ extract(method_name)
49
+ end
25
50
 
26
- extractor = ItemExtractors.get_extractor(attribute_options[:extractor])
27
- value = extractor.new(xml, attribute_options).get
51
+ ##
52
+ # Selects and processes data according to the selector name.
53
+ #
54
+ # @param tag [Symbol]
55
+ # @return [String] the extracted value for the selector.
56
+ def extract(tag)
57
+ attribute_options = config.selector_attributes_with_channel(tag.to_sym)
58
+
59
+ post_process(
60
+ ItemExtractors.item_extractor_factory(attribute_options, xml).get,
61
+ attribute_options.fetch(:post_process, false)
62
+ )
63
+ end
28
64
 
29
- post_process(value, attribute_options.fetch(:post_process, false))
65
+ ##
66
+ # Checks if the item is valid according to the RSS 2.0 spec,
67
+ # by ensuring it has at least a title or a description.
68
+ #
69
+ # @return [true, false]
70
+ def valid?
71
+ title_or_description.to_s != ''
30
72
  end
31
73
 
32
- def available_attributes
33
- @available_attributes ||= (%i[title link description author comments updated] &
34
- @config.attribute_names) - %i[categories enclosure]
74
+ ##
75
+ # Returns either the title or the description, preferring title if available.
76
+ #
77
+ # @return [String, nil]
78
+ def title_or_description
79
+ return title if config.selector?(:title)
80
+
81
+ description if config.selector?(:description)
35
82
  end
36
83
 
37
84
  ##
38
- # At least a title or a description is required to be a valid RSS 2.0 item.
39
- def valid?
40
- title = self.title if config.attribute?(:title)
41
- description = self.description if config.attribute?(:description)
42
- [title, description].join != ''
85
+ # Builds the item's GUID by joining the values of the configured guid selectors and hashing them.
86
+ # @return [String] SHA1 hashed GUID.
87
+ def guid
88
+ content = config.guid_selector_names.flat_map { |method_name| public_send(method_name) }.join
89
+
90
+ Digest::SHA1.hexdigest(content)
43
91
  end
44
92
 
45
93
  ##
46
- # @return [Array]
94
+ # Retrieves categories for the item based on configured category selectors.
95
+ #
96
+ # @return [Array<String>] list of categories.
47
97
  def categories
48
- config.category_selectors.map(&method(:method_missing))
98
+ config.category_selector_names.map { |method_name| public_send(method_name) }
49
99
  end
50
100
 
101
+ ##
102
+ # Checks if the item has an enclosure based on configuration.
103
+ #
104
+ # @return [true, false]
51
105
  def enclosure?
52
- config.attribute?(:enclosure)
106
+ config.selector?(:enclosure)
53
107
  end
54
108
 
55
- def enclosure_url
56
- enclosure = Html2rss::Utils.sanitize_url(method_missing(:enclosure))
57
-
58
- Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url).to_s if enclosure
109
+ ##
110
+ # Retrieves enclosure details for the item.
111
+ #
112
+ # @return [Enclosure] enclosure details.
113
+ def enclosure
114
+ url = enclosure_url
115
+
116
+ raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
117
+
118
+ Enclosure.new(
119
+ type: Html2rss::Utils.guess_content_type_from_url(url),
120
+ bits_length: 0,
121
+ url: url.to_s
122
+ )
59
123
  end
60
124
 
61
125
  ##
62
- # @return [Array]
126
+ # Fetches items from a given URL using configuration settings.
127
+ #
128
+ # @param url [String] URL to fetch items from.
129
+ # @param config [Html2rss::Config] Configuration object.
130
+ # @return [Array<Html2rss::Item>] list of items fetched.
63
131
  def self.from_url(url, config)
64
- body = get_body_from_url(url, config)
132
+ body = Utils.request_body_from_url(url, convert_json_to_xml: config.json?, headers: config.headers)
65
133
 
66
- Nokogiri.HTML(body).css(config.selector(:items))
67
- .map { |xml_item| new xml_item, config }
68
- .keep_if(&:valid?)
134
+ Nokogiri.HTML(body)
135
+ .css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
136
+ .map { |xml| new(xml, config) }
137
+ .select(&:valid?)
69
138
  end
70
139
 
71
140
  private
72
141
 
73
- def self.get_body_from_url(url, config)
74
- request = Faraday.new(url: url, headers: config.headers) do |faraday|
75
- faraday.use FaradayMiddleware::FollowRedirects
76
- faraday.adapter Faraday.default_adapter
77
- end
78
-
79
- body = request.get.body
80
-
81
- config.json? ? Html2rss::Utils.object_to_xml(JSON.parse(body)) : body
82
- end
83
- private_class_method :get_body_from_url
84
-
85
- attr_reader :xml, :config
142
+ # @return [Nokogiri::XML::Element] XML element representing the item.
143
+ attr_reader :xml
144
+ # @return [Html2rss::Config] Configuration object for the item.
145
+ attr_reader :config
86
146
 
147
+ ##
148
+ # Processes the extracted value according to post-processing options.
149
+ #
150
+ # @param value [String] extracted value.
151
+ # @param post_process_options [Hash<Symbol, Object>] post-processing options.
152
+ # @return [String] processed value.
87
153
  def post_process(value, post_process_options)
88
154
  return value unless post_process_options
89
155
 
90
156
  [post_process_options].flatten.each do |options|
91
157
  value = AttributePostProcessors.get_processor(options[:name])
92
- .new(value, options: options, item: self, config: @config)
158
+ .new(value, Context.new(options:, item: self, config:))
93
159
  .get
94
160
  end
95
161
 
96
162
  value
97
163
  end
164
+
165
+ ##
166
+ # Retrieves the URL for the enclosure, sanitizing and ensuring it's absolute.
167
+ #
168
+ # @return [Addressable::URI, nil] absolute URL of the enclosure.
169
+ def enclosure_url
170
+ enclosure = Html2rss::Utils.sanitize_url(extract(:enclosure))
171
+
172
+ Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url) if enclosure
173
+ end
98
174
  end
99
175
  end
@@ -1,9 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Html2rss
2
4
  module ItemExtractors
3
5
  ##
4
6
  # Returns the value of the attribute.
5
7
  #
6
- # Imagine this +time+ HTML element with a +datetime+ attribute:
8
+ # Imagine this +time+ HTML tag with a +datetime+ attribute:
7
9
  #
8
10
  # <time datetime="2019-07-01">...</time>
9
11
  #
@@ -18,19 +20,30 @@ module Html2rss
18
20
  # Would return:
19
21
  # '2019-07-01'
20
22
  #
21
- # In case you're extracting a date or a time, do not forget to parse it
22
- # during post processing with
23
- # {AttributePostProcessors::ParseTime}[rdoc-ref:Html2rss::AttributePostProcessors::ParseTime].
23
+ # In case you're extracting a date or a time, consider parsing it
24
+ # during post processing with {AttributePostProcessors::ParseTime}.
24
25
  class Attribute
26
+ # The available options for the attribute extractor.
27
+ Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true)
28
+
29
+ ##
30
+ # Initializes the Attribute extractor.
31
+ #
32
+ # @param xml [Nokogiri::XML::Element]
33
+ # @param options [Options]
25
34
  def initialize(xml, options)
26
35
  @options = options
27
- @element = ItemExtractors.element(xml, options)
36
+ @element = ItemExtractors.element(xml, options.selector)
28
37
  end
29
38
 
30
39
  ##
31
- # @return [String]
40
+ # Retrieves and returns the attribute's value as a string.
41
+ #
42
+ # @return [String] The value of the attribute.
32
43
  def get
33
- @element.attr(@options[:attribute]).to_s
44
+ @element.attr(@options.attribute).to_s.freeze
45
+ rescue NoMethodError => error
46
+ raise "Failed to extract attribute: #{error.message}"
34
47
  end
35
48
  end
36
49
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Html2rss
2
4
  module ItemExtractors
3
5
  ##
@@ -21,15 +23,29 @@ module Html2rss
21
23
  # Would return:
22
24
  # 'http://blog-without-a-feed.example.com/posts/latest-findings'
23
25
  class Href
26
+ # The available options for the href (attribute) extractor.
27
+ Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true)
28
+
29
+ ##
30
+ # Initializes the Href extractor.
31
+ #
32
+ # @param xml [Nokogiri::XML::Element]
33
+ # @param options [Options]
24
34
  def initialize(xml, options)
25
35
  @options = options
26
- element = ItemExtractors.element(xml, options)
27
- @href = Html2rss::Utils.sanitize_url(element.attr('href'))
36
+ @element = ItemExtractors.element(xml, options.selector)
37
+ @href = @element.attr('href').to_s
28
38
  end
29
39
 
30
- # @return [URI::HTTPS, URI::HTTP]
40
+ ##
41
+ # Retrieves and returns the normalized absolute URL.
42
+ #
43
+ # @return [Addressable::URI, nil] The absolute URL, or nil when no href is available.
31
44
  def get
32
- Html2rss::Utils.build_absolute_url_from_relative(@href, @options[:channel][:url])
45
+ return nil unless @href
46
+
47
+ sanitized_href = Html2rss::Utils.sanitize_url(@href)
48
+ Html2rss::Utils.build_absolute_url_from_relative(sanitized_href, @options.channel.url)
33
49
  end
34
50
  end
35
51
  end