html2rss 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/README.md +323 -270
  3. data/exe/html2rss +6 -0
  4. data/html2rss.gemspec +18 -23
  5. data/lib/html2rss/attribute_post_processors/gsub.rb +30 -8
  6. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +7 -2
  7. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +27 -0
  8. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +41 -0
  9. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +11 -2
  10. data/lib/html2rss/attribute_post_processors/parse_time.rb +11 -4
  11. data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -2
  12. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +40 -44
  13. data/lib/html2rss/attribute_post_processors/substring.rb +14 -4
  14. data/lib/html2rss/attribute_post_processors/template.rb +36 -12
  15. data/lib/html2rss/attribute_post_processors.rb +28 -5
  16. data/lib/html2rss/cli.rb +29 -0
  17. data/lib/html2rss/config/channel.rb +117 -0
  18. data/lib/html2rss/config/selectors.rb +91 -0
  19. data/lib/html2rss/config.rb +71 -82
  20. data/lib/html2rss/item.rb +122 -46
  21. data/lib/html2rss/item_extractors/attribute.rb +20 -7
  22. data/lib/html2rss/item_extractors/href.rb +20 -4
  23. data/lib/html2rss/item_extractors/html.rb +18 -6
  24. data/lib/html2rss/item_extractors/static.rb +18 -7
  25. data/lib/html2rss/item_extractors/text.rb +17 -5
  26. data/lib/html2rss/item_extractors.rb +75 -10
  27. data/lib/html2rss/object_to_xml_converter.rb +56 -0
  28. data/lib/html2rss/rss_builder/channel.rb +21 -0
  29. data/lib/html2rss/rss_builder/item.rb +83 -0
  30. data/lib/html2rss/rss_builder/stylesheet.rb +37 -0
  31. data/lib/html2rss/rss_builder.rb +96 -0
  32. data/lib/html2rss/utils.rb +94 -19
  33. data/lib/html2rss/version.rb +5 -1
  34. data/lib/html2rss.rb +57 -20
  35. metadata +53 -165
  36. data/.gitignore +0 -12
  37. data/.rspec +0 -4
  38. data/.rubocop.yml +0 -164
  39. data/.travis.yml +0 -25
  40. data/.yardopts +0 -6
  41. data/CHANGELOG.md +0 -221
  42. data/Gemfile +0 -8
  43. data/Gemfile.lock +0 -139
  44. data/bin/console +0 -15
  45. data/bin/setup +0 -8
  46. data/lib/html2rss/feed_builder.rb +0 -81
  47. data/lib/html2rss/item_extractors/current_time.rb +0 -21
  48. data/support/logo.png +0 -0
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'addressable'
4
+
5
+ module Html2rss
6
+ class Config
7
+ ##
8
+ # Holds the configuration for the feed's channel options.
9
+ # This contains:
10
+ #
11
+ # 1. the RSS channel attributes
12
+ # 2. html2rss options like json or custom HTTP-headers for the request
13
+ class Channel
14
+ ##
15
+ # @param config [Hash<Symbol, Object>]
16
+ # @return [Set<String>] the required parameter names
17
+ def self.required_params_for_config(config)
18
+ config.each_with_object(Set.new) do |(_, value), required_params|
19
+ required_params.merge(value.scan(/%<([\w_\d]+)>/).flatten) if value.is_a?(String)
20
+ end
21
+ end
22
+
23
+ ##
24
+ # @param channel [Hash<Symbol, Object>]
25
+ # @param params [Hash]
26
+ def initialize(channel, params: {})
27
+ raise ArgumentError, 'channel must be a hash' unless channel.is_a?(Hash)
28
+ raise ArgumentError, 'missing key :url' unless channel[:url].is_a?(String)
29
+
30
+ @config = process_params(channel, params.transform_keys(&:to_sym))
31
+ end
32
+
33
+ ##
34
+ # The HTTP headers to use for the request.
35
+ #
36
+ # @return [Hash<Symbol, String>]
37
+ def headers
38
+ config.fetch(:headers, {})
39
+ end
40
+
41
+ ##
42
+ # @return [String]
43
+ def author
44
+ config.fetch(:author, 'html2rss')
45
+ end
46
+
47
+ ##
48
+ # @return [Integer]
49
+ def ttl
50
+ config.fetch(:ttl, 360)
51
+ end
52
+
53
+ ##
54
+ # @return [String]
55
+ def title
56
+ config.fetch(:title) { Utils.titleized_url(url) }
57
+ end
58
+
59
+ ##
60
+ # @return [String] language code
61
+ def language
62
+ config.fetch(:language, 'en')
63
+ end
64
+
65
+ ##
66
+ # @return [String]
67
+ def description
68
+ config.fetch(:description) { "Latest items from #{url}." }
69
+ end
70
+
71
+ ##
72
+ # @return [Addressable::URI]
73
+ def url
74
+ Addressable::URI.parse(config[:url]).normalize
75
+ end
76
+
77
+ ##
78
+ # @return [String] time_zone name
79
+ def time_zone
80
+ config.fetch(:time_zone, 'UTC')
81
+ end
82
+
83
+ ##
84
+ # @return [true, false]
85
+ def json?
86
+ config.fetch(:json, false)
87
+ end
88
+
89
+ private
90
+
91
+ # @return [Hash<Symbol, Object>]
92
+ attr_reader :config
93
+
94
+ ##
95
+ # @param config [Hash<Symbol, Object>]
96
+ # @param params [Hash<Symbol, String>]
97
+ # @return [nil]
98
+ def assert_required_params_presence(config, params)
99
+ missing_params = self.class.required_params_for_config(config) - params.keys.map(&:to_s)
100
+ raise ParamsMissing, missing_params.to_a.join(', ') unless missing_params.empty?
101
+ end
102
+
103
+ ##
104
+ # Sets the variables used in the feed config's channel.
105
+ #
106
+ # @param config [Hash<Symbol, Object>]
107
+ # @param params [Hash<Symbol, Object>]
108
+ # @return [Hash<Symbol, Object>]
109
+ def process_params(config, params)
110
+ assert_required_params_presence(config, params)
111
+ config.transform_values do |value|
112
+ value.is_a?(String) ? format(value, params) : value
113
+ end
114
+ end
115
+ end
116
+ end
117
+ end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Config
5
+ ##
6
+ # Holds the configurations of the selectors.
7
+ class Selectors
8
+ ITEMS_SELECTOR_NAME = :items
9
+
10
+ # Struct to represent a selector with associated attributes for extraction and processing.
11
+ Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)
12
+
13
+ ##
14
+ # @param config [Hash<Symbol, Object>]
15
+ def initialize(config)
16
+ validate_config(config)
17
+ @config = config
18
+ end
19
+
20
+ ##
21
+ # @param name [Symbol]
22
+ # @return [true, false]
23
+ def selector?(name)
24
+ name != ITEMS_SELECTOR_NAME && item_selector_names.include?(name)
25
+ end
26
+
27
+ ##
28
+ # @param name [Symbol]
29
+ # @return [Selector]
30
+ def selector(name)
31
+ raise ArgumentError, "invalid item's selector name: #{name}" unless selector?(name)
32
+
33
+ Selector.new(config[name])
34
+ end
35
+
36
+ ##
37
+ # @return [Set<Symbol>]
38
+ def category_selector_names
39
+ selector_keys_for(:categories)
40
+ end
41
+
42
+ ##
43
+ # @return [Set<Symbol>]
44
+ def guid_selector_names
45
+ selector_keys_for(:guid, default: :title_or_description)
46
+ end
47
+
48
+ ##
49
+ # Returns the CSS/XPath selector.
50
+ #
51
+ # @param name [Symbol]
52
+ # @return [String]
53
+ def selector_string(name)
54
+ Selector.new(config[name]).selector
55
+ end
56
+
57
+ ##
58
+ # @return [Set<Symbol>]
59
+ def item_selector_names
60
+ @item_selector_names ||= config.keys.reject { |key| key == ITEMS_SELECTOR_NAME }.to_set
61
+ end
62
+
63
+ ##
64
+ # @return [Symbol, nil]
65
+ def items_order
66
+ config.dig(ITEMS_SELECTOR_NAME, :order)&.to_sym
67
+ end
68
+
69
+ private
70
+
71
+ attr_reader :config
72
+
73
+ def validate_config(config)
74
+ raise ArgumentError, 'selector for items is required' unless config[ITEMS_SELECTOR_NAME].is_a?(Hash)
75
+ end
76
+
77
+ ##
78
+ # Returns the selector keys for the selector named `name`. If none, returns [default].
79
+ #
80
+ # @param name [Symbol]
81
+ # @param default [String, Symbol]
82
+ # @return [Set<Symbol>]
83
+ def selector_keys_for(name, default: nil)
84
+ config.fetch(name) { Array(default) }.tap do |array|
85
+ array.reject! { |entry| entry.to_s == '' }
86
+ array.map!(&:to_sym)
87
+ end.to_set
88
+ end
89
+ end
90
+ end
91
+ end
@@ -1,95 +1,84 @@
1
- require 'active_support/core_ext/hash'
1
+ # frozen_string_literal: true
2
+
3
+ require 'forwardable'
2
4
 
3
5
  module Html2rss
4
6
  ##
5
7
  # The Config class abstracts from the config data structure and
6
8
  # provides default values.
7
9
  class Config
8
- def initialize(feed_config, global_config = {})
9
- @global_config = global_config.deep_symbolize_keys
10
- @feed_config = feed_config.deep_symbolize_keys
11
- @channel_config = @feed_config.fetch(:channel, {})
12
- end
13
-
14
- def author
15
- channel_config.fetch :author, 'html2rss'
16
- end
17
-
18
- def ttl
19
- channel_config.fetch :ttl, 360
20
- end
21
-
22
- def title
23
- channel_config.fetch(:title) { generated_title }
24
- end
25
-
26
- def generated_title
27
- uri = URI(url)
28
-
29
- nicer_path = uri.path.split('/')
30
- nicer_path.reject! { |part| part == '' }
31
-
32
- nicer_path.any? ? "#{uri.host}: #{nicer_path.join(' ').titleize}" : uri.host
33
- end
34
-
35
- def language
36
- channel_config.fetch :language, 'en'
37
- end
38
-
39
- def description
40
- channel_config.fetch :description, "Latest items from #{url}."
41
- end
42
-
43
- def url
44
- channel_config.dig :url
45
- end
46
- alias link url
47
-
48
- def time_zone
49
- channel_config.fetch :time_zone, 'UTC'
50
- end
51
-
52
- def json?
53
- channel_config.fetch :json, false
54
- end
55
-
10
+ extend Forwardable
11
+
12
+ ##
13
+ # The Error class to be thrown when a feed config requires params, but none
14
+ # were passed to Config.
15
+ class ParamsMissing < Html2rss::Error; end
16
+
17
+ ##
18
+ # Thrown when the feed config does not contain a value at `:channel`.
19
+ class ChannelMissing < Html2rss::Error; end
20
+
21
+ # Struct to store XML Stylesheet attributes
22
+ Stylesheet = Struct.new(:href, :type, :media, keyword_init: true)
23
+
24
+ def_delegator :@channel, :author
25
+ def_delegator :@channel, :ttl
26
+ def_delegator :@channel, :title
27
+ def_delegator :@channel, :language
28
+ def_delegator :@channel, :description
29
+ def_delegator :@channel, :url
30
+ def_delegator :@channel, :url, :link
31
+ def_delegator :@channel, :time_zone
32
+ def_delegator :@channel, :json?
33
+
34
+ def_delegator :@selectors, :item_selector_names
35
+ def_delegator :@selectors, :selector?
36
+ def_delegator :@selectors, :category_selector_names
37
+ def_delegator :@selectors, :guid_selector_names
38
+ def_delegator :@selectors, :items_order
39
+ def_delegator :@selectors, :selector_string
40
+
41
+ ##
42
+ # Initializes the Config object with feed configuration, global settings, and parameters.
43
+ #
44
+ # @param feed_config [Hash<Symbol, Object>] The configuration hash containing `:channel` and `:selectors`.
45
+ # @param global [Hash<Symbol, Object>] Global settings hash.
46
+ # @param params [Hash<Symbol, String>] Parameters hash.
47
+ def initialize(feed_config, global = {}, params = {})
48
+ channel_config = feed_config[:channel]
49
+ raise ChannelMissing, 'Channel configuration is missing in feed_config' unless channel_config
50
+
51
+ @channel = Channel.new(channel_config, params:)
52
+ @selectors = Selectors.new(feed_config[:selectors])
53
+ @global = global
54
+ end
55
+
56
+ ##
57
+ # Retrieves selector attributes merged with channel attributes.
58
+ #
59
+ # @param name [Symbol] Selector name.
60
+ # @return [Hash<Symbol, Object>] Merged attributes hash.
61
+ def selector_attributes_with_channel(name)
62
+ @selectors.selector(name).to_h.merge(channel: @channel)
63
+ end
64
+
65
+ ##
66
+ # Retrieves headers merged from global settings and channel headers.
67
+ #
68
+ # @return [Hash] Merged headers hash.
56
69
  def headers
57
- global_config.fetch(:headers, {}).merge(channel_config.fetch(:headers, {}))
58
- end
59
-
60
- def attribute_options(name)
61
- feed_config.dig(:selectors).fetch(name, {}).merge(channel: channel_config)
62
- end
63
-
64
- def attribute?(name)
65
- attribute_names.include?(name)
70
+ @global.fetch(:headers, {}).merge(@channel.headers)
66
71
  end
67
72
 
68
- def category_selectors
69
- categories = feed_config.dig(:selectors, :categories)
70
- return [] unless categories
71
-
72
- categories = categories.keep_if { |category| category.to_s != '' }
73
- categories.map!(&:to_sym)
74
- categories.uniq!
75
-
76
- categories
73
+ ##
74
+ # Retrieves stylesheets from global settings.
75
+ #
76
+ # @return [Array<Stylesheet>] Array of Stylesheet structs.
77
+ def stylesheets
78
+ @global.fetch(:stylesheets, []).map { |attributes| Stylesheet.new(attributes) }
77
79
  end
78
80
 
79
- def selector(name)
80
- feed_config.dig(:selectors, name, :selector)
81
- end
82
-
83
- def attribute_names
84
- @attribute_names ||= feed_config.fetch(:selectors, {}).keys.tap { |attrs| attrs.delete(:items) }
85
- end
86
-
87
- def items_order
88
- feed_config.dig(:selectors, :items, :order)&.to_sym
89
- end
90
-
91
- private
92
-
93
- attr_reader :feed_config, :channel_config, :global_config
81
+ # Provides read-only access to the channel object.
82
+ attr_reader :channel
94
83
  end
95
84
  end
data/lib/html2rss/item.rb CHANGED
@@ -1,12 +1,39 @@
1
- require 'faraday'
2
- require 'faraday_middleware'
1
+ # frozen_string_literal: true
2
+
3
3
  require 'nokogiri'
4
4
 
5
5
  module Html2rss
6
6
  ##
7
- # Takes the selected Nokogiri::HTML and responds to accessors names
7
+ # Takes the selected Nokogiri::HTML and responds to accessor names
8
8
  # defined in the feed config.
9
+ #
10
+ # Instances can only be created via `.from_url` and
11
+ # each represents an internally used "RSS item".
12
+ # Such an item provides dynamically defined attributes as methods.
9
13
  class Item
14
+ # A context instance is passed to Item Extractors.
15
+ Context = Struct.new('Context', :options, :item, :config, keyword_init: true)
16
+ # Class to keep an Item's <enclosure>.
17
+ Enclosure = Struct.new('Enclosure', :type, :bits_length, :url, keyword_init: true)
18
+
19
+ ##
20
+ # Fetches items from a given URL using configuration settings.
21
+ #
22
+ # @param url [String] URL to fetch items from.
23
+ # @param config [Html2rss::Config] Configuration object.
24
+ # @return [Array<Html2rss::Item>] list of items fetched.
25
+ def self.from_url(url, config)
26
+ body = Utils.request_body_from_url(url, convert_json_to_xml: config.json?, headers: config.headers)
27
+
28
+ Nokogiri.HTML(body)
29
+ .css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
30
+ .map { |xml| new(xml, config) }
31
+ .select(&:valid?)
32
+ end
33
+
34
+ ##
35
+ # @param xml [Nokogiri::XML::Element]
36
+ # @param config [Html2rss::Config]
10
37
  def initialize(xml, config)
11
38
  @xml = xml
12
39
  @config = config
@@ -14,86 +41,135 @@ module Html2rss
14
41
 
15
42
  private_class_method :new
16
43
 
44
+ ##
45
+ # Checks if the object responds to a method dynamically based on the configuration.
46
+ #
47
+ # @param method_name [Symbol]
48
+ # @param _include_private [true, false]
49
+ # @return [true, false]
17
50
  def respond_to_missing?(method_name, _include_private = false)
18
- config.attribute?(method_name) || super
51
+ config.selector?(method_name) || super
19
52
  end
20
53
 
54
+ ##
55
+ # Dynamically extracts data based on the method name.
56
+ #
57
+ # @param method_name [Symbol]
58
+ # @param _args [Array]
59
+ # @return [String] extracted value for the selector.
21
60
  def method_missing(method_name, *_args)
22
61
  return super unless respond_to_missing?(method_name)
23
62
 
24
- attribute_options = config.attribute_options(method_name)
25
-
26
- extractor = ItemExtractors.get_extractor(attribute_options[:extractor])
27
- value = extractor.new(xml, attribute_options).get
28
-
29
- post_process(value, attribute_options.fetch(:post_process, false))
63
+ extract(method_name)
30
64
  end
31
65
 
32
- def available_attributes
33
- @available_attributes ||= (%i[title link description author comments updated] &
34
- @config.attribute_names) - %i[categories enclosure]
66
+ ##
67
+ # Selects and processes data according to the selector name.
68
+ #
69
+ # @param tag [Symbol]
70
+ # @return [String] the extracted value for the selector.
71
+ def extract(tag)
72
+ attribute_options = config.selector_attributes_with_channel(tag.to_sym)
73
+
74
+ post_process(
75
+ ItemExtractors.item_extractor_factory(attribute_options, xml).get,
76
+ attribute_options.fetch(:post_process, false)
77
+ )
35
78
  end
36
79
 
37
80
  ##
38
- # At least a title or a description is required to be a valid RSS 2.0 item.
81
+ # Checks if the item is valid according to RSS 2.0 spec,
82
+ # by ensuring it has at least a title or a description.
83
+ #
84
+ # @return [true, false]
39
85
  def valid?
40
- title = self.title if config.attribute?(:title)
41
- description = self.description if config.attribute?(:description)
42
- [title, description].join != ''
86
+ title_or_description.to_s != ''
43
87
  end
44
88
 
45
89
  ##
46
- # @return [Array]
47
- def categories
48
- config.category_selectors.map(&method(:method_missing))
49
- end
90
+ # Returns either the title or the description, preferring title if available.
91
+ #
92
+ # @return [String, nil]
93
+ def title_or_description
94
+ return title if config.selector?(:title)
50
95
 
51
- def enclosure?
52
- config.attribute?(:enclosure)
96
+ description if config.selector?(:description)
53
97
  end
54
98
 
55
- def enclosure_url
56
- enclosure = Html2rss::Utils.sanitize_url(method_missing(:enclosure))
99
+ ##
100
+ #
101
+ # @return [String] SHA1 hashed GUID.
102
+ def guid
103
+ content = config.guid_selector_names.flat_map { |method_name| public_send(method_name) }.join
57
104
 
58
- Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url).to_s if enclosure
105
+ Digest::SHA1.hexdigest(content)
59
106
  end
60
107
 
61
108
  ##
62
- # @return [Array]
63
- def self.from_url(url, config)
64
- body = get_body_from_url(url, config)
65
-
66
- Nokogiri.HTML(body).css(config.selector(:items))
67
- .map { |xml_item| new xml_item, config }
68
- .keep_if(&:valid?)
109
+ # Retrieves categories for the item based on configured category selectors.
110
+ #
111
+ # @return [Array<String>] list of categories.
112
+ def categories
113
+ config.category_selector_names.map { |method_name| public_send(method_name) }
69
114
  end
70
115
 
71
- private
72
-
73
- def self.get_body_from_url(url, config)
74
- request = Faraday.new(url: url, headers: config.headers) do |faraday|
75
- faraday.use FaradayMiddleware::FollowRedirects
76
- faraday.adapter Faraday.default_adapter
77
- end
78
-
79
- body = request.get.body
116
+ ##
117
+ # Checks if the item has an enclosure based on configuration.
118
+ #
119
+ # @return [true, false]
120
+ def enclosure?
121
+ config.selector?(:enclosure)
122
+ end
80
123
 
81
- config.json? ? Html2rss::Utils.object_to_xml(JSON.parse(body)) : body
124
+ ##
125
+ # Retrieves enclosure details for the item.
126
+ #
127
+ # @return [Enclosure] enclosure details.
128
+ def enclosure
129
+ url = enclosure_url
130
+
131
+ raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
132
+
133
+ Enclosure.new(
134
+ type: Html2rss::Utils.guess_content_type_from_url(url),
135
+ bits_length: 0,
136
+ url: url.to_s
137
+ )
82
138
  end
83
- private_class_method :get_body_from_url
84
139
 
85
- attr_reader :xml, :config
140
+ private
141
+
142
+ # @return [Nokogiri::XML::Element] XML element representing the item.
143
+ attr_reader :xml
144
+ # @return [Html2rss::Config] Configuration object for the item.
145
+ attr_reader :config
86
146
 
147
+ ##
148
+ # Processes the extracted value according to post-processing options.
149
+ #
150
+ # @param value [String] extracted value.
151
+ # @param post_process_options [Hash<Symbol, Object>] post-processing options.
152
+ # @return [String] processed value.
87
153
  def post_process(value, post_process_options)
88
154
  return value unless post_process_options
89
155
 
90
156
  [post_process_options].flatten.each do |options|
91
157
  value = AttributePostProcessors.get_processor(options[:name])
92
- .new(value, options: options, item: self, config: @config)
158
+ .new(value, Context.new(options:, item: self, config:))
93
159
  .get
94
160
  end
95
161
 
96
162
  value
97
163
  end
164
+
165
+ ##
166
+ # Retrieves the URL for the enclosure, sanitizing and ensuring it's absolute.
167
+ #
168
+ # @return [Addressable::URI, nil] absolute URL of the enclosure.
169
+ def enclosure_url
170
+ enclosure = Html2rss::Utils.sanitize_url(extract(:enclosure))
171
+
172
+ Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url) if enclosure
173
+ end
98
174
  end
99
175
  end
@@ -1,9 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Html2rss
2
4
  module ItemExtractors
3
5
  ##
4
6
  # Returns the value of the attribute.
5
7
  #
6
- # Imagine this +time+ HTML element with a +datetime+ attribute:
8
+ # Imagine this +time+ HTML tag with a +datetime+ attribute:
7
9
  #
8
10
  # <time datetime="2019-07-01">...</time>
9
11
  #
@@ -18,19 +20,30 @@ module Html2rss
18
20
  # Would return:
19
21
  # '2019-07-01'
20
22
  #
21
- # In case you're extracting a date or a time, do not forget to parse it
22
- # during post processing with
23
- # {AttributePostProcessors::ParseTime}[rdoc-ref:Html2rss::AttributePostProcessors::ParseTime].
23
+ # In case you're extracting a date or a time, consider parsing it
24
+ # during post processing with {AttributePostProcessors::ParseTime}.
24
25
  class Attribute
26
+ # The available options for the attribute extractor.
27
+ Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true)
28
+
29
+ ##
30
+ # Initializes the Attribute extractor.
31
+ #
32
+ # @param xml [Nokogiri::XML::Element]
33
+ # @param options [Options]
25
34
  def initialize(xml, options)
26
35
  @options = options
27
- @element = ItemExtractors.element(xml, options)
36
+ @element = ItemExtractors.element(xml, options.selector)
28
37
  end
29
38
 
30
39
  ##
31
- # @return [String]
40
+ # Retrieves and returns the attribute's value as a string.
41
+ #
42
+ # @return [String] The value of the attribute.
32
43
  def get
33
- @element.attr(@options[:attribute]).to_s
44
+ @element.attr(@options.attribute).to_s.freeze
45
+ rescue NoMethodError => error
46
+ raise "Failed to extract attribute: #{error.message}"
34
47
  end
35
48
  end
36
49
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Html2rss
2
4
  module ItemExtractors
3
5
  ##
@@ -21,15 +23,29 @@ module Html2rss
21
23
  # Would return:
22
24
  # 'http://blog-without-a-feed.example.com/posts/latest-findings'
23
25
  class Href
26
+ # The available options for the href (attribute) extractor.
27
+ Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true)
28
+
29
+ ##
30
+ # Initializes the Href extractor.
31
+ #
32
+ # @param xml [Nokogiri::XML::Element]
33
+ # @param options [Options]
24
34
  def initialize(xml, options)
25
35
  @options = options
26
- element = ItemExtractors.element(xml, options)
27
- @href = Html2rss::Utils.sanitize_url(element.attr('href'))
36
+ @element = ItemExtractors.element(xml, options.selector)
37
+ @href = @element.attr('href').to_s
28
38
  end
29
39
 
30
- # @return [URI::HTTPS, URI::HTTP]
40
+ ##
41
+ # Retrieves and returns the normalized absolute URL.
42
+ #
43
+ # @return [Addressable::URI] The absolute URL.
31
44
  def get
32
- Html2rss::Utils.build_absolute_url_from_relative(@href, @options[:channel][:url])
45
+ return nil unless @href
46
+
47
+ sanitized_href = Html2rss::Utils.sanitize_url(@href)
48
+ Html2rss::Utils.build_absolute_url_from_relative(sanitized_href, @options.channel.url)
33
49
  end
34
50
  end
35
51
  end