html2rss 0.8.2 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -1
  3. data/.mergify.yml +15 -0
  4. data/.rubocop.yml +13 -42
  5. data/Gemfile +19 -2
  6. data/Gemfile.lock +116 -94
  7. data/README.md +326 -253
  8. data/bin/console +1 -0
  9. data/exe/html2rss +6 -0
  10. data/html2rss.gemspec +16 -21
  11. data/lib/html2rss/attribute_post_processors/gsub.rb +30 -8
  12. data/lib/html2rss/attribute_post_processors/html_to_markdown.rb +7 -2
  13. data/lib/html2rss/attribute_post_processors/html_transformers/transform_urls_to_absolute_ones.rb +27 -0
  14. data/lib/html2rss/attribute_post_processors/html_transformers/wrap_img_in_a.rb +41 -0
  15. data/lib/html2rss/attribute_post_processors/markdown_to_html.rb +11 -2
  16. data/lib/html2rss/attribute_post_processors/parse_time.rb +11 -4
  17. data/lib/html2rss/attribute_post_processors/parse_uri.rb +12 -2
  18. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +46 -51
  19. data/lib/html2rss/attribute_post_processors/substring.rb +14 -4
  20. data/lib/html2rss/attribute_post_processors/template.rb +36 -12
  21. data/lib/html2rss/attribute_post_processors.rb +28 -5
  22. data/lib/html2rss/cli.rb +29 -0
  23. data/lib/html2rss/config/channel.rb +117 -0
  24. data/lib/html2rss/config/selectors.rb +91 -0
  25. data/lib/html2rss/config.rb +71 -78
  26. data/lib/html2rss/item.rb +118 -40
  27. data/lib/html2rss/item_extractors/attribute.rb +20 -7
  28. data/lib/html2rss/item_extractors/href.rb +20 -4
  29. data/lib/html2rss/item_extractors/html.rb +18 -6
  30. data/lib/html2rss/item_extractors/static.rb +18 -7
  31. data/lib/html2rss/item_extractors/text.rb +17 -5
  32. data/lib/html2rss/item_extractors.rb +75 -9
  33. data/lib/html2rss/object_to_xml_converter.rb +56 -0
  34. data/lib/html2rss/rss_builder/channel.rb +21 -0
  35. data/lib/html2rss/rss_builder/item.rb +83 -0
  36. data/lib/html2rss/rss_builder/stylesheet.rb +37 -0
  37. data/lib/html2rss/rss_builder.rb +96 -0
  38. data/lib/html2rss/utils.rb +94 -19
  39. data/lib/html2rss/version.rb +6 -1
  40. data/lib/html2rss.rb +51 -20
  41. data/rakefile.rb +16 -0
  42. metadata +54 -150
  43. data/.travis.yml +0 -25
  44. data/CHANGELOG.md +0 -210
  45. data/lib/html2rss/feed_builder.rb +0 -75
  46. data/lib/html2rss/item_extractors/current_time.rb +0 -21
  47. data/support/logo.png +0 -0
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'addressable'
4
+
5
+ module Html2rss
6
+ class Config
7
+ ##
8
+ # Holds the configuration for the feed's channel options.
9
+ # This contains:
10
+ #
11
+ # 1. the RSS channel attributes
12
+ # 2. html2rss options like json or custom HTTP-headers for the request
13
+ class Channel
14
+ ##
15
+ # @param channel [Hash<Symbol, Object>]
16
+ # @param params [Hash]
17
+ def initialize(channel, params: {})
18
+ raise ArgumentError, 'channel must be a hash' unless channel.is_a?(Hash)
19
+ raise ArgumentError, 'missing key :url' unless channel[:url].is_a?(String)
20
+
21
+ @config = process_params(channel, params.transform_keys(&:to_sym))
22
+ end
23
+
24
+ ##
25
+ # The HTTP headers to use for the request.
26
+ #
27
+ # @return [Hash<Symbol, String>]
28
+ def headers
29
+ config.fetch(:headers, {})
30
+ end
31
+
32
+ ##
33
+ # @return [String]
34
+ def author
35
+ config.fetch(:author, 'html2rss')
36
+ end
37
+
38
+ ##
39
+ # @return [Integer]
40
+ def ttl
41
+ config.fetch(:ttl, 360)
42
+ end
43
+
44
+ ##
45
+ # @return [String]
46
+ def title
47
+ config.fetch(:title) { Utils.titleized_url(url) }
48
+ end
49
+
50
+ ##
51
+ # @return [String] language code
52
+ def language
53
+ config.fetch(:language, 'en')
54
+ end
55
+
56
+ ##
57
+ # @return [String]
58
+ def description
59
+ config.fetch(:description) { "Latest items from #{url}." }
60
+ end
61
+
62
+ ##
63
+ # @return [Addressable::URI]
64
+ def url
65
+ Addressable::URI.parse(config[:url]).normalize
66
+ end
67
+
68
+ ##
69
+ # @return [String] time_zone name
70
+ def time_zone
71
+ config.fetch(:time_zone, 'UTC')
72
+ end
73
+
74
+ ##
75
+ # @return [true, false]
76
+ def json?
77
+ config.fetch(:json, false)
78
+ end
79
+
80
+ ##
81
+ # @param config [Hash<Symbol, Object>]
82
+ # @return [Set<String>] the required parameter names
83
+ def self.required_params_for_config(config)
84
+ config.each_with_object(Set.new) do |(_, value), required_params|
85
+ required_params.merge(value.scan(/%<([\w_\d]+)>/).flatten) if value.is_a?(String)
86
+ end
87
+ end
88
+
89
+ private
90
+
91
+ # @return [Hash<Symbol, Object>]
92
+ attr_reader :config
93
+
94
+ ##
95
+ # @param config [Hash<Symbol, Object>]
96
+ # @param params [Hash<Symbol, String>]
97
+ # @return [nil]
98
+ def assert_required_params_presence(config, params)
99
+ missing_params = self.class.required_params_for_config(config) - params.keys.map(&:to_s)
100
+ raise ParamsMissing, missing_params.to_a.join(', ') unless missing_params.empty?
101
+ end
102
+
103
+ ##
104
+ # Sets the variables used in the feed config's channel.
105
+ #
106
+ # @param config [Hash<Symbol, Object>]
107
+ # @param params [Hash<Symbol, Object>]
108
+ # @return [Hash<Symbol, Object>]
109
+ def process_params(config, params)
110
+ assert_required_params_presence(config, params)
111
+ config.transform_values do |value|
112
+ value.is_a?(String) ? format(value, params) : value
113
+ end
114
+ end
115
+ end
116
+ end
117
+ end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Html2rss
4
+ class Config
5
+ ##
6
+ # Holds the configurations of the selectors.
7
+ class Selectors
8
+ ITEMS_SELECTOR_NAME = :items
9
+
10
+ # Struct to represent a selector with associated attributes for extraction and processing.
11
+ Selector = Struct.new(:selector, :attribute, :extractor, :post_process, :order, :static, keyword_init: true)
12
+
13
+ ##
14
+ # @param config [Hash<Symbol, Object>]
15
+ def initialize(config)
16
+ validate_config(config)
17
+ @config = config
18
+ end
19
+
20
+ ##
21
+ # @param name [Symbol]
22
+ # @return [true, false]
23
+ def selector?(name)
24
+ name != ITEMS_SELECTOR_NAME && item_selector_names.include?(name)
25
+ end
26
+
27
+ ##
28
+ # @param name [Symbol]
29
+ # @return [Selector]
30
+ def selector(name)
31
+ raise ArgumentError, "invalid item's selector name: #{name}" unless selector?(name)
32
+
33
+ Selector.new(config[name])
34
+ end
35
+
36
+ ##
37
+ # @return [Set<Symbol>]
38
+ def category_selector_names
39
+ selector_keys_for(:categories)
40
+ end
41
+
42
+ ##
43
+ # @return [Set<Symbol>]
44
+ def guid_selector_names
45
+ selector_keys_for(:guid, default: :title_or_description)
46
+ end
47
+
48
+ ##
49
+ # Returns the CSS/XPath selector.
50
+ #
51
+ # @param name [Symbol]
52
+ # @return [String]
53
+ def selector_string(name)
54
+ Selector.new(config[name]).selector
55
+ end
56
+
57
+ ##
58
+ # @return [Set<Symbol>]
59
+ def item_selector_names
60
+ @item_selector_names ||= config.keys.reject { |key| key == ITEMS_SELECTOR_NAME }.to_set
61
+ end
62
+
63
+ ##
64
+ # @return [Symbol, nil]
65
+ def items_order
66
+ config.dig(ITEMS_SELECTOR_NAME, :order)&.to_sym
67
+ end
68
+
69
+ private
70
+
71
+ attr_reader :config
72
+
73
+ def validate_config(config)
74
+ raise ArgumentError, 'selector for items is required' unless config[ITEMS_SELECTOR_NAME].is_a?(Hash)
75
+ end
76
+
77
+ ##
78
+ # Returns the selector keys for the selector named `name`. If none, returns [default].
79
+ #
80
+ # @param name [Symbol]
81
+ # @param default [String, Symbol]
82
+ # @return [Set<Symbol>]
83
+ def selector_keys_for(name, default: nil)
84
+ config.fetch(name) { Array(default) }.tap do |array|
85
+ array.reject! { |entry| entry.to_s == '' }
86
+ array.map!(&:to_sym)
87
+ end.to_set
88
+ end
89
+ end
90
+ end
91
+ end
@@ -1,91 +1,84 @@
1
- require 'active_support/core_ext/hash'
1
+ # frozen_string_literal: true
2
+
3
+ require 'forwardable'
2
4
 
3
5
  module Html2rss
4
6
  ##
5
7
  # The Config class abstracts from the config data structure and
6
8
  # provides default values.
7
9
  class Config
8
- def initialize(feed_config, global_config = {})
9
- @global_config = global_config.deep_symbolize_keys
10
- @feed_config = feed_config.deep_symbolize_keys
11
- @channel_config = @feed_config.fetch(:channel, {})
12
- end
13
-
14
- def author
15
- channel_config.fetch :author, 'html2rss'
16
- end
17
-
18
- def ttl
19
- channel_config.fetch :ttl, 360
20
- end
21
-
22
- def title
23
- channel_config.fetch(:title) { generated_title }
24
- end
25
-
26
- def generated_title
27
- uri = URI(url)
28
-
29
- nicer_path = uri.path.split('/')
30
- nicer_path.reject! { |part| part == '' }
31
-
32
- nicer_path.any? ? "#{uri.host}: #{nicer_path.join(' ').titleize}" : uri.host
33
- end
34
-
35
- def language
36
- channel_config.fetch :language, 'en'
37
- end
38
-
39
- def description
40
- channel_config.fetch :description, "Latest items from #{url}."
41
- end
42
-
43
- def url
44
- channel_config.dig :url
45
- end
46
- alias link url
47
-
48
- def time_zone
49
- channel_config.fetch :time_zone, 'UTC'
50
- end
51
-
52
- def json?
53
- channel_config.fetch :json, false
54
- end
55
-
10
+ extend Forwardable
11
+
12
+ ##
13
+ # The Error class to be thrown when a feed config requires params, but none
14
+ # were passed to Config.
15
+ class ParamsMissing < StandardError; end
16
+
17
+ ##
18
+ # Thrown when the feed config does not contain a value at `:channel`.
19
+ class ChannelMissing < StandardError; end
20
+
21
+ # Struct to store XML Stylesheet attributes
22
+ Stylesheet = Struct.new(:href, :type, :media, keyword_init: true)
23
+
24
+ def_delegator :@channel, :author
25
+ def_delegator :@channel, :ttl
26
+ def_delegator :@channel, :title
27
+ def_delegator :@channel, :language
28
+ def_delegator :@channel, :description
29
+ def_delegator :@channel, :url
30
+ def_delegator :@channel, :url, :link
31
+ def_delegator :@channel, :time_zone
32
+ def_delegator :@channel, :json?
33
+
34
+ def_delegator :@selectors, :item_selector_names
35
+ def_delegator :@selectors, :selector?
36
+ def_delegator :@selectors, :category_selector_names
37
+ def_delegator :@selectors, :guid_selector_names
38
+ def_delegator :@selectors, :items_order
39
+ def_delegator :@selectors, :selector_string
40
+
41
+ ##
42
+ # Initializes the Config object with feed configuration, global settings, and parameters.
43
+ #
44
+ # @param feed_config [Hash<Symbol, Object>] The configuration hash containing `:channel` and `:selectors`.
45
+ # @param global [Hash<Symbol, Object>] Global settings hash.
46
+ # @param params [Hash<Symbol, String>] Parameters hash.
47
+ def initialize(feed_config, global = {}, params = {})
48
+ channel_config = feed_config[:channel]
49
+ raise ChannelMissing, 'Channel configuration is missing in feed_config' unless channel_config
50
+
51
+ @channel = Channel.new(channel_config, params:)
52
+ @selectors = Selectors.new(feed_config[:selectors])
53
+ @global = global
54
+ end
55
+
56
+ ##
57
+ # Retrieves selector attributes merged with channel attributes.
58
+ #
59
+ # @param name [Symbol] Selector name.
60
+ # @return [Hash<Symbol, Object>] Merged attributes hash.
61
+ def selector_attributes_with_channel(name)
62
+ @selectors.selector(name).to_h.merge(channel: @channel)
63
+ end
64
+
65
+ ##
66
+ # Retrieves headers merged from global settings and channel headers.
67
+ #
68
+ # @return [Hash] Merged headers hash.
56
69
  def headers
57
- global_config.fetch(:headers, {}).merge(channel_config.fetch(:headers, {}))
58
- end
59
-
60
- def attribute_options(name)
61
- feed_config.dig(:selectors).fetch(name, {}).merge(channel: channel_config)
70
+ @global.fetch(:headers, {}).merge(@channel.headers)
62
71
  end
63
72
 
64
- def attribute?(name)
65
- attribute_names.include?(name)
73
+ ##
74
+ # Retrieves stylesheets from global settings.
75
+ #
76
+ # @return [Array<Stylesheet>] Array of Stylesheet structs.
77
+ def stylesheets
78
+ @global.fetch(:stylesheets, []).map { |attributes| Stylesheet.new(attributes) }
66
79
  end
67
80
 
68
- def category_selectors
69
- categories = feed_config.dig(:selectors, :categories)
70
- return [] unless categories
71
-
72
- categories = categories.keep_if { |category| category.to_s != '' }
73
- categories.map!(&:to_sym)
74
- categories.uniq!
75
-
76
- categories
77
- end
78
-
79
- def selector(name)
80
- feed_config.dig(:selectors, name, :selector)
81
- end
82
-
83
- def attribute_names
84
- @attribute_names ||= feed_config.fetch(:selectors, {}).keys.tap { |attrs| attrs.delete(:items) }
85
- end
86
-
87
- private
88
-
89
- attr_reader :feed_config, :channel_config, :global_config
81
+ # Provides read-only access to the channel object.
82
+ attr_reader :channel
90
83
  end
91
84
  end
data/lib/html2rss/item.rb CHANGED
@@ -1,12 +1,24 @@
1
- require 'faraday'
2
- require 'faraday_middleware'
1
+ # frozen_string_literal: true
2
+
3
3
  require 'nokogiri'
4
4
 
5
5
  module Html2rss
6
6
  ##
7
- # Takes the selected Nokogiri::HTML and responds to accessors names
7
+ # Takes the selected Nokogiri::HTML and responds to accessor names
8
8
  # defined in the feed config.
9
+ #
10
+ # Instances can only be created via `.from_url` and
11
+ # each represents an internally used "RSS item".
12
+ # Such an item provides dynamically defined attributes as methods.
9
13
  class Item
14
+ # A context instance is passed to Item Extractors.
15
+ Context = Struct.new('Context', :options, :item, :config, keyword_init: true)
16
+ # Class to keep an Item's <enclosure>.
17
+ Enclosure = Struct.new('Enclosure', :type, :bits_length, :url, keyword_init: true)
18
+
19
+ ##
20
+ # @param xml [Nokogiri::XML::Element]
21
+ # @param config [Html2rss::Config]
10
22
  def initialize(xml, config)
11
23
  @xml = xml
12
24
  @config = config
@@ -14,84 +26,150 @@ module Html2rss
14
26
 
15
27
  private_class_method :new
16
28
 
29
+ ##
30
+ # Checks if the object responds to a method dynamically based on the configuration.
31
+ #
32
+ # @param method_name [Symbol]
33
+ # @param _include_private [true, false]
34
+ # @return [true, false]
17
35
  def respond_to_missing?(method_name, _include_private = false)
18
- config.attribute?(method_name) || super
36
+ config.selector?(method_name) || super
19
37
  end
20
38
 
39
+ ##
40
+ # Dynamically extracts data based on the method name.
41
+ #
42
+ # @param method_name [Symbol]
43
+ # @param _args [Array]
44
+ # @return [String] extracted value for the selector.
21
45
  def method_missing(method_name, *_args)
22
46
  return super unless respond_to_missing?(method_name)
23
47
 
24
- attribute_options = config.attribute_options(method_name)
48
+ extract(method_name)
49
+ end
25
50
 
26
- extractor = ItemExtractors.get_extractor(attribute_options[:extractor])
27
- value = extractor.new(xml, attribute_options).get
51
+ ##
52
+ # Selects and processes data according to the selector name.
53
+ #
54
+ # @param tag [Symbol]
55
+ # @return [String] the extracted value for the selector.
56
+ def extract(tag)
57
+ attribute_options = config.selector_attributes_with_channel(tag.to_sym)
58
+
59
+ post_process(
60
+ ItemExtractors.item_extractor_factory(attribute_options, xml).get,
61
+ attribute_options.fetch(:post_process, false)
62
+ )
63
+ end
28
64
 
29
- post_process(value, attribute_options.fetch(:post_process, false))
65
+ ##
66
+ # Checks if the item is valid according to the RSS 2.0 spec,
67
+ # by ensuring it has at least a title or a description.
68
+ #
69
+ # @return [true, false]
70
+ def valid?
71
+ title_or_description.to_s != ''
30
72
  end
31
73
 
32
- def available_attributes
33
- @available_attributes ||= (%i[title link description author comments updated] &
34
- @config.attribute_names) - %i[categories enclosure]
74
+ ##
75
+ # Returns either the title or the description, preferring title if available.
76
+ #
77
+ # @return [String, nil]
78
+ def title_or_description
79
+ return title if config.selector?(:title)
80
+
81
+ description if config.selector?(:description)
35
82
  end
36
83
 
37
84
  ##
38
- # At least a title or a description is required to be a valid RSS 2.0 item.
39
- def valid?
40
- title = self.title if config.attribute?(:title)
41
- description = self.description if config.attribute?(:description)
42
- [title, description].join != ''
85
+ # Builds the item's GUID by hashing the joined values of the configured guid selectors.
86
+ # @return [String] SHA1 hashed GUID.
87
+ def guid
88
+ content = config.guid_selector_names.flat_map { |method_name| public_send(method_name) }.join
89
+
90
+ Digest::SHA1.hexdigest(content)
43
91
  end
44
92
 
45
93
  ##
46
- # @return [Array]
94
+ # Retrieves categories for the item based on configured category selectors.
95
+ #
96
+ # @return [Array<String>] list of categories.
47
97
  def categories
48
- config.category_selectors.map(&method(:method_missing))
98
+ config.category_selector_names.map { |method_name| public_send(method_name) }
49
99
  end
50
100
 
101
+ ##
102
+ # Checks if the item has an enclosure based on configuration.
103
+ #
104
+ # @return [true, false]
51
105
  def enclosure?
52
- config.attribute?(:enclosure)
106
+ config.selector?(:enclosure)
53
107
  end
54
108
 
55
- def enclosure_url
56
- enclosure = Html2rss::Utils.sanitize_url(method_missing(:enclosure))
57
-
58
- Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url).to_s if enclosure
109
+ ##
110
+ # Retrieves enclosure details for the item.
111
+ #
112
+ # @return [Enclosure] enclosure details.
113
+ def enclosure
114
+ url = enclosure_url
115
+
116
+ raise 'An item.enclosure requires an absolute URL' unless url&.absolute?
117
+
118
+ Enclosure.new(
119
+ type: Html2rss::Utils.guess_content_type_from_url(url),
120
+ bits_length: 0,
121
+ url: url.to_s
122
+ )
59
123
  end
60
124
 
61
125
  ##
62
- # @return [Array]
126
+ # Fetches items from a given URL using configuration settings.
127
+ #
128
+ # @param url [String] URL to fetch items from.
129
+ # @param config [Html2rss::Config] Configuration object.
130
+ # @return [Array<Html2rss::Item>] list of items fetched.
63
131
  def self.from_url(url, config)
64
- body = get_body_from_url(url, config)
132
+ body = Utils.request_body_from_url(url, convert_json_to_xml: config.json?, headers: config.headers)
65
133
 
66
- Nokogiri.HTML(body).css(config.selector(:items))
67
- .map { |xml_item| new xml_item, config }
68
- .keep_if(&:valid?)
134
+ Nokogiri.HTML(body)
135
+ .css(config.selector_string(Config::Selectors::ITEMS_SELECTOR_NAME))
136
+ .map { |xml| new(xml, config) }
137
+ .select(&:valid?)
69
138
  end
70
139
 
71
140
  private
72
141
 
73
- def self.get_body_from_url(url, config)
74
- body = Faraday.new(url: url, headers: config.headers) do |faraday|
75
- faraday.use FaradayMiddleware::FollowRedirects
76
- faraday.adapter Faraday.default_adapter
77
- end.get.body
78
-
79
- config.json? ? Html2rss::Utils.object_to_xml(JSON.parse(body)) : body
80
- end
81
- private_class_method :get_body_from_url
82
-
83
- attr_reader :xml, :config
142
+ # @return [Nokogiri::XML::Element] XML element representing the item.
143
+ attr_reader :xml
144
+ # @return [Html2rss::Config] Configuration object for the item.
145
+ attr_reader :config
84
146
 
147
+ ##
148
+ # Processes the extracted value according to post-processing options.
149
+ #
150
+ # @param value [String] extracted value.
151
+ # @param post_process_options [Hash<Symbol, Object>] post-processing options.
152
+ # @return [String] processed value.
85
153
  def post_process(value, post_process_options)
86
154
  return value unless post_process_options
87
155
 
88
156
  [post_process_options].flatten.each do |options|
89
157
  value = AttributePostProcessors.get_processor(options[:name])
90
- .new(value, options: options, item: self, config: @config)
158
+ .new(value, Context.new(options:, item: self, config:))
91
159
  .get
92
160
  end
93
161
 
94
162
  value
95
163
  end
164
+
165
+ ##
166
+ # Retrieves the URL for the enclosure, sanitizing and ensuring it's absolute.
167
+ #
168
+ # @return [Addressable::URI, nil] absolute URL of the enclosure.
169
+ def enclosure_url
170
+ enclosure = Html2rss::Utils.sanitize_url(extract(:enclosure))
171
+
172
+ Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url) if enclosure
173
+ end
96
174
  end
97
175
  end
@@ -1,9 +1,11 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Html2rss
2
4
  module ItemExtractors
3
5
  ##
4
6
  # Returns the value of the attribute.
5
7
  #
6
- # Imagine this +time+ HTML element with a +datetime+ attribute:
8
+ # Imagine this +time+ HTML tag with a +datetime+ attribute:
7
9
  #
8
10
  # <time datetime="2019-07-01">...</time>
9
11
  #
@@ -18,19 +20,30 @@ module Html2rss
18
20
  # Would return:
19
21
  # '2019-07-01'
20
22
  #
21
- # In case you're extracting a date or a time, do not forget to parse it
22
- # during post processing with
23
- # {AttributePostProcessors::ParseTime}[rdoc-ref:Html2rss::AttributePostProcessors::ParseTime].
23
+ # In case you're extracting a date or a time, consider parsing it
24
+ # during post processing with {AttributePostProcessors::ParseTime}.
24
25
  class Attribute
26
+ # The available options for the attribute extractor.
27
+ Options = Struct.new('AttributeOptions', :selector, :attribute, keyword_init: true)
28
+
29
+ ##
30
+ # Initializes the Attribute extractor.
31
+ #
32
+ # @param xml [Nokogiri::XML::Element]
33
+ # @param options [Options]
25
34
  def initialize(xml, options)
26
35
  @options = options
27
- @element = ItemExtractors.element(xml, options)
36
+ @element = ItemExtractors.element(xml, options.selector)
28
37
  end
29
38
 
30
39
  ##
31
- # @return [String]
40
+ # Retrieves and returns the attribute's value as a string.
41
+ #
42
+ # @return [String] The value of the attribute.
32
43
  def get
33
- @element.attr(@options[:attribute]).to_s
44
+ @element.attr(@options.attribute).to_s.freeze
45
+ rescue NoMethodError => error
46
+ raise "Failed to extract attribute: #{error.message}"
34
47
  end
35
48
  end
36
49
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Html2rss
2
4
  module ItemExtractors
3
5
  ##
@@ -21,15 +23,29 @@ module Html2rss
21
23
  # Would return:
22
24
  # 'http://blog-without-a-feed.example.com/posts/latest-findings'
23
25
  class Href
26
+ # The available options for the href (attribute) extractor.
27
+ Options = Struct.new('HrefOptions', :selector, :channel, keyword_init: true)
28
+
29
+ ##
30
+ # Initializes the Href extractor.
31
+ #
32
+ # @param xml [Nokogiri::XML::Element]
33
+ # @param options [Options]
24
34
  def initialize(xml, options)
25
35
  @options = options
26
- element = ItemExtractors.element(xml, options)
27
- @href = Html2rss::Utils.sanitize_url(element.attr('href'))
36
+ @element = ItemExtractors.element(xml, options.selector)
37
+ @href = @element.attr('href').to_s
28
38
  end
29
39
 
30
- # @return [URI::HTTPS, URI::HTTP]
40
+ ##
41
+ # Retrieves and returns the normalized absolute URL.
42
+ #
43
+ # @return [Addressable::URI, nil] The absolute URL.
31
44
  def get
32
- Html2rss::Utils.build_absolute_url_from_relative(@href, @options[:channel][:url])
45
+ return nil unless @href
46
+
47
+ sanitized_href = Html2rss::Utils.sanitize_url(@href)
48
+ Html2rss::Utils.build_absolute_url_from_relative(sanitized_href, @options.channel.url)
33
49
  end
34
50
  end
35
51
  end