html2rss 0.6.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  require 'rss'
2
- require_relative 'item'
2
+ require 'mime/types'
3
3
 
4
4
  module Html2rss
5
5
  ##
@@ -18,42 +18,64 @@ module Html2rss
18
18
  # @return [RSS:Rss]
19
19
  def rss
20
20
  RSS::Maker.make('2.0') do |maker|
21
- add_channel(maker)
21
+ add_channel(maker.channel)
22
22
 
23
- feed_items.map do |feed_item|
24
- add_item(feed_item, maker.items.new_item)
25
- end
23
+ items.each { |item| add_item(item, maker.items.new_item) }
26
24
  end
27
25
  end
28
26
 
27
+ def self.add_categories(categories, item_maker)
28
+ categories.each { |category| item_maker.categories.new_category.content = category }
29
+ end
30
+
31
+ def self.add_enclosure_from_url(url, item_maker)
32
+ return unless url
33
+
34
+ enclosure = item_maker.enclosure
35
+ content_type = MIME::Types.type_for(File.extname(url).delete('.'))
36
+
37
+ enclosure.type = content_type.any? ? content_type.first.to_s : 'application/octet-stream'
38
+ enclosure.length = 0
39
+ enclosure.url = url
40
+ end
41
+
42
+ def self.add_guid(item, item_maker)
43
+ guid = item_maker.guid
44
+ guid.content = Digest::SHA1.hexdigest(item.title)
45
+ guid.isPermaLink = false
46
+ end
47
+
29
48
  private
30
49
 
31
50
  attr_reader :config
32
51
 
33
- def add_channel(maker)
52
+ def add_channel(channel_maker)
34
53
  %i[language author title description link ttl].each do |attribute_name|
35
- maker.channel.public_send("#{attribute_name}=".to_sym, config.public_send(attribute_name))
54
+ channel_maker.public_send("#{attribute_name}=", config.public_send(attribute_name))
36
55
  end
37
56
 
38
- maker.channel.generator = "html2rss V. #{::Html2rss::VERSION}"
39
- maker.channel.lastBuildDate = Time.now.to_s
57
+ channel_maker.generator = "html2rss V. #{::Html2rss::VERSION}"
58
+ channel_maker.lastBuildDate = Time.now
40
59
  end
41
60
 
42
- def feed_items
43
- @feed_items ||= Item.from_url(config.url, config).keep_if(&:valid?)
44
- end
61
+ def items
62
+ return @items if defined?(@items)
45
63
 
46
- def add_item(feed_item, rss_item)
47
- feed_item.available_attributes.each do |attribute_name|
48
- rss_item.public_send("#{attribute_name}=".to_sym, feed_item.public_send(attribute_name))
49
- end
64
+ items = Item.from_url(config.url, config)
65
+
66
+ items.reverse! if config.items_order == :reverse
67
+
68
+ @items = items
69
+ end
50
70
 
51
- feed_item.categories.each do |category|
52
- rss_item.categories.new_category.content = category
71
+ def add_item(item, item_maker)
72
+ item.available_attributes.each do |attribute_name|
73
+ item_maker.public_send("#{attribute_name}=", item.public_send(attribute_name))
53
74
  end
54
75
 
55
- rss_item.guid.content = Digest::SHA1.hexdigest(feed_item.title)
56
- rss_item.guid.isPermaLink = false
76
+ self.class.add_categories(item.categories, item_maker)
77
+ self.class.add_enclosure_from_url(item.enclosure_url, item_maker) if item.enclosure?
78
+ self.class.add_guid(item, item_maker)
57
79
  end
58
80
  end
59
81
  end
@@ -1,9 +1,6 @@
1
1
  require 'faraday'
2
2
  require 'faraday_middleware'
3
- require 'open-uri'
4
3
  require 'nokogiri'
5
- require_relative 'item_extractors'
6
- require_relative 'attribute_post_processors'
7
4
 
8
5
  module Html2rss
9
6
  ##
@@ -26,15 +23,15 @@ module Html2rss
26
23
 
27
24
  attribute_options = config.attribute_options(method_name)
28
25
 
29
- extractor = ItemExtractors.get_extractor(attribute_options['extractor'])
26
+ extractor = ItemExtractors.get_extractor(attribute_options[:extractor])
30
27
  value = extractor.new(xml, attribute_options).get
31
28
 
32
- post_process(value, attribute_options.fetch('post_process', false))
29
+ post_process(value, attribute_options.fetch(:post_process, false))
33
30
  end
34
31
 
35
32
  def available_attributes
36
- @available_attributes ||= (%w[title link description author comments updated] &
37
- @config.attribute_names) - ['categories']
33
+ @available_attributes ||= (%i[title link description author comments updated] &
34
+ @config.attribute_names) - %i[categories enclosure]
38
35
  end
39
36
 
40
37
  ##
@@ -48,10 +45,17 @@ module Html2rss
48
45
  ##
49
46
  # @return [Array]
50
47
  def categories
51
- categories = config.categories
52
- categories.map!(&method(:method_missing))
53
- categories.uniq!
54
- categories.keep_if { |category| category.to_s != '' }
48
+ config.category_selectors.map(&method(:method_missing))
49
+ end
50
+
51
+ def enclosure?
52
+ config.attribute?(:enclosure)
53
+ end
54
+
55
+ def enclosure_url
56
+ enclosure = Html2rss::Utils.sanitize_url(method_missing(:enclosure))
57
+
58
+ Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url).to_s if enclosure
55
59
  end
56
60
 
57
61
  ##
@@ -59,20 +63,22 @@ module Html2rss
59
63
  def self.from_url(url, config)
60
64
  body = get_body_from_url(url, config)
61
65
 
62
- Nokogiri::HTML(body).css(config.selector('items')).map do |xml_item|
63
- new xml_item, config
64
- end
66
+ Nokogiri.HTML(body).css(config.selector(:items))
67
+ .map { |xml_item| new xml_item, config }
68
+ .keep_if(&:valid?)
65
69
  end
66
70
 
67
71
  private
68
72
 
69
73
  def self.get_body_from_url(url, config)
70
- body = Faraday.new(url: url, headers: config.headers) do |faraday|
74
+ request = Faraday.new(url: url, headers: config.headers) do |faraday|
71
75
  faraday.use FaradayMiddleware::FollowRedirects
72
76
  faraday.adapter Faraday.default_adapter
73
- end.get.body
77
+ end
78
+
79
+ body = request.get.body
74
80
 
75
- config.json? ? Html2rss::Utils.hash_to_xml(JSON.parse(body)) : body
81
+ config.json? ? Html2rss::Utils.object_to_xml(JSON.parse(body)) : body
76
82
  end
77
83
  private_class_method :get_body_from_url
78
84
 
@@ -82,7 +88,7 @@ module Html2rss
82
88
  return value unless post_process_options
83
89
 
84
90
  [post_process_options].flatten.each do |options|
85
- value = AttributePostProcessors.get_processor(options['name'])
91
+ value = AttributePostProcessors.get_processor(options[:name])
86
92
  .new(value, options: options, item: self, config: @config)
87
93
  .get
88
94
  end
@@ -1,29 +1,22 @@
1
- require_relative 'item_extractors/attribute'
2
- require_relative 'item_extractors/current_time'
3
- require_relative 'item_extractors/href'
4
- require_relative 'item_extractors/html'
5
- require_relative 'item_extractors/static'
6
- require_relative 'item_extractors/text'
7
-
8
1
  module Html2rss
9
2
  ##
10
3
  # Provides a namespace for item extractors.
11
4
  module ItemExtractors
12
5
  DEFAULT = 'text'.freeze
6
+ private_constant :DEFAULT
13
7
 
14
8
  def self.get_extractor(name)
15
- @extractors = Hash.new do |hash, key|
16
- camel_cased_name = key.split('_').map(&:capitalize).join
17
- class_name = ['Html2rss', 'ItemExtractors', camel_cased_name].join('::')
9
+ @get_extractor ||= Hash.new do |extractors, key|
10
+ extractors[key] = Utils.get_class_from_name(key || DEFAULT, 'ItemExtractors')
11
+ end
18
12
 
19
- hash[key] = Object.const_get(class_name)
20
- end[name || DEFAULT]
13
+ @get_extractor[name]
21
14
  end
22
15
 
23
16
  ##
24
17
  # @return [Nokogiri::XML::Element]
25
18
  def self.element(xml, options)
26
- selector = options['selector']
19
+ selector = options[:selector]
27
20
  selector ? xml.css(selector) : xml
28
21
  end
29
22
  end
@@ -30,7 +30,7 @@ module Html2rss
30
30
  ##
31
31
  # @return [String]
32
32
  def get
33
- @element.attr(@options['attribute']).to_s
33
+ @element.attr(@options[:attribute]).to_s
34
34
  end
35
35
  end
36
36
  end
@@ -24,12 +24,12 @@ module Html2rss
24
24
  def initialize(xml, options)
25
25
  @options = options
26
26
  element = ItemExtractors.element(xml, options)
27
- @href = element.attr('href').to_s
27
+ @href = Html2rss::Utils.sanitize_url(element.attr('href'))
28
28
  end
29
29
 
30
30
  # @return [URI::HTTPS, URI::HTTP]
31
31
  def get
32
- Html2rss::Utils.build_absolute_url_from_relative(@href, @options['channel']['url'])
32
+ Html2rss::Utils.build_absolute_url_from_relative(@href, @options[:channel][:url])
33
33
  end
34
34
  end
35
35
  end
@@ -15,13 +15,13 @@ module Html2rss
15
15
  @options = options
16
16
  end
17
17
 
18
- # Returns what options['static'] holds.
18
+ # Returns what options[:static] holds.
19
19
  #
20
20
  # options = { static: 'Foobar' }
21
21
  # Static.new(xml, options).get
22
22
  # # => 'Foobar'
23
23
  def get
24
- @options['static']
24
+ @options[:static]
25
25
  end
26
26
  end
27
27
  end
@@ -1,6 +1,6 @@
1
1
  require 'active_support/core_ext/hash'
2
+ require 'addressable/uri'
2
3
  require 'builder'
3
- require 'hashie'
4
4
  require 'json'
5
5
  require 'nokogiri'
6
6
 
@@ -8,27 +8,33 @@ module Html2rss
8
8
  ##
9
9
  # The collecting tank for utility methods.
10
10
  module Utils
11
- ##
12
- # A Hash with indifferent access, build with {https://github.com/intridea/hashie Hashie}.
13
- class IndifferentAccessHash < Hash
14
- include Hashie::Extensions::MergeInitializer
15
- include Hashie::Extensions::IndifferentAccess
16
- end
17
-
18
- def self.build_absolute_url_from_relative(url, channel_url)
11
+ def self.build_absolute_url_from_relative(url, base_url)
19
12
  url = URI(url) if url.is_a?(String)
20
13
 
21
14
  return url if url.absolute?
22
15
 
23
- URI(channel_url).tap do |uri|
16
+ URI(base_url).tap do |uri|
24
17
  uri.path = url.path.to_s.start_with?('/') ? url.path : "/#{url.path}"
25
18
  uri.query = url.query
26
19
  uri.fragment = url.fragment if url.fragment
27
20
  end
28
21
  end
29
22
 
30
- def self.hash_to_xml(hash)
31
- hash.to_xml(root: :html, skip_instruct: true, skip_types: true)
23
+ def self.object_to_xml(object)
24
+ object.to_xml(skip_instruct: true, skip_types: true)
25
+ end
26
+
27
+ def self.get_class_from_name(snake_cased_name, module_name)
28
+ camel_cased_name = snake_cased_name.split('_').map(&:capitalize).join
29
+ class_name = ['Html2rss', module_name, camel_cased_name].join('::')
30
+ Object.const_get(class_name)
31
+ end
32
+
33
+ def self.sanitize_url(url)
34
+ squished_url = url.to_s.split(' ').join
35
+ return if squished_url.to_s == ''
36
+
37
+ Addressable::URI.parse(squished_url).normalize.to_s
32
38
  end
33
39
  end
34
40
  end
@@ -1,3 +1,4 @@
1
1
  module Html2rss
2
- VERSION = '0.6.0'.freeze
2
+ VERSION = '0.9.0'.freeze
3
+ public_constant :VERSION
3
4
  end
metadata CHANGED
@@ -1,29 +1,49 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-10-05 00:00:00.000000000 Z
11
+ date: 2020-06-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '5'
20
+ - - "<"
21
+ - !ruby/object:Gem::Version
22
+ version: '7'
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ version: '5'
30
+ - - "<"
31
+ - !ruby/object:Gem::Version
32
+ version: '7'
33
+ - !ruby/object:Gem::Dependency
34
+ name: addressable
15
35
  requirement: !ruby/object:Gem::Requirement
16
36
  requirements:
17
37
  - - "~>"
18
38
  - !ruby/object:Gem::Version
19
- version: '5.0'
39
+ version: '2.7'
20
40
  type: :runtime
21
41
  prerelease: false
22
42
  version_requirements: !ruby/object:Gem::Requirement
23
43
  requirements:
24
44
  - - "~>"
25
45
  - !ruby/object:Gem::Version
26
- version: '5.0'
46
+ version: '2.7'
27
47
  - !ruby/object:Gem::Dependency
28
48
  name: builder
29
49
  requirement: !ruby/object:Gem::Requirement
@@ -44,42 +64,56 @@ dependencies:
44
64
  requirements:
45
65
  - - "~>"
46
66
  - !ruby/object:Gem::Version
47
- version: '0.15'
67
+ version: '1.0'
48
68
  type: :runtime
49
69
  prerelease: false
50
70
  version_requirements: !ruby/object:Gem::Requirement
51
71
  requirements:
52
72
  - - "~>"
53
73
  - !ruby/object:Gem::Version
54
- version: '0.15'
74
+ version: '1.0'
55
75
  - !ruby/object:Gem::Dependency
56
76
  name: faraday_middleware
57
77
  requirement: !ruby/object:Gem::Requirement
58
78
  requirements:
59
- - - "~>"
79
+ - - ">="
60
80
  - !ruby/object:Gem::Version
61
- version: '0.13'
81
+ version: '0'
62
82
  type: :runtime
63
83
  prerelease: false
64
84
  version_requirements: !ruby/object:Gem::Requirement
65
85
  requirements:
66
- - - "~>"
86
+ - - ">="
67
87
  - !ruby/object:Gem::Version
68
- version: '0.13'
88
+ version: '0'
69
89
  - !ruby/object:Gem::Dependency
70
- name: hashie
90
+ name: kramdown
71
91
  requirement: !ruby/object:Gem::Requirement
72
92
  requirements:
73
- - - "~>"
93
+ - - ">="
74
94
  - !ruby/object:Gem::Version
75
- version: '3.6'
95
+ version: '0'
76
96
  type: :runtime
77
97
  prerelease: false
78
98
  version_requirements: !ruby/object:Gem::Requirement
79
99
  requirements:
80
- - - "~>"
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ - !ruby/object:Gem::Dependency
104
+ name: mime-types
105
+ requirement: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">"
108
+ - !ruby/object:Gem::Version
109
+ version: '3.0'
110
+ type: :runtime
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">"
81
115
  - !ruby/object:Gem::Version
82
- version: '3.6'
116
+ version: '3.0'
83
117
  - !ruby/object:Gem::Dependency
84
118
  name: nokogiri
85
119
  requirement: !ruby/object:Gem::Requirement
@@ -106,14 +140,14 @@ dependencies:
106
140
  requirements:
107
141
  - - "~>"
108
142
  - !ruby/object:Gem::Version
109
- version: '1.3'
143
+ version: '2.0'
110
144
  type: :runtime
111
145
  prerelease: false
112
146
  version_requirements: !ruby/object:Gem::Requirement
113
147
  requirements:
114
148
  - - "~>"
115
149
  - !ruby/object:Gem::Version
116
- version: '1.3'
150
+ version: '2.0'
117
151
  - !ruby/object:Gem::Dependency
118
152
  name: sanitize
119
153
  requirement: !ruby/object:Gem::Requirement
@@ -128,20 +162,48 @@ dependencies:
128
162
  - - "~>"
129
163
  - !ruby/object:Gem::Version
130
164
  version: '5.0'
165
+ - !ruby/object:Gem::Dependency
166
+ name: to_regexp
167
+ requirement: !ruby/object:Gem::Requirement
168
+ requirements:
169
+ - - ">="
170
+ - !ruby/object:Gem::Version
171
+ version: '0'
172
+ type: :runtime
173
+ prerelease: false
174
+ version_requirements: !ruby/object:Gem::Requirement
175
+ requirements:
176
+ - - ">="
177
+ - !ruby/object:Gem::Version
178
+ version: '0'
179
+ - !ruby/object:Gem::Dependency
180
+ name: zeitwerk
181
+ requirement: !ruby/object:Gem::Requirement
182
+ requirements:
183
+ - - ">="
184
+ - !ruby/object:Gem::Version
185
+ version: '0'
186
+ type: :runtime
187
+ prerelease: false
188
+ version_requirements: !ruby/object:Gem::Requirement
189
+ requirements:
190
+ - - ">="
191
+ - !ruby/object:Gem::Version
192
+ version: '0'
131
193
  - !ruby/object:Gem::Dependency
132
194
  name: bundler
133
195
  requirement: !ruby/object:Gem::Requirement
134
196
  requirements:
135
- - - "~>"
197
+ - - ">="
136
198
  - !ruby/object:Gem::Version
137
- version: '1.16'
199
+ version: '0'
138
200
  type: :development
139
201
  prerelease: false
140
202
  version_requirements: !ruby/object:Gem::Requirement
141
203
  requirements:
142
- - - "~>"
204
+ - - ">="
143
205
  - !ruby/object:Gem::Version
144
- version: '1.16'
206
+ version: '0'
145
207
  - !ruby/object:Gem::Dependency
146
208
  name: byebug
147
209
  requirement: !ruby/object:Gem::Requirement
@@ -277,7 +339,9 @@ files:
277
339
  - html2rss.gemspec
278
340
  - lib/html2rss.rb
279
341
  - lib/html2rss/attribute_post_processors.rb
342
+ - lib/html2rss/attribute_post_processors/gsub.rb
280
343
  - lib/html2rss/attribute_post_processors/html_to_markdown.rb
344
+ - lib/html2rss/attribute_post_processors/markdown_to_html.rb
281
345
  - lib/html2rss/attribute_post_processors/parse_time.rb
282
346
  - lib/html2rss/attribute_post_processors/parse_uri.rb
283
347
  - lib/html2rss/attribute_post_processors/sanitize_html.rb
@@ -310,14 +374,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
310
374
  requirements:
311
375
  - - ">="
312
376
  - !ruby/object:Gem::Version
313
- version: 2.4.0
377
+ version: 2.5.0
314
378
  required_rubygems_version: !ruby/object:Gem::Requirement
315
379
  requirements:
316
380
  - - ">="
317
381
  - !ruby/object:Gem::Version
318
382
  version: '0'
319
383
  requirements: []
320
- rubygems_version: 3.0.6
384
+ rubyforge_project:
385
+ rubygems_version: 2.7.7
321
386
  signing_key:
322
387
  specification_version: 4
323
388
  summary: Returns an RSS::Rss object by scraping a URL.