html2rss 0.6.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,5 +1,5 @@
1
1
  require 'rss'
2
- require_relative 'item'
2
+ require 'mime/types'
3
3
 
4
4
  module Html2rss
5
5
  ##
@@ -18,42 +18,64 @@ module Html2rss
18
18
  # @return [RSS:Rss]
19
19
  def rss
20
20
  RSS::Maker.make('2.0') do |maker|
21
- add_channel(maker)
21
+ add_channel(maker.channel)
22
22
 
23
- feed_items.map do |feed_item|
24
- add_item(feed_item, maker.items.new_item)
25
- end
23
+ items.each { |item| add_item(item, maker.items.new_item) }
26
24
  end
27
25
  end
28
26
 
27
+ def self.add_categories(categories, item_maker)
28
+ categories.each { |category| item_maker.categories.new_category.content = category }
29
+ end
30
+
31
+ def self.add_enclosure_from_url(url, item_maker)
32
+ return unless url
33
+
34
+ enclosure = item_maker.enclosure
35
+ content_type = MIME::Types.type_for(File.extname(url).delete('.'))
36
+
37
+ enclosure.type = content_type.any? ? content_type.first.to_s : 'application/octet-stream'
38
+ enclosure.length = 0
39
+ enclosure.url = url
40
+ end
41
+
42
+ def self.add_guid(item, item_maker)
43
+ guid = item_maker.guid
44
+ guid.content = Digest::SHA1.hexdigest(item.title)
45
+ guid.isPermaLink = false
46
+ end
47
+
29
48
  private
30
49
 
31
50
  attr_reader :config
32
51
 
33
- def add_channel(maker)
52
+ def add_channel(channel_maker)
34
53
  %i[language author title description link ttl].each do |attribute_name|
35
- maker.channel.public_send("#{attribute_name}=".to_sym, config.public_send(attribute_name))
54
+ channel_maker.public_send("#{attribute_name}=", config.public_send(attribute_name))
36
55
  end
37
56
 
38
- maker.channel.generator = "html2rss V. #{::Html2rss::VERSION}"
39
- maker.channel.lastBuildDate = Time.now.to_s
57
+ channel_maker.generator = "html2rss V. #{::Html2rss::VERSION}"
58
+ channel_maker.lastBuildDate = Time.now
40
59
  end
41
60
 
42
- def feed_items
43
- @feed_items ||= Item.from_url(config.url, config).keep_if(&:valid?)
44
- end
61
+ def items
62
+ return @items if defined?(@items)
45
63
 
46
- def add_item(feed_item, rss_item)
47
- feed_item.available_attributes.each do |attribute_name|
48
- rss_item.public_send("#{attribute_name}=".to_sym, feed_item.public_send(attribute_name))
49
- end
64
+ items = Item.from_url(config.url, config)
65
+
66
+ items.reverse! if config.items_order == :reverse
67
+
68
+ @items = items
69
+ end
50
70
 
51
- feed_item.categories.each do |category|
52
- rss_item.categories.new_category.content = category
71
+ def add_item(item, item_maker)
72
+ item.available_attributes.each do |attribute_name|
73
+ item_maker.public_send("#{attribute_name}=", item.public_send(attribute_name))
53
74
  end
54
75
 
55
- rss_item.guid.content = Digest::SHA1.hexdigest(feed_item.title)
56
- rss_item.guid.isPermaLink = false
76
+ self.class.add_categories(item.categories, item_maker)
77
+ self.class.add_enclosure_from_url(item.enclosure_url, item_maker) if item.enclosure?
78
+ self.class.add_guid(item, item_maker)
57
79
  end
58
80
  end
59
81
  end
@@ -1,9 +1,6 @@
1
1
  require 'faraday'
2
2
  require 'faraday_middleware'
3
- require 'open-uri'
4
3
  require 'nokogiri'
5
- require_relative 'item_extractors'
6
- require_relative 'attribute_post_processors'
7
4
 
8
5
  module Html2rss
9
6
  ##
@@ -26,15 +23,15 @@ module Html2rss
26
23
 
27
24
  attribute_options = config.attribute_options(method_name)
28
25
 
29
- extractor = ItemExtractors.get_extractor(attribute_options['extractor'])
26
+ extractor = ItemExtractors.get_extractor(attribute_options[:extractor])
30
27
  value = extractor.new(xml, attribute_options).get
31
28
 
32
- post_process(value, attribute_options.fetch('post_process', false))
29
+ post_process(value, attribute_options.fetch(:post_process, false))
33
30
  end
34
31
 
35
32
  def available_attributes
36
- @available_attributes ||= (%w[title link description author comments updated] &
37
- @config.attribute_names) - ['categories']
33
+ @available_attributes ||= (%i[title link description author comments updated] &
34
+ @config.attribute_names) - %i[categories enclosure]
38
35
  end
39
36
 
40
37
  ##
@@ -48,10 +45,17 @@ module Html2rss
48
45
  ##
49
46
  # @return [Array]
50
47
  def categories
51
- categories = config.categories
52
- categories.map!(&method(:method_missing))
53
- categories.uniq!
54
- categories.keep_if { |category| category.to_s != '' }
48
+ config.category_selectors.map(&method(:method_missing))
49
+ end
50
+
51
+ def enclosure?
52
+ config.attribute?(:enclosure)
53
+ end
54
+
55
+ def enclosure_url
56
+ enclosure = Html2rss::Utils.sanitize_url(method_missing(:enclosure))
57
+
58
+ Html2rss::Utils.build_absolute_url_from_relative(enclosure, config.url).to_s if enclosure
55
59
  end
56
60
 
57
61
  ##
@@ -59,20 +63,22 @@ module Html2rss
59
63
  def self.from_url(url, config)
60
64
  body = get_body_from_url(url, config)
61
65
 
62
- Nokogiri::HTML(body).css(config.selector('items')).map do |xml_item|
63
- new xml_item, config
64
- end
66
+ Nokogiri.HTML(body).css(config.selector(:items))
67
+ .map { |xml_item| new xml_item, config }
68
+ .keep_if(&:valid?)
65
69
  end
66
70
 
67
71
  private
68
72
 
69
73
  def self.get_body_from_url(url, config)
70
- body = Faraday.new(url: url, headers: config.headers) do |faraday|
74
+ request = Faraday.new(url: url, headers: config.headers) do |faraday|
71
75
  faraday.use FaradayMiddleware::FollowRedirects
72
76
  faraday.adapter Faraday.default_adapter
73
- end.get.body
77
+ end
78
+
79
+ body = request.get.body
74
80
 
75
- config.json? ? Html2rss::Utils.hash_to_xml(JSON.parse(body)) : body
81
+ config.json? ? Html2rss::Utils.object_to_xml(JSON.parse(body)) : body
76
82
  end
77
83
  private_class_method :get_body_from_url
78
84
 
@@ -82,7 +88,7 @@ module Html2rss
82
88
  return value unless post_process_options
83
89
 
84
90
  [post_process_options].flatten.each do |options|
85
- value = AttributePostProcessors.get_processor(options['name'])
91
+ value = AttributePostProcessors.get_processor(options[:name])
86
92
  .new(value, options: options, item: self, config: @config)
87
93
  .get
88
94
  end
@@ -1,29 +1,22 @@
1
- require_relative 'item_extractors/attribute'
2
- require_relative 'item_extractors/current_time'
3
- require_relative 'item_extractors/href'
4
- require_relative 'item_extractors/html'
5
- require_relative 'item_extractors/static'
6
- require_relative 'item_extractors/text'
7
-
8
1
  module Html2rss
9
2
  ##
10
3
  # Provides a namespace for item extractors.
11
4
  module ItemExtractors
12
5
  DEFAULT = 'text'.freeze
6
+ private_constant :DEFAULT
13
7
 
14
8
  def self.get_extractor(name)
15
- @extractors = Hash.new do |hash, key|
16
- camel_cased_name = key.split('_').map(&:capitalize).join
17
- class_name = ['Html2rss', 'ItemExtractors', camel_cased_name].join('::')
9
+ @get_extractor ||= Hash.new do |extractors, key|
10
+ extractors[key] = Utils.get_class_from_name(key || DEFAULT, 'ItemExtractors')
11
+ end
18
12
 
19
- hash[key] = Object.const_get(class_name)
20
- end[name || DEFAULT]
13
+ @get_extractor[name]
21
14
  end
22
15
 
23
16
  ##
24
17
  # @return [Nokogiri::XML::Element]
25
18
  def self.element(xml, options)
26
- selector = options['selector']
19
+ selector = options[:selector]
27
20
  selector ? xml.css(selector) : xml
28
21
  end
29
22
  end
@@ -30,7 +30,7 @@ module Html2rss
30
30
  ##
31
31
  # @return [String]
32
32
  def get
33
- @element.attr(@options['attribute']).to_s
33
+ @element.attr(@options[:attribute]).to_s
34
34
  end
35
35
  end
36
36
  end
@@ -24,12 +24,12 @@ module Html2rss
24
24
  def initialize(xml, options)
25
25
  @options = options
26
26
  element = ItemExtractors.element(xml, options)
27
- @href = element.attr('href').to_s
27
+ @href = Html2rss::Utils.sanitize_url(element.attr('href'))
28
28
  end
29
29
 
30
30
  # @return [URI::HTTPS, URI::HTTP]
31
31
  def get
32
- Html2rss::Utils.build_absolute_url_from_relative(@href, @options['channel']['url'])
32
+ Html2rss::Utils.build_absolute_url_from_relative(@href, @options[:channel][:url])
33
33
  end
34
34
  end
35
35
  end
@@ -15,13 +15,13 @@ module Html2rss
15
15
  @options = options
16
16
  end
17
17
 
18
- # Returns what options['static'] holds.
18
+ # Returns what options[:static] holds.
19
19
  #
20
20
  # options = { static: 'Foobar' }
21
21
  # Static.new(xml, options).get
22
22
  # # => 'Foobar'
23
23
  def get
24
- @options['static']
24
+ @options[:static]
25
25
  end
26
26
  end
27
27
  end
@@ -1,6 +1,6 @@
1
1
  require 'active_support/core_ext/hash'
2
+ require 'addressable/uri'
2
3
  require 'builder'
3
- require 'hashie'
4
4
  require 'json'
5
5
  require 'nokogiri'
6
6
 
@@ -8,27 +8,33 @@ module Html2rss
8
8
  ##
9
9
  # The collecting tank for utility methods.
10
10
  module Utils
11
- ##
12
- # A Hash with indifferent access, build with {https://github.com/intridea/hashie Hashie}.
13
- class IndifferentAccessHash < Hash
14
- include Hashie::Extensions::MergeInitializer
15
- include Hashie::Extensions::IndifferentAccess
16
- end
17
-
18
- def self.build_absolute_url_from_relative(url, channel_url)
11
+ def self.build_absolute_url_from_relative(url, base_url)
19
12
  url = URI(url) if url.is_a?(String)
20
13
 
21
14
  return url if url.absolute?
22
15
 
23
- URI(channel_url).tap do |uri|
16
+ URI(base_url).tap do |uri|
24
17
  uri.path = url.path.to_s.start_with?('/') ? url.path : "/#{url.path}"
25
18
  uri.query = url.query
26
19
  uri.fragment = url.fragment if url.fragment
27
20
  end
28
21
  end
29
22
 
30
- def self.hash_to_xml(hash)
31
- hash.to_xml(root: :html, skip_instruct: true, skip_types: true)
23
+ def self.object_to_xml(object)
24
+ object.to_xml(skip_instruct: true, skip_types: true)
25
+ end
26
+
27
+ def self.get_class_from_name(snake_cased_name, module_name)
28
+ camel_cased_name = snake_cased_name.split('_').map(&:capitalize).join
29
+ class_name = ['Html2rss', module_name, camel_cased_name].join('::')
30
+ Object.const_get(class_name)
31
+ end
32
+
33
+ def self.sanitize_url(url)
34
+ squished_url = url.to_s.split(' ').join
35
+ return if squished_url.to_s == ''
36
+
37
+ Addressable::URI.parse(squished_url).normalize.to_s
32
38
  end
33
39
  end
34
40
  end
@@ -1,3 +1,4 @@
1
1
  module Html2rss
2
- VERSION = '0.6.0'.freeze
2
+ VERSION = '0.9.0'.freeze
3
+ public_constant :VERSION
3
4
  end
metadata CHANGED
@@ -1,29 +1,49 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-10-05 00:00:00.000000000 Z
11
+ date: 2020-06-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '5'
20
+ - - "<"
21
+ - !ruby/object:Gem::Version
22
+ version: '7'
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ version: '5'
30
+ - - "<"
31
+ - !ruby/object:Gem::Version
32
+ version: '7'
33
+ - !ruby/object:Gem::Dependency
34
+ name: addressable
15
35
  requirement: !ruby/object:Gem::Requirement
16
36
  requirements:
17
37
  - - "~>"
18
38
  - !ruby/object:Gem::Version
19
- version: '5.0'
39
+ version: '2.7'
20
40
  type: :runtime
21
41
  prerelease: false
22
42
  version_requirements: !ruby/object:Gem::Requirement
23
43
  requirements:
24
44
  - - "~>"
25
45
  - !ruby/object:Gem::Version
26
- version: '5.0'
46
+ version: '2.7'
27
47
  - !ruby/object:Gem::Dependency
28
48
  name: builder
29
49
  requirement: !ruby/object:Gem::Requirement
@@ -44,42 +64,56 @@ dependencies:
44
64
  requirements:
45
65
  - - "~>"
46
66
  - !ruby/object:Gem::Version
47
- version: '0.15'
67
+ version: '1.0'
48
68
  type: :runtime
49
69
  prerelease: false
50
70
  version_requirements: !ruby/object:Gem::Requirement
51
71
  requirements:
52
72
  - - "~>"
53
73
  - !ruby/object:Gem::Version
54
- version: '0.15'
74
+ version: '1.0'
55
75
  - !ruby/object:Gem::Dependency
56
76
  name: faraday_middleware
57
77
  requirement: !ruby/object:Gem::Requirement
58
78
  requirements:
59
- - - "~>"
79
+ - - ">="
60
80
  - !ruby/object:Gem::Version
61
- version: '0.13'
81
+ version: '0'
62
82
  type: :runtime
63
83
  prerelease: false
64
84
  version_requirements: !ruby/object:Gem::Requirement
65
85
  requirements:
66
- - - "~>"
86
+ - - ">="
67
87
  - !ruby/object:Gem::Version
68
- version: '0.13'
88
+ version: '0'
69
89
  - !ruby/object:Gem::Dependency
70
- name: hashie
90
+ name: kramdown
71
91
  requirement: !ruby/object:Gem::Requirement
72
92
  requirements:
73
- - - "~>"
93
+ - - ">="
74
94
  - !ruby/object:Gem::Version
75
- version: '3.6'
95
+ version: '0'
76
96
  type: :runtime
77
97
  prerelease: false
78
98
  version_requirements: !ruby/object:Gem::Requirement
79
99
  requirements:
80
- - - "~>"
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ - !ruby/object:Gem::Dependency
104
+ name: mime-types
105
+ requirement: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">"
108
+ - !ruby/object:Gem::Version
109
+ version: '3.0'
110
+ type: :runtime
111
+ prerelease: false
112
+ version_requirements: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - ">"
81
115
  - !ruby/object:Gem::Version
82
- version: '3.6'
116
+ version: '3.0'
83
117
  - !ruby/object:Gem::Dependency
84
118
  name: nokogiri
85
119
  requirement: !ruby/object:Gem::Requirement
@@ -106,14 +140,14 @@ dependencies:
106
140
  requirements:
107
141
  - - "~>"
108
142
  - !ruby/object:Gem::Version
109
- version: '1.3'
143
+ version: '2.0'
110
144
  type: :runtime
111
145
  prerelease: false
112
146
  version_requirements: !ruby/object:Gem::Requirement
113
147
  requirements:
114
148
  - - "~>"
115
149
  - !ruby/object:Gem::Version
116
- version: '1.3'
150
+ version: '2.0'
117
151
  - !ruby/object:Gem::Dependency
118
152
  name: sanitize
119
153
  requirement: !ruby/object:Gem::Requirement
@@ -128,20 +162,48 @@ dependencies:
128
162
  - - "~>"
129
163
  - !ruby/object:Gem::Version
130
164
  version: '5.0'
165
+ - !ruby/object:Gem::Dependency
166
+ name: to_regexp
167
+ requirement: !ruby/object:Gem::Requirement
168
+ requirements:
169
+ - - ">="
170
+ - !ruby/object:Gem::Version
171
+ version: '0'
172
+ type: :runtime
173
+ prerelease: false
174
+ version_requirements: !ruby/object:Gem::Requirement
175
+ requirements:
176
+ - - ">="
177
+ - !ruby/object:Gem::Version
178
+ version: '0'
179
+ - !ruby/object:Gem::Dependency
180
+ name: zeitwerk
181
+ requirement: !ruby/object:Gem::Requirement
182
+ requirements:
183
+ - - ">="
184
+ - !ruby/object:Gem::Version
185
+ version: '0'
186
+ type: :runtime
187
+ prerelease: false
188
+ version_requirements: !ruby/object:Gem::Requirement
189
+ requirements:
190
+ - - ">="
191
+ - !ruby/object:Gem::Version
192
+ version: '0'
131
193
  - !ruby/object:Gem::Dependency
132
194
  name: bundler
133
195
  requirement: !ruby/object:Gem::Requirement
134
196
  requirements:
135
- - - "~>"
197
+ - - ">="
136
198
  - !ruby/object:Gem::Version
137
- version: '1.16'
199
+ version: '0'
138
200
  type: :development
139
201
  prerelease: false
140
202
  version_requirements: !ruby/object:Gem::Requirement
141
203
  requirements:
142
- - - "~>"
204
+ - - ">="
143
205
  - !ruby/object:Gem::Version
144
- version: '1.16'
206
+ version: '0'
145
207
  - !ruby/object:Gem::Dependency
146
208
  name: byebug
147
209
  requirement: !ruby/object:Gem::Requirement
@@ -277,7 +339,9 @@ files:
277
339
  - html2rss.gemspec
278
340
  - lib/html2rss.rb
279
341
  - lib/html2rss/attribute_post_processors.rb
342
+ - lib/html2rss/attribute_post_processors/gsub.rb
280
343
  - lib/html2rss/attribute_post_processors/html_to_markdown.rb
344
+ - lib/html2rss/attribute_post_processors/markdown_to_html.rb
281
345
  - lib/html2rss/attribute_post_processors/parse_time.rb
282
346
  - lib/html2rss/attribute_post_processors/parse_uri.rb
283
347
  - lib/html2rss/attribute_post_processors/sanitize_html.rb
@@ -310,14 +374,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
310
374
  requirements:
311
375
  - - ">="
312
376
  - !ruby/object:Gem::Version
313
- version: 2.4.0
377
+ version: 2.5.0
314
378
  required_rubygems_version: !ruby/object:Gem::Requirement
315
379
  requirements:
316
380
  - - ">="
317
381
  - !ruby/object:Gem::Version
318
382
  version: '0'
319
383
  requirements: []
320
- rubygems_version: 3.0.6
384
+ rubyforge_project:
385
+ rubygems_version: 2.7.7
321
386
  signing_key:
322
387
  specification_version: 4
323
388
  summary: Returns an RSS::Rss object by scraping a URL.