html2rss 0.3.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +18 -11
  3. data/.travis.yml +3 -3
  4. data/.yardopts +6 -0
  5. data/Gemfile.lock +23 -5
  6. data/README.md +2 -1
  7. data/docs/Html2rss.html +353 -0
  8. data/docs/Html2rss/AttributePostProcessors.html +203 -0
  9. data/docs/Html2rss/AttributePostProcessors/ParseTime.html +332 -0
  10. data/docs/Html2rss/AttributePostProcessors/ParseUri.html +314 -0
  11. data/docs/Html2rss/AttributePostProcessors/SanitizeHtml.html +346 -0
  12. data/docs/Html2rss/AttributePostProcessors/Substring.html +321 -0
  13. data/docs/Html2rss/AttributePostProcessors/Template.html +336 -0
  14. data/docs/Html2rss/Config.html +795 -0
  15. data/docs/Html2rss/FeedBuilder.html +295 -0
  16. data/docs/Html2rss/Item.html +654 -0
  17. data/docs/Html2rss/ItemExtractors.html +297 -0
  18. data/docs/Html2rss/ItemExtractors/Attribute.html +317 -0
  19. data/docs/Html2rss/ItemExtractors/CurrentTime.html +297 -0
  20. data/docs/Html2rss/ItemExtractors/Href.html +319 -0
  21. data/docs/Html2rss/ItemExtractors/Html.html +314 -0
  22. data/docs/Html2rss/ItemExtractors/Static.html +301 -0
  23. data/docs/Html2rss/ItemExtractors/Text.html +312 -0
  24. data/docs/Html2rss/Utils.html +115 -0
  25. data/docs/Html2rss/Utils/IndifferentAccessHash.html +142 -0
  26. data/docs/_index.html +300 -0
  27. data/docs/class_list.html +51 -0
  28. data/docs/css/common.css +1 -0
  29. data/docs/css/full_list.css +58 -0
  30. data/docs/css/style.css +496 -0
  31. data/docs/file.README.html +135 -0
  32. data/docs/file_list.html +56 -0
  33. data/docs/frames.html +17 -0
  34. data/docs/index.html +135 -0
  35. data/docs/js/app.js +303 -0
  36. data/docs/js/full_list.js +216 -0
  37. data/docs/js/jquery.js +4 -0
  38. data/docs/method_list.html +435 -0
  39. data/docs/top-level-namespace.html +110 -0
  40. data/html2rss.gemspec +3 -0
  41. data/lib/html2rss.rb +19 -4
  42. data/lib/html2rss/attribute_post_processors.rb +5 -3
  43. data/lib/html2rss/attribute_post_processors/parse_time.rb +29 -3
  44. data/lib/html2rss/attribute_post_processors/parse_uri.rb +20 -1
  45. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +65 -3
  46. data/lib/html2rss/attribute_post_processors/substring.rb +24 -3
  47. data/lib/html2rss/attribute_post_processors/template.rb +37 -10
  48. data/lib/html2rss/config.rb +11 -12
  49. data/lib/html2rss/feed_builder.rb +8 -6
  50. data/lib/html2rss/item.rb +28 -19
  51. data/lib/html2rss/item_extractors.rb +29 -0
  52. data/lib/html2rss/item_extractors/attribute.rb +37 -0
  53. data/lib/html2rss/item_extractors/current_time.rb +21 -0
  54. data/lib/html2rss/item_extractors/href.rb +36 -0
  55. data/lib/html2rss/item_extractors/html.rb +34 -0
  56. data/lib/html2rss/item_extractors/static.rb +28 -0
  57. data/lib/html2rss/item_extractors/text.rb +32 -0
  58. data/lib/html2rss/utils.rb +25 -0
  59. data/lib/html2rss/version.rb +1 -1
  60. metadata +88 -4
  61. data/lib/html2rss/item_extractor.rb +0 -37
@@ -1,18 +1,9 @@
1
- require 'hashie'
2
-
3
1
  module Html2rss
4
2
  class Config
5
- attr_reader :feed_config, :channel_config, :global_config
6
-
7
- class IndifferentAccessHash < Hash
8
- include Hashie::Extensions::MergeInitializer
9
- include Hashie::Extensions::IndifferentAccess
10
- end
11
-
12
3
  def initialize(feed_config, global_config = {})
13
- @global_config = IndifferentAccessHash.new global_config
14
- @feed_config = IndifferentAccessHash.new feed_config
15
- @channel_config = IndifferentAccessHash.new @feed_config.fetch('channel', {})
4
+ @global_config = Utils::IndifferentAccessHash.new global_config
5
+ @feed_config = Utils::IndifferentAccessHash.new feed_config
6
+ @channel_config = Utils::IndifferentAccessHash.new @feed_config.fetch('channel', {})
16
7
  end
17
8
 
18
9
  def author
@@ -40,6 +31,10 @@ module Html2rss
40
31
  end
41
32
  alias link url
42
33
 
34
+ def time_zone
35
+ channel_config.fetch 'time_zone', 'UTC'
36
+ end
37
+
43
38
  def headers
44
39
  global_config.fetch('headers', {})
45
40
  end
@@ -61,5 +56,9 @@ module Html2rss
61
56
  attribute_names.delete('items')
62
57
  attribute_names
63
58
  end
59
+
60
+ private
61
+
62
+ attr_reader :feed_config, :channel_config, :global_config
64
63
  end
65
64
  end
@@ -3,12 +3,12 @@ require_relative 'item'
3
3
 
4
4
  module Html2rss
5
5
  class FeedBuilder
6
- attr_reader :config
7
-
8
- def initialize(feed_config)
9
- @config = feed_config
6
+ def initialize(config)
7
+ @config = config
10
8
  end
11
9
 
10
+ ##
11
+ # @return [RSS::Rss]
12
12
  def rss
13
13
  RSS::Maker.make('2.0') do |maker|
14
14
  add_channel_to_maker(maker)
@@ -21,9 +21,11 @@ module Html2rss
21
21
 
22
22
  private
23
23
 
24
+ attr_reader :config
25
+
24
26
  def add_channel_to_maker(maker)
25
27
  %i[language author title description link ttl].each do |attribute_name|
26
- maker.channel.send("#{attribute_name}=".to_sym, config.send(attribute_name))
28
+ maker.channel.public_send("#{attribute_name}=".to_sym, config.public_send(attribute_name))
27
29
  end
28
30
 
29
31
  maker.channel.generator = "html2rss V. #{::Html2rss::VERSION}"
@@ -39,7 +41,7 @@ module Html2rss
39
41
 
40
42
  items.new_item do |rss_item|
41
43
  feed_item.available_attributes.each do |attribute_name|
42
- rss_item.send("#{attribute_name}=".to_sym, feed_item.send(attribute_name))
44
+ rss_item.public_send("#{attribute_name}=".to_sym, feed_item.public_send(attribute_name))
43
45
  end
44
46
 
45
47
  feed_item.categories.each do |category|
@@ -2,18 +2,18 @@ require 'faraday'
2
2
  require 'faraday_middleware'
3
3
  require 'open-uri'
4
4
  require 'nokogiri'
5
- require_relative 'item_extractor'
5
+ require_relative 'item_extractors'
6
6
  require_relative 'attribute_post_processors'
7
7
 
8
8
  module Html2rss
9
9
  class Item
10
- attr_reader :xml, :config
11
-
12
10
  def initialize(xml, config)
13
11
  @xml = xml
14
12
  @config = config
15
13
  end
16
14
 
15
+ private_class_method :new
16
+
17
17
  def respond_to_missing?(method_name, _include_private = false)
18
18
  config.attribute_names.include?(method_name) || super
19
19
  end
@@ -22,9 +22,8 @@ module Html2rss
22
22
  attribute_config = config.options(method_name.to_s)
23
23
  return super unless attribute_config
24
24
 
25
- extractor = attribute_config['extractor'] || 'text'
26
- proc = ItemExtractor.const_get extractor.upcase.to_sym
27
- value = proc.call(xml, attribute_config)
25
+ extractor = ItemExtractors.get_extractor(attribute_config['extractor'])
26
+ value = extractor.new(xml, attribute_config).get
28
27
 
29
28
  post_process(value, attribute_config.fetch('post_process', false))
30
29
  end
@@ -38,17 +37,19 @@ module Html2rss
38
37
  [title.to_s, description.to_s].join('') != ''
39
38
  end
40
39
 
40
+ ##
41
+ # @return [Array]
41
42
  def categories
42
- config.categories.map(&method(:method_missing)).uniq.keep_if { |category| category.to_s != '' }
43
+ categories = config.categories
44
+ categories.map!(&method(:method_missing))
45
+ categories.uniq!
46
+ categories.keep_if { |category| category.to_s != '' }
43
47
  end
44
48
 
49
+ ##
50
+ # @return [Array]
45
51
  def self.from_url(url, config)
46
- connection = Faraday.new(url: url, headers: config.headers) { |faraday|
47
- faraday.use FaradayMiddleware::FollowRedirects
48
- faraday.adapter Faraday.default_adapter
49
- }
50
-
51
- page = Nokogiri::HTML(connection.get.body)
52
+ page = Nokogiri::HTML(get_body_from_url(url, config.headers))
52
53
  page.css(config.selector('items')).map do |xml_item|
53
54
  new xml_item, config
54
55
  end
@@ -56,14 +57,22 @@ module Html2rss
56
57
 
57
58
  private
58
59
 
59
- def post_process(value, post_process_options = [])
60
- return value unless post_process_options
60
+ def self.get_body_from_url(url, headers)
61
+ Faraday.new(url: url, headers: headers) do |faraday|
62
+ faraday.use FaradayMiddleware::FollowRedirects
63
+ faraday.adapter Faraday.default_adapter
64
+ end.get.body
65
+ end
66
+ private_class_method :get_body_from_url
61
67
 
62
- post_process_options = [post_process_options] unless post_process_options.is_a?(Array)
68
+ attr_reader :xml, :config
69
+
70
+ def post_process(value, post_process_options)
71
+ return value unless post_process_options
63
72
 
64
- post_process_options.each do |options|
65
- value = AttributePostProcessors.get_processor(options)
66
- .new(value, options, self)
73
+ [post_process_options].flatten.each do |options|
74
+ value = AttributePostProcessors.get_processor(options['name'])
75
+ .new(value, options: options, item: self, config: @config)
67
76
  .get
68
77
  end
69
78
 
@@ -0,0 +1,29 @@
1
+ require_relative 'item_extractors/attribute'
2
+ require_relative 'item_extractors/current_time'
3
+ require_relative 'item_extractors/href'
4
+ require_relative 'item_extractors/html'
5
+ require_relative 'item_extractors/static'
6
+ require_relative 'item_extractors/text'
7
+
8
+ module Html2rss
9
+ ##
10
+ # Provides a namespace for item extractors.
11
+ module ItemExtractors
12
+ DEFAULT = 'text'.freeze
13
+
14
+ def self.get_extractor(name)
15
+ name ||= DEFAULT
16
+ camel_cased_name = name.split('_').map(&:capitalize).join
17
+ class_name = ['Html2rss', 'ItemExtractors', camel_cased_name].join('::')
18
+
19
+ Object.const_get(class_name)
20
+ end
21
+
22
+ ##
23
+ # @return [Nokogiri::XML::Element]
24
+ def self.element(xml, options)
25
+ selector = options['selector']
26
+ selector ? xml.css(selector) : xml
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,37 @@
1
+ module Html2rss
2
+ module ItemExtractors
3
+ ##
4
+ # Returns the value of the attribute.
5
+ #
6
+ # Imagine this +time+ HTML element with a +datetime+ attribute:
7
+ #
8
+ # <time datetime="2019-07-01">...</time>
9
+ #
10
+ # YAML usage example:
11
+ #
12
+ # selectors:
13
+ # link:
14
+ # selector: time
15
+ # extractor: attribute
16
+ # attribute: datetime
17
+ #
18
+ # Would return:
19
+ # '2019-07-01'
20
+ #
21
+ # In case you're extracting a date or a time, do not forget to parse it
22
+ # during post processing with
23
+ # {AttributePostProcessors::ParseTime}[rdoc-ref:Html2rss::AttributePostProcessors::ParseTime].
24
+ class Attribute
25
+ def initialize(xml, options)
26
+ @options = options
27
+ @element = ItemExtractors.element(xml, options)
28
+ end
29
+
30
+ ##
31
+ # @return [String]
32
+ def get
33
+ @element.attr(@options['attribute']).to_s
34
+ end
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,21 @@
1
+ module Html2rss
2
+ module ItemExtractors
3
+ ##
4
+ # Returns the current Time.
5
+ #
6
+ # YAML usage example:
7
+ #
8
+ # selectors:
9
+ # updated:
10
+ # extractor: current_time
11
+ class CurrentTime
12
+ def initialize(_xml, _options); end
13
+
14
+ ##
15
+ # @return [Time]
16
+ def get
17
+ Time.new
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,36 @@
1
+ module Html2rss
2
+ module ItemExtractors
3
+ ##
4
+ # Returns the value of the +href+ attribute.
5
+ # It always returns absolute URLs. If the extracted +href+ value is a
6
+ # relative URL, it prepends the channel's URL.
7
+ #
8
+ # Imagine this +a+ HTML element with a +href+ attribute:
9
+ #
10
+ # <a href="/posts/latest-findings">...</a>
11
+ #
12
+ # YAML usage example:
13
+ # channel:
14
+ # url: http://blog-without-a-feed.example.com
15
+ # ...
16
+ # selectors:
17
+ # link:
18
+ # selector: a
19
+ # extractor: href
20
+ #
21
+ # Would return:
22
+ # 'http://blog-without-a-feed.example.com/posts/latest-findings'
23
+ class Href
24
+ def initialize(xml, options)
25
+ @options = options
26
+ element = ItemExtractors.element(xml, options)
27
+ @href = element.attr('href').to_s
28
+ end
29
+
30
+ # @return [URI::HTTPS, URI::HTTP]
31
+ def get
32
+ Html2rss::Utils.build_absolute_url_from_relative(@href, @options['channel']['url'])
33
+ end
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,34 @@
1
+ module Html2rss
2
+ module ItemExtractors
3
+ ##
4
+ # Return the HTML of the attribute.
5
+ #
6
+ # Imagine this HTML structure:
7
+ #
8
+ # <p>Lorem <b>ipsum</b> dolor ...</p>
9
+ #
10
+ # YAML usage example:
11
+ #
12
+ # selectors:
13
+ # description:
14
+ # selector: p
15
+ # extractor: html
16
+ #
17
+ # Would return:
18
+ # '<p>Lorem <b>ipsum</b> dolor ...</p>'
19
+ #
20
+ # Always make sure to sanitize the HTML during post processing with
21
+ # {AttributePostProcessors::SanitizeHtml}[rdoc-ref:Html2rss::AttributePostProcessors::SanitizeHtml].
22
+ class Html
23
+ def initialize(xml, options)
24
+ @element = ItemExtractors.element(xml, options)
25
+ end
26
+
27
+ ##
28
+ # @return [String]
29
+ def get
30
+ @element.to_s
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,28 @@
1
+ module Html2rss
2
+ module ItemExtractors
3
+ ##
4
+ # YAML usage example:
5
+ #
6
+ # selectors:
7
+ # author:
8
+ # extractor: static
9
+ # static: Foobar
10
+ #
11
+ # Would return:
12
+ # 'Foobar'
13
+ class Static
14
+ def initialize(_xml, options)
15
+ @options = options
16
+ end
17
+
18
+ # Returns what options['static'] holds.
19
+ #
20
+ # options = { static: 'Foobar' }
21
+ # Static.new(xml, options).get
22
+ # # => 'Foobar'
23
+ def get
24
+ @options['static']
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,32 @@
1
+ module Html2rss
2
+ module ItemExtractors
3
+ ##
4
+ # Returns the text of the attribute. This is the default extractor,
5
+ # used when no extractor is explicitly given.
6
+ #
7
+ # Imagine this HTML structure:
8
+ #
9
+ # <p>Lorem <b>ipsum</b> dolor ...</p>
10
+ #
11
+ # YAML usage example:
12
+ #
13
+ # selectors:
14
+ # description:
15
+ # selector: p
16
+ # extractor: text
17
+ #
18
+ # Would return:
19
+ # 'Lorem ipsum dolor ...'
20
+ class Text
21
+ def initialize(xml, options)
22
+ @element = ItemExtractors.element(xml, options)
23
+ end
24
+
25
+ ##
26
+ # @return [String]
27
+ def get
28
+ @element.text.to_s.strip.split.join(' ')
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,25 @@
1
+ require 'hashie'
2
+
3
+ module Html2rss
4
+ module Utils
5
+ ##
6
+ # A Hash with indifferent access, built with {https://github.com/intridea/hashie Hashie}.
7
+ class IndifferentAccessHash < Hash
8
+ include Hashie::Extensions::MergeInitializer
9
+ include Hashie::Extensions::IndifferentAccess
10
+ end
11
+
12
+ def self.build_absolute_url_from_relative(url, channel_url)
13
+ url = URI(url) if url.is_a?(String)
14
+
15
+ return url if url.absolute?
16
+
17
+ path, query = url.to_s.split('?')
18
+
19
+ URI(channel_url).tap do |uri|
20
+ uri.path = path.to_s.start_with?('/') ? path : "/#{path}"
21
+ uri.query = query
22
+ end
23
+ end
24
+ end
25
+ end
@@ -1,3 +1,3 @@
1
1
  module Html2rss
2
- VERSION = '0.3.3'.freeze
2
+ VERSION = '0.4.0'.freeze
3
3
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-07-01 00:00:00.000000000 Z
11
+ date: 2019-09-07 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: activesupport
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '5.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '5.0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: faraday
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -156,6 +170,20 @@ dependencies:
156
170
  - - ">="
157
171
  - !ruby/object:Gem::Version
158
172
  version: '0'
173
+ - !ruby/object:Gem::Dependency
174
+ name: rubocop-rspec
175
+ requirement: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
180
+ type: :development
181
+ prerelease: false
182
+ version_requirements: !ruby/object:Gem::Requirement
183
+ requirements:
184
+ - - ">="
185
+ - !ruby/object:Gem::Version
186
+ version: '0'
159
187
  - !ruby/object:Gem::Dependency
160
188
  name: simplecov
161
189
  requirement: !ruby/object:Gem::Requirement
@@ -184,6 +212,20 @@ dependencies:
184
212
  - - ">="
185
213
  - !ruby/object:Gem::Version
186
214
  version: '0'
215
+ - !ruby/object:Gem::Dependency
216
+ name: yard
217
+ requirement: !ruby/object:Gem::Requirement
218
+ requirements:
219
+ - - ">="
220
+ - !ruby/object:Gem::Version
221
+ version: '0'
222
+ type: :development
223
+ prerelease: false
224
+ version_requirements: !ruby/object:Gem::Requirement
225
+ requirements:
226
+ - - ">="
227
+ - !ruby/object:Gem::Version
228
+ version: '0'
187
229
  description: Give the URL to scrape and some CSS selectors. Get a RSS::Rss instance
188
230
  in return.
189
231
  email:
@@ -197,6 +239,7 @@ files:
197
239
  - ".rspec"
198
240
  - ".rubocop.yml"
199
241
  - ".travis.yml"
242
+ - ".yardopts"
200
243
  - CHANGELOG.md
201
244
  - Gemfile
202
245
  - Gemfile.lock
@@ -204,6 +247,39 @@ files:
204
247
  - README.md
205
248
  - bin/console
206
249
  - bin/setup
250
+ - docs/Html2rss.html
251
+ - docs/Html2rss/AttributePostProcessors.html
252
+ - docs/Html2rss/AttributePostProcessors/ParseTime.html
253
+ - docs/Html2rss/AttributePostProcessors/ParseUri.html
254
+ - docs/Html2rss/AttributePostProcessors/SanitizeHtml.html
255
+ - docs/Html2rss/AttributePostProcessors/Substring.html
256
+ - docs/Html2rss/AttributePostProcessors/Template.html
257
+ - docs/Html2rss/Config.html
258
+ - docs/Html2rss/FeedBuilder.html
259
+ - docs/Html2rss/Item.html
260
+ - docs/Html2rss/ItemExtractors.html
261
+ - docs/Html2rss/ItemExtractors/Attribute.html
262
+ - docs/Html2rss/ItemExtractors/CurrentTime.html
263
+ - docs/Html2rss/ItemExtractors/Href.html
264
+ - docs/Html2rss/ItemExtractors/Html.html
265
+ - docs/Html2rss/ItemExtractors/Static.html
266
+ - docs/Html2rss/ItemExtractors/Text.html
267
+ - docs/Html2rss/Utils.html
268
+ - docs/Html2rss/Utils/IndifferentAccessHash.html
269
+ - docs/_index.html
270
+ - docs/class_list.html
271
+ - docs/css/common.css
272
+ - docs/css/full_list.css
273
+ - docs/css/style.css
274
+ - docs/file.README.html
275
+ - docs/file_list.html
276
+ - docs/frames.html
277
+ - docs/index.html
278
+ - docs/js/app.js
279
+ - docs/js/full_list.js
280
+ - docs/js/jquery.js
281
+ - docs/method_list.html
282
+ - docs/top-level-namespace.html
207
283
  - html2rss.gemspec
208
284
  - lib/html2rss.rb
209
285
  - lib/html2rss/attribute_post_processors.rb
@@ -215,7 +291,14 @@ files:
215
291
  - lib/html2rss/config.rb
216
292
  - lib/html2rss/feed_builder.rb
217
293
  - lib/html2rss/item.rb
218
- - lib/html2rss/item_extractor.rb
294
+ - lib/html2rss/item_extractors.rb
295
+ - lib/html2rss/item_extractors/attribute.rb
296
+ - lib/html2rss/item_extractors/current_time.rb
297
+ - lib/html2rss/item_extractors/href.rb
298
+ - lib/html2rss/item_extractors/html.rb
299
+ - lib/html2rss/item_extractors/static.rb
300
+ - lib/html2rss/item_extractors/text.rb
301
+ - lib/html2rss/utils.rb
219
302
  - lib/html2rss/version.rb
220
303
  - support/logo.png
221
304
  homepage: https://github.com/gildesmarais/html2rss
@@ -238,7 +321,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
238
321
  - !ruby/object:Gem::Version
239
322
  version: '0'
240
323
  requirements: []
241
- rubygems_version: 3.0.4
324
+ rubyforge_project:
325
+ rubygems_version: 2.7.7
242
326
  signing_key:
243
327
  specification_version: 4
244
328
  summary: Returns an RSS::Rss object by scraping a URL.