html2rss 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +18 -11
  3. data/.travis.yml +3 -3
  4. data/.yardopts +6 -0
  5. data/Gemfile.lock +23 -5
  6. data/README.md +2 -1
  7. data/docs/Html2rss.html +353 -0
  8. data/docs/Html2rss/AttributePostProcessors.html +203 -0
  9. data/docs/Html2rss/AttributePostProcessors/ParseTime.html +332 -0
  10. data/docs/Html2rss/AttributePostProcessors/ParseUri.html +314 -0
  11. data/docs/Html2rss/AttributePostProcessors/SanitizeHtml.html +346 -0
  12. data/docs/Html2rss/AttributePostProcessors/Substring.html +321 -0
  13. data/docs/Html2rss/AttributePostProcessors/Template.html +336 -0
  14. data/docs/Html2rss/Config.html +795 -0
  15. data/docs/Html2rss/FeedBuilder.html +295 -0
  16. data/docs/Html2rss/Item.html +654 -0
  17. data/docs/Html2rss/ItemExtractors.html +297 -0
  18. data/docs/Html2rss/ItemExtractors/Attribute.html +317 -0
  19. data/docs/Html2rss/ItemExtractors/CurrentTime.html +297 -0
  20. data/docs/Html2rss/ItemExtractors/Href.html +319 -0
  21. data/docs/Html2rss/ItemExtractors/Html.html +314 -0
  22. data/docs/Html2rss/ItemExtractors/Static.html +301 -0
  23. data/docs/Html2rss/ItemExtractors/Text.html +312 -0
  24. data/docs/Html2rss/Utils.html +115 -0
  25. data/docs/Html2rss/Utils/IndifferentAccessHash.html +142 -0
  26. data/docs/_index.html +300 -0
  27. data/docs/class_list.html +51 -0
  28. data/docs/css/common.css +1 -0
  29. data/docs/css/full_list.css +58 -0
  30. data/docs/css/style.css +496 -0
  31. data/docs/file.README.html +135 -0
  32. data/docs/file_list.html +56 -0
  33. data/docs/frames.html +17 -0
  34. data/docs/index.html +135 -0
  35. data/docs/js/app.js +303 -0
  36. data/docs/js/full_list.js +216 -0
  37. data/docs/js/jquery.js +4 -0
  38. data/docs/method_list.html +435 -0
  39. data/docs/top-level-namespace.html +110 -0
  40. data/html2rss.gemspec +3 -0
  41. data/lib/html2rss.rb +19 -4
  42. data/lib/html2rss/attribute_post_processors.rb +5 -3
  43. data/lib/html2rss/attribute_post_processors/parse_time.rb +29 -3
  44. data/lib/html2rss/attribute_post_processors/parse_uri.rb +20 -1
  45. data/lib/html2rss/attribute_post_processors/sanitize_html.rb +65 -3
  46. data/lib/html2rss/attribute_post_processors/substring.rb +24 -3
  47. data/lib/html2rss/attribute_post_processors/template.rb +37 -10
  48. data/lib/html2rss/config.rb +11 -12
  49. data/lib/html2rss/feed_builder.rb +8 -6
  50. data/lib/html2rss/item.rb +28 -19
  51. data/lib/html2rss/item_extractors.rb +29 -0
  52. data/lib/html2rss/item_extractors/attribute.rb +37 -0
  53. data/lib/html2rss/item_extractors/current_time.rb +21 -0
  54. data/lib/html2rss/item_extractors/href.rb +36 -0
  55. data/lib/html2rss/item_extractors/html.rb +34 -0
  56. data/lib/html2rss/item_extractors/static.rb +28 -0
  57. data/lib/html2rss/item_extractors/text.rb +32 -0
  58. data/lib/html2rss/utils.rb +25 -0
  59. data/lib/html2rss/version.rb +1 -1
  60. metadata +88 -4
  61. data/lib/html2rss/item_extractor.rb +0 -37
@@ -1,18 +1,9 @@
1
- require 'hashie'
2
-
3
1
  module Html2rss
4
2
  class Config
5
- attr_reader :feed_config, :channel_config, :global_config
6
-
7
- class IndifferentAccessHash < Hash
8
- include Hashie::Extensions::MergeInitializer
9
- include Hashie::Extensions::IndifferentAccess
10
- end
11
-
12
3
  def initialize(feed_config, global_config = {})
13
- @global_config = IndifferentAccessHash.new global_config
14
- @feed_config = IndifferentAccessHash.new feed_config
15
- @channel_config = IndifferentAccessHash.new @feed_config.fetch('channel', {})
4
+ @global_config = Utils::IndifferentAccessHash.new global_config
5
+ @feed_config = Utils::IndifferentAccessHash.new feed_config
6
+ @channel_config = Utils::IndifferentAccessHash.new @feed_config.fetch('channel', {})
16
7
  end
17
8
 
18
9
  def author
@@ -40,6 +31,10 @@ module Html2rss
40
31
  end
41
32
  alias link url
42
33
 
34
+ def time_zone
35
+ channel_config.fetch 'time_zone', 'UTC'
36
+ end
37
+
43
38
  def headers
44
39
  global_config.fetch('headers', {})
45
40
  end
@@ -61,5 +56,9 @@ module Html2rss
61
56
  attribute_names.delete('items')
62
57
  attribute_names
63
58
  end
59
+
60
+ private
61
+
62
+ attr_reader :feed_config, :channel_config, :global_config
64
63
  end
65
64
  end
@@ -3,12 +3,12 @@ require_relative 'item'
3
3
 
4
4
  module Html2rss
5
5
  class FeedBuilder
6
- attr_reader :config
7
-
8
- def initialize(feed_config)
9
- @config = feed_config
6
+ def initialize(config)
7
+ @config = config
10
8
  end
11
9
 
10
+ ##
11
+ # @return [RSS::Rss]
12
12
  def rss
13
13
  RSS::Maker.make('2.0') do |maker|
14
14
  add_channel_to_maker(maker)
@@ -21,9 +21,11 @@ module Html2rss
21
21
 
22
22
  private
23
23
 
24
+ attr_reader :config
25
+
24
26
  def add_channel_to_maker(maker)
25
27
  %i[language author title description link ttl].each do |attribute_name|
26
- maker.channel.send("#{attribute_name}=".to_sym, config.send(attribute_name))
28
+ maker.channel.public_send("#{attribute_name}=".to_sym, config.public_send(attribute_name))
27
29
  end
28
30
 
29
31
  maker.channel.generator = "html2rss V. #{::Html2rss::VERSION}"
@@ -39,7 +41,7 @@ module Html2rss
39
41
 
40
42
  items.new_item do |rss_item|
41
43
  feed_item.available_attributes.each do |attribute_name|
42
- rss_item.send("#{attribute_name}=".to_sym, feed_item.send(attribute_name))
44
+ rss_item.public_send("#{attribute_name}=".to_sym, feed_item.public_send(attribute_name))
43
45
  end
44
46
 
45
47
  feed_item.categories.each do |category|
@@ -2,18 +2,18 @@ require 'faraday'
2
2
  require 'faraday_middleware'
3
3
  require 'open-uri'
4
4
  require 'nokogiri'
5
- require_relative 'item_extractor'
5
+ require_relative 'item_extractors'
6
6
  require_relative 'attribute_post_processors'
7
7
 
8
8
  module Html2rss
9
9
  class Item
10
- attr_reader :xml, :config
11
-
12
10
  def initialize(xml, config)
13
11
  @xml = xml
14
12
  @config = config
15
13
  end
16
14
 
15
+ private_class_method :new
16
+
17
17
  def respond_to_missing?(method_name, _include_private = false)
18
18
  config.attribute_names.include?(method_name) || super
19
19
  end
@@ -22,9 +22,8 @@ module Html2rss
22
22
  attribute_config = config.options(method_name.to_s)
23
23
  return super unless attribute_config
24
24
 
25
- extractor = attribute_config['extractor'] || 'text'
26
- proc = ItemExtractor.const_get extractor.upcase.to_sym
27
- value = proc.call(xml, attribute_config)
25
+ extractor = ItemExtractors.get_extractor(attribute_config['extractor'])
26
+ value = extractor.new(xml, attribute_config).get
28
27
 
29
28
  post_process(value, attribute_config.fetch('post_process', false))
30
29
  end
@@ -38,17 +37,19 @@ module Html2rss
38
37
  [title.to_s, description.to_s].join('') != ''
39
38
  end
40
39
 
40
+ ##
41
+ # @return [Array]
41
42
  def categories
42
- config.categories.map(&method(:method_missing)).uniq.keep_if { |category| category.to_s != '' }
43
+ categories = config.categories
44
+ categories.map!(&method(:method_missing))
45
+ categories.uniq!
46
+ categories.keep_if { |category| category.to_s != '' }
43
47
  end
44
48
 
49
+ ##
50
+ # @return [Array]
45
51
  def self.from_url(url, config)
46
- connection = Faraday.new(url: url, headers: config.headers) { |faraday|
47
- faraday.use FaradayMiddleware::FollowRedirects
48
- faraday.adapter Faraday.default_adapter
49
- }
50
-
51
- page = Nokogiri::HTML(connection.get.body)
52
+ page = Nokogiri::HTML(get_body_from_url(url, config.headers))
52
53
  page.css(config.selector('items')).map do |xml_item|
53
54
  new xml_item, config
54
55
  end
@@ -56,14 +57,22 @@ module Html2rss
56
57
 
57
58
  private
58
59
 
59
- def post_process(value, post_process_options = [])
60
- return value unless post_process_options
60
+ def self.get_body_from_url(url, headers)
61
+ Faraday.new(url: url, headers: headers) do |faraday|
62
+ faraday.use FaradayMiddleware::FollowRedirects
63
+ faraday.adapter Faraday.default_adapter
64
+ end.get.body
65
+ end
66
+ private_class_method :get_body_from_url
61
67
 
62
- post_process_options = [post_process_options] unless post_process_options.is_a?(Array)
68
+ attr_reader :xml, :config
69
+
70
+ def post_process(value, post_process_options)
71
+ return value unless post_process_options
63
72
 
64
- post_process_options.each do |options|
65
- value = AttributePostProcessors.get_processor(options)
66
- .new(value, options, self)
73
+ [post_process_options].flatten.each do |options|
74
+ value = AttributePostProcessors.get_processor(options['name'])
75
+ .new(value, options: options, item: self, config: @config)
67
76
  .get
68
77
  end
69
78
 
@@ -0,0 +1,29 @@
1
+ require_relative 'item_extractors/attribute'
2
+ require_relative 'item_extractors/current_time'
3
+ require_relative 'item_extractors/href'
4
+ require_relative 'item_extractors/html'
5
+ require_relative 'item_extractors/static'
6
+ require_relative 'item_extractors/text'
7
+
8
+ module Html2rss
9
+ ##
10
+ # Provides a namespace for item extractors.
11
+ module ItemExtractors
12
+ DEFAULT = 'text'.freeze
13
+
14
+ def self.get_extractor(name)
15
+ name ||= DEFAULT
16
+ camel_cased_name = name.split('_').map(&:capitalize).join
17
+ class_name = ['Html2rss', 'ItemExtractors', camel_cased_name].join('::')
18
+
19
+ Object.const_get(class_name)
20
+ end
21
+
22
+ ##
23
+ # @return [Nokogiri::XML::Element]
24
+ def self.element(xml, options)
25
+ selector = options['selector']
26
+ selector ? xml.css(selector) : xml
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,37 @@
1
+ module Html2rss
2
+ module ItemExtractors
3
+ ##
4
+ # Returns the value of the attribute.
5
+ #
6
+ # Imagine this +time+ HTML element with a +datetime+ attribute:
7
+ #
8
+ # <time datetime="2019-07-01">...</time>
9
+ #
10
+ # YAML usage example:
11
+ #
12
+ # selectors:
13
+ # link:
14
+ # selector: time
15
+ # extractor: attribute
16
+ # attribute: datetime
17
+ #
18
+ # Would return:
19
+ # '2019-07-01'
20
+ #
21
+ # In case you're extracting a date or a time, do not forget to parse it
22
+ # during post processing with
23
+ # {AttributePostProcessors::ParseTime}[rdoc-ref:Html2rss::AttributePostProcessors::ParseTime].
24
+ class Attribute
25
+ def initialize(xml, options)
26
+ @options = options
27
+ @element = ItemExtractors.element(xml, options)
28
+ end
29
+
30
+ ##
31
+ # @return [String]
32
+ def get
33
+ @element.attr(@options['attribute']).to_s
34
+ end
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,21 @@
1
+ module Html2rss
2
+ module ItemExtractors
3
+ ##
4
+ # Returns the current Time.
5
+ #
6
+ # YAML usage example:
7
+ #
8
+ # selectors:
9
+ # updated:
10
+ # extractor: current_time
11
+ class CurrentTime
12
+ def initialize(_xml, _options); end
13
+
14
+ ##
15
+ # @return [Time]
16
+ def get
17
+ Time.new
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,36 @@
1
+ module Html2rss
2
+ module ItemExtractors
3
+ ##
4
+ # Returns the value of the +href+ attribute.
5
+ # It always returns absolute URLs. If the extracted +href+ value is a
6
+ # relative URL, it prepends the channel's URL.
7
+ #
8
+ # Imagine this +a+ HTML element with a +href+ attribute:
9
+ #
10
+ # <a href="/posts/latest-findings">...</a>
11
+ #
12
+ # YAML usage example:
13
+ # channel:
14
+ # url: http://blog-without-a-feed.example.com
15
+ # ...
16
+ # selectors:
17
+ # link:
18
+ # selector: a
19
+ # extractor: href
20
+ #
21
+ # Would return:
22
+ # 'http://blog-without-a-feed.example.com/posts/latest-findings'
23
+ class Href
24
+ def initialize(xml, options)
25
+ @options = options
26
+ element = ItemExtractors.element(xml, options)
27
+ @href = element.attr('href').to_s
28
+ end
29
+
30
+ # @return [URI::HTTPS, URI::HTTP]
31
+ def get
32
+ Html2rss::Utils.build_absolute_url_from_relative(@href, @options['channel']['url'])
33
+ end
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,34 @@
1
+ module Html2rss
2
+ module ItemExtractors
3
+ ##
4
+ # Return the HTML of the attribute.
5
+ #
6
+ # Imagine this HTML structure:
7
+ #
8
+ # <p>Lorem <b>ipsum</b> dolor ...</p>
9
+ #
10
+ # YAML usage example:
11
+ #
12
+ # selectors:
13
+ # description:
14
+ # selector: p
15
+ # extractor: html
16
+ #
17
+ # Would return:
18
+ # '<p>Lorem <b>ipsum</b> dolor ...</p>'
19
+ #
20
+ # Always make sure to sanitize the HTML during post processing with
21
+ # {AttributePostProcessors::SanitizeHtml}[rdoc-ref:Html2rss::AttributePostProcessors::SanitizeHtml].
22
+ class Html
23
+ def initialize(xml, options)
24
+ @element = ItemExtractors.element(xml, options)
25
+ end
26
+
27
+ ##
28
+ # @return [String]
29
+ def get
30
+ @element.to_s
31
+ end
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,28 @@
1
+ module Html2rss
2
+ module ItemExtractors
3
+ ##
4
+ # YAML usage example:
5
+ #
6
+ # selectors:
7
+ # author:
8
+ # extractor: static
9
+ # static: Foobar
10
+ #
11
+ # Would return:
12
+ # 'Foobar'
13
+ class Static
14
+ def initialize(_xml, options)
15
+ @options = options
16
+ end
17
+
18
+ # Returns what options['static'] holds.
19
+ #
20
+ # options = { static: 'Foobar' }
21
+ # Static.new(xml, options).get
22
+ # # => 'Foobar'
23
+ def get
24
+ @options['static']
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,32 @@
1
+ module Html2rss
2
+ module ItemExtractors
3
+ ##
4
+ # Return the text of the attribute. This is the default extractor used,
5
+ # when no extractor is explicitly given.
6
+ #
7
+ # Imagine this HTML structure:
8
+ #
9
+ # <p>Lorem <b>ipsum</b> dolor ...</p>
10
+ #
11
+ # YAML usage example:
12
+ #
13
+ # selectors:
14
+ # description:
15
+ # selector: p
16
+ # extractor: text
17
+ #
18
+ # Would return:
19
+ # 'Lorem ipsum dolor ...'
20
+ class Text
21
+ def initialize(xml, options)
22
+ @element = ItemExtractors.element(xml, options)
23
+ end
24
+
25
+ ##
26
+ # @return [String]
27
+ def get
28
+ @element.text.to_s.strip.split.join(' ')
29
+ end
30
+ end
31
+ end
32
+ end
@@ -0,0 +1,25 @@
1
+ require 'hashie'
2
+
3
+ module Html2rss
4
+ module Utils
5
+ ##
6
+ # A Hash with indifferent access, built with {https://github.com/intridea/hashie Hashie}.
7
+ class IndifferentAccessHash < Hash
8
+ include Hashie::Extensions::MergeInitializer
9
+ include Hashie::Extensions::IndifferentAccess
10
+ end
11
+
12
+ def self.build_absolute_url_from_relative(url, channel_url)
13
+ url = URI(url) if url.is_a?(String)
14
+
15
+ return url if url.absolute?
16
+
17
+ path, query = url.to_s.split('?')
18
+
19
+ URI(channel_url).tap do |uri|
20
+ uri.path = path.to_s.start_with?('/') ? path : "/#{path}"
21
+ uri.query = query
22
+ end
23
+ end
24
+ end
25
+ end
@@ -1,3 +1,3 @@
1
1
  module Html2rss
2
- VERSION = '0.3.3'.freeze
2
+ VERSION = '0.4.0'.freeze
3
3
  end
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2rss
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gil Desmarais
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-07-01 00:00:00.000000000 Z
11
+ date: 2019-09-07 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: activesupport
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '5.0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '5.0'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: faraday
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -156,6 +170,20 @@ dependencies:
156
170
  - - ">="
157
171
  - !ruby/object:Gem::Version
158
172
  version: '0'
173
+ - !ruby/object:Gem::Dependency
174
+ name: rubocop-rspec
175
+ requirement: !ruby/object:Gem::Requirement
176
+ requirements:
177
+ - - ">="
178
+ - !ruby/object:Gem::Version
179
+ version: '0'
180
+ type: :development
181
+ prerelease: false
182
+ version_requirements: !ruby/object:Gem::Requirement
183
+ requirements:
184
+ - - ">="
185
+ - !ruby/object:Gem::Version
186
+ version: '0'
159
187
  - !ruby/object:Gem::Dependency
160
188
  name: simplecov
161
189
  requirement: !ruby/object:Gem::Requirement
@@ -184,6 +212,20 @@ dependencies:
184
212
  - - ">="
185
213
  - !ruby/object:Gem::Version
186
214
  version: '0'
215
+ - !ruby/object:Gem::Dependency
216
+ name: yard
217
+ requirement: !ruby/object:Gem::Requirement
218
+ requirements:
219
+ - - ">="
220
+ - !ruby/object:Gem::Version
221
+ version: '0'
222
+ type: :development
223
+ prerelease: false
224
+ version_requirements: !ruby/object:Gem::Requirement
225
+ requirements:
226
+ - - ">="
227
+ - !ruby/object:Gem::Version
228
+ version: '0'
187
229
  description: Give the URL to scrape and some CSS selectors. Get a RSS::Rss instance
188
230
  in return.
189
231
  email:
@@ -197,6 +239,7 @@ files:
197
239
  - ".rspec"
198
240
  - ".rubocop.yml"
199
241
  - ".travis.yml"
242
+ - ".yardopts"
200
243
  - CHANGELOG.md
201
244
  - Gemfile
202
245
  - Gemfile.lock
@@ -204,6 +247,39 @@ files:
204
247
  - README.md
205
248
  - bin/console
206
249
  - bin/setup
250
+ - docs/Html2rss.html
251
+ - docs/Html2rss/AttributePostProcessors.html
252
+ - docs/Html2rss/AttributePostProcessors/ParseTime.html
253
+ - docs/Html2rss/AttributePostProcessors/ParseUri.html
254
+ - docs/Html2rss/AttributePostProcessors/SanitizeHtml.html
255
+ - docs/Html2rss/AttributePostProcessors/Substring.html
256
+ - docs/Html2rss/AttributePostProcessors/Template.html
257
+ - docs/Html2rss/Config.html
258
+ - docs/Html2rss/FeedBuilder.html
259
+ - docs/Html2rss/Item.html
260
+ - docs/Html2rss/ItemExtractors.html
261
+ - docs/Html2rss/ItemExtractors/Attribute.html
262
+ - docs/Html2rss/ItemExtractors/CurrentTime.html
263
+ - docs/Html2rss/ItemExtractors/Href.html
264
+ - docs/Html2rss/ItemExtractors/Html.html
265
+ - docs/Html2rss/ItemExtractors/Static.html
266
+ - docs/Html2rss/ItemExtractors/Text.html
267
+ - docs/Html2rss/Utils.html
268
+ - docs/Html2rss/Utils/IndifferentAccessHash.html
269
+ - docs/_index.html
270
+ - docs/class_list.html
271
+ - docs/css/common.css
272
+ - docs/css/full_list.css
273
+ - docs/css/style.css
274
+ - docs/file.README.html
275
+ - docs/file_list.html
276
+ - docs/frames.html
277
+ - docs/index.html
278
+ - docs/js/app.js
279
+ - docs/js/full_list.js
280
+ - docs/js/jquery.js
281
+ - docs/method_list.html
282
+ - docs/top-level-namespace.html
207
283
  - html2rss.gemspec
208
284
  - lib/html2rss.rb
209
285
  - lib/html2rss/attribute_post_processors.rb
@@ -215,7 +291,14 @@ files:
215
291
  - lib/html2rss/config.rb
216
292
  - lib/html2rss/feed_builder.rb
217
293
  - lib/html2rss/item.rb
218
- - lib/html2rss/item_extractor.rb
294
+ - lib/html2rss/item_extractors.rb
295
+ - lib/html2rss/item_extractors/attribute.rb
296
+ - lib/html2rss/item_extractors/current_time.rb
297
+ - lib/html2rss/item_extractors/href.rb
298
+ - lib/html2rss/item_extractors/html.rb
299
+ - lib/html2rss/item_extractors/static.rb
300
+ - lib/html2rss/item_extractors/text.rb
301
+ - lib/html2rss/utils.rb
219
302
  - lib/html2rss/version.rb
220
303
  - support/logo.png
221
304
  homepage: https://github.com/gildesmarais/html2rss
@@ -238,7 +321,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
238
321
  - !ruby/object:Gem::Version
239
322
  version: '0'
240
323
  requirements: []
241
- rubygems_version: 3.0.4
324
+ rubyforge_project:
325
+ rubygems_version: 2.7.7
242
326
  signing_key:
243
327
  specification_version: 4
244
328
  summary: Returns an RSS::Rss object by scraping a URL.