feedzirra 0.7.1 → 0.8.0

This diff shows the contents of publicly released package versions as they were published to their respective public registries, and is provided for informational purposes only.
Files changed (88)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/Gemfile +0 -14
  4. data/README.md +2 -241
  5. data/feedzirra.gemspec +2 -8
  6. data/lib/feedzirra.rb +2 -15
  7. data/lib/feedzirra/version.rb +1 -1
  8. metadata +7 -182
  9. data/.rspec +0 -1
  10. data/.travis.yml +0 -8
  11. data/Guardfile +0 -5
  12. data/Rakefile +0 -6
  13. data/benchmarks/README.md +0 -90
  14. data/benchmarks/basic.rb +0 -31
  15. data/benchmarks/feed_list.txt +0 -10
  16. data/benchmarks/feed_xml/apple.xml +0 -149
  17. data/benchmarks/feed_xml/cnn.xml +0 -278
  18. data/benchmarks/feed_xml/daring_fireball.xml +0 -1697
  19. data/benchmarks/feed_xml/engadget.xml +0 -604
  20. data/benchmarks/feed_xml/feedzirra_commits.xml +0 -370
  21. data/benchmarks/feed_xml/gizmodo.xml +0 -2
  22. data/benchmarks/feed_xml/loop.xml +0 -441
  23. data/benchmarks/feed_xml/rails.xml +0 -1938
  24. data/benchmarks/feed_xml/white_house.xml +0 -951
  25. data/benchmarks/feed_xml/xkcd.xml +0 -2
  26. data/benchmarks/fetching_systems.rb +0 -23
  27. data/benchmarks/other_libraries.rb +0 -73
  28. data/lib/feedzirra/core_ext.rb +0 -3
  29. data/lib/feedzirra/core_ext/date.rb +0 -19
  30. data/lib/feedzirra/core_ext/string.rb +0 -9
  31. data/lib/feedzirra/core_ext/time.rb +0 -31
  32. data/lib/feedzirra/feed.rb +0 -459
  33. data/lib/feedzirra/feed_entry_utilities.rb +0 -66
  34. data/lib/feedzirra/feed_utilities.rb +0 -103
  35. data/lib/feedzirra/parser.rb +0 -20
  36. data/lib/feedzirra/parser/atom.rb +0 -61
  37. data/lib/feedzirra/parser/atom_entry.rb +0 -34
  38. data/lib/feedzirra/parser/atom_feed_burner.rb +0 -22
  39. data/lib/feedzirra/parser/atom_feed_burner_entry.rb +0 -35
  40. data/lib/feedzirra/parser/google_docs_atom.rb +0 -28
  41. data/lib/feedzirra/parser/google_docs_atom_entry.rb +0 -29
  42. data/lib/feedzirra/parser/itunes_rss.rb +0 -50
  43. data/lib/feedzirra/parser/itunes_rss_item.rb +0 -41
  44. data/lib/feedzirra/parser/itunes_rss_owner.rb +0 -12
  45. data/lib/feedzirra/parser/rss.rb +0 -24
  46. data/lib/feedzirra/parser/rss_entry.rb +0 -37
  47. data/lib/feedzirra/parser/rss_feed_burner.rb +0 -23
  48. data/lib/feedzirra/parser/rss_feed_burner_entry.rb +0 -43
  49. data/spec/feedzirra/feed_entry_utilities_spec.rb +0 -62
  50. data/spec/feedzirra/feed_spec.rb +0 -762
  51. data/spec/feedzirra/feed_utilities_spec.rb +0 -273
  52. data/spec/feedzirra/parser/atom_entry_spec.rb +0 -86
  53. data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +0 -47
  54. data/spec/feedzirra/parser/atom_feed_burner_spec.rb +0 -56
  55. data/spec/feedzirra/parser/atom_spec.rb +0 -76
  56. data/spec/feedzirra/parser/google_docs_atom_entry_spec.rb +0 -22
  57. data/spec/feedzirra/parser/google_docs_atom_spec.rb +0 -31
  58. data/spec/feedzirra/parser/itunes_rss_item_spec.rb +0 -63
  59. data/spec/feedzirra/parser/itunes_rss_owner_spec.rb +0 -18
  60. data/spec/feedzirra/parser/itunes_rss_spec.rb +0 -58
  61. data/spec/feedzirra/parser/rss_entry_spec.rb +0 -85
  62. data/spec/feedzirra/parser/rss_feed_burner_entry_spec.rb +0 -85
  63. data/spec/feedzirra/parser/rss_feed_burner_spec.rb +0 -57
  64. data/spec/feedzirra/parser/rss_spec.rb +0 -57
  65. data/spec/sample_feeds/AmazonWebServicesBlog.xml +0 -797
  66. data/spec/sample_feeds/AmazonWebServicesBlogFirstEntryContent.xml +0 -63
  67. data/spec/sample_feeds/AtomFeedWithSpacesAroundEquals.xml +0 -61
  68. data/spec/sample_feeds/FeedBurnerUrlNoAlternate.xml +0 -28
  69. data/spec/sample_feeds/GoogleDocsList.xml +0 -188
  70. data/spec/sample_feeds/HREFConsideredHarmful.xml +0 -314
  71. data/spec/sample_feeds/HREFConsideredHarmfulFirstEntry.xml +0 -22
  72. data/spec/sample_feeds/ITunesWithSpacesInAttributes.xml +0 -63
  73. data/spec/sample_feeds/PaulDixExplainsNothing.xml +0 -175
  74. data/spec/sample_feeds/PaulDixExplainsNothingAlternate.xml +0 -175
  75. data/spec/sample_feeds/PaulDixExplainsNothingFirstEntryContent.xml +0 -19
  76. data/spec/sample_feeds/PaulDixExplainsNothingWFW.xml +0 -174
  77. data/spec/sample_feeds/SamRuby.xml +0 -583
  78. data/spec/sample_feeds/TechCrunch.xml +0 -1515
  79. data/spec/sample_feeds/TechCrunchFirstEntry.xml +0 -9
  80. data/spec/sample_feeds/TechCrunchFirstEntryDescription.xml +0 -3
  81. data/spec/sample_feeds/TenderLovemaking.xml +0 -516
  82. data/spec/sample_feeds/TenderLovemakingFirstEntry.xml +0 -66
  83. data/spec/sample_feeds/TrotterCashionHome.xml +0 -611
  84. data/spec/sample_feeds/TypePadNews.xml +0 -368
  85. data/spec/sample_feeds/atom_with_link_tag_for_url_unmarked.xml +0 -31
  86. data/spec/sample_feeds/itunes.xml +0 -67
  87. data/spec/sample_feeds/pet_atom.xml +0 -497
  88. data/spec/spec_helper.rb +0 -88
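This release deletes the entire implementation: the Feed class, every parser, the core extensions, the specs, and the benchmark suite, while README.md (+2 -241) and lib/feedzirra.rb (+2 -15) are cut down to stubs. That shape is consistent with the project's rename to Feedjira. The two surviving lines of lib/feedzirra.rb are not shown in this diff; a transitional shim would plausibly look like the following hypothetical sketch (an assumption, not the published code):

    # Hypothetical shim (assumption -- the added lines are not shown in this diff):
    # the renamed gem supplies the implementation, and the legacy constant is
    # kept as an alias so existing Feedzirra-based code keeps working.
    require "feedjira"

    Feedzirra = Feedjira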
data/benchmarks/feed_xml/xkcd.xml
@@ -1,2 +0,0 @@
- <?xml version="1.0" encoding="utf-8"?>
- <feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title>xkcd.com</title><link href="http://xkcd.com/" rel="alternate"></link><id>http://xkcd.com/</id><updated>2013-11-29T00:00:00Z</updated><entry><title>Oort Cloud</title><link href="http://xkcd.com/1297/" rel="alternate"></link><updated>2013-11-29T00:00:00Z</updated><id>http://xkcd.com/1297/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/oort_cloud.png" title="... I wanna try. Hang on, be right back." alt="... I wanna try. Hang on, be right back." /&gt;</summary></entry><entry><title>Git Commit</title><link href="http://xkcd.com/1296/" rel="alternate"></link><updated>2013-11-27T00:00:00Z</updated><id>http://xkcd.com/1296/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/git_commit.png" title="Merge branch 'asdfasjkfdlas/alkdjf' into sdkjfls-final" alt="Merge branch 'asdfasjkfdlas/alkdjf' into sdkjfls-final" /&gt;</summary></entry><entry><title>New Study</title><link href="http://xkcd.com/1295/" rel="alternate"></link><updated>2013-11-25T00:00:00Z</updated><id>http://xkcd.com/1295/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/new_study.png" title="When the results are published, no one will be sure whether to report on them again." alt="When the results are published, no one will be sure whether to report on them again." /&gt;</summary></entry><entry><title>Telescope Names</title><link href="http://xkcd.com/1294/" rel="alternate"></link><updated>2013-11-22T00:00:00Z</updated><id>http://xkcd.com/1294/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/telescope_names.png" title="The Thirty Meter Telescope will be renamed The Flesh-Searing Eye on the Volcano." alt="The Thirty Meter Telescope will be renamed The Flesh-Searing Eye on the Volcano." /&gt;</summary></entry></feed>
data/benchmarks/fetching_systems.rb
@@ -1,23 +0,0 @@
- require 'benchmark'
- require 'net/http'
- require 'curb'
-
- urls = ['http://www.google.com'] * 100
-
- Benchmark.bm(11) do |b|
-   b.report('Net::HTTP') do
-     urls.each do |url|
-       Net::HTTP.get URI.parse url
-     end
-   end
-
-   b.report('Curl::Easy') do
-     urls.each do |url|
-       Curl::Easy.perform url
-     end
-   end
-
-   b.report('Curl::Multi') do
-     Curl::Multi.get urls
-   end
- end
data/benchmarks/other_libraries.rb
@@ -1,73 +0,0 @@
- require 'benchmark'
- require 'feedzirra'
- require 'simple-rss'
- require 'feed-normalizer'
- require 'feed_me'
-
- iterations = 10
- urls = File.readlines(File.dirname(__FILE__) + '/feed_list.txt')
- files = Dir.glob(File.dirname(__FILE__) + '/feed_xml/*.xml')
- xmls = files.map { |file| File.open(file).read }
-
- # suppress warnings
- $VERBOSE = nil
-
- puts 'Parsing benchmarks'
-
- Benchmark.bm(15) do |b|
-   b.report('feedzirra') do
-     iterations.times do
-       xmls.each { |xml| Feedzirra::Feed.parse xml }
-     end
-   end
-
-   b.report('simple-rss') do
-     iterations.times do
-       xmls.each { |xml| SimpleRSS.parse xml }
-     end
-   end
-
-   b.report('feed-normalizer') do
-     iterations.times do
-       xmls.each { |xml| FeedNormalizer::FeedNormalizer.parse xml }
-     end
-   end
-
-   # incompatible with `ruby-feedparser`, same constant used
-   require 'feed_parser'
-   b.report('feed_parser') do
-     iterations.times do
-       xmls.each { |xml| FeedParser.new(feed_xml: xml).parse }
-     end
-   end
-
-   b.report('feed_me') do
-     iterations.times do
-       xmls.each { |xml| FeedMe.parse xml }
-     end
-   end
-
-   # incompatible with `feed_parser`, same constant used
-   # require 'feedparser'
-   # b.report('ruby-feedparser') do
-   #   iterations.times do
-   #     xmls.each { |xml| FeedParser::Feed::new xml }
-   #   end
-   # end
- end
-
- puts "\nFetch and parse benchmarks"
-
- Benchmark.bm(15) do |b|
-   b.report('feedzirra') do
-     iterations.times { Feedzirra::Feed.fetch_and_parse urls }
-   end
-
-   # incompatible with `ruby-feedparser`, same constant used
-   require 'feed_parser'
-   b.report('feed_parser') do
-     iterations.times do
-       urls.each { |url| FeedParser.new(url: url).parse }
-     end
-   end
- end
data/lib/feedzirra/core_ext.rb
@@ -1,3 +0,0 @@
- require "feedzirra/core_ext/time"
- require "feedzirra/core_ext/date"
- require "feedzirra/core_ext/string"
data/lib/feedzirra/core_ext/date.rb
@@ -1,19 +0,0 @@
- # Date code pulled and adapted from:
- # Ruby Cookbook by Lucas Carlson and Leonard Richardson
- # Published by O'Reilly
- # ISBN: 0-596-52369-6
- class Date
-   def feed_utils_to_gm_time
-     feed_utils_to_time(new_offset, :gm)
-   end
-
-   def feed_utils_to_local_time
-     feed_utils_to_time(new_offset(DateTime.now.offset-offset), :local)
-   end
-
-   private
-   def feed_utils_to_time(dest, method)
-     Time.send(method, dest.year, dest.month, dest.day, dest.hour, dest.min,
-               dest.sec, dest.zone)
-   end
- end
data/lib/feedzirra/core_ext/string.rb
@@ -1,9 +0,0 @@
- class String
-   def sanitize!
-     self.replace(sanitize)
-   end
-
-   def sanitize
-     Loofah.scrub_fragment(self, :prune).to_s
-   end
- end
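The String extension above delegates to Loofah's :prune scrubber; note the file itself contains no require, so it relies on the gem loading Loofah elsewhere. A usage sketch of the same call, assuming the loofah gem is installed:

    require "loofah"

    html = '<p>Hello <script>alert("hi")</script>world</p>'
    # The :prune scrubber removes unsafe nodes along with their contents.
    Loofah.scrub_fragment(html, :prune).to_s
    # => "<p>Hello world</p>"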
data/lib/feedzirra/core_ext/time.rb
@@ -1,31 +0,0 @@
- require "time"
- require "date"
-
- class Time
-   # Parse a time string and convert it to UTC without raising errors.
-   # Parses a flattened 14-digit time (YYYYmmddHHMMSS) as UTC.
-   #
-   # === Parameters
-   # [dt<String or Time>] Time definition to be parsed.
-   #
-   # === Returns
-   # A Time instance in UTC or nil if there were errors while parsing.
-   def self.parse_safely(dt)
-     if dt
-       case
-       when dt.is_a?(Time)
-         dt.utc
-       when dt.respond_to?(:empty?) && dt.empty?
-         nil
-       when dt.respond_to?(:to_datetime)
-         dt.to_datetime.utc
-       when dt.to_s =~ /\A\d{14}\z/
-         parse("#{dt.to_s}Z", true)
-       else
-         parse(dt.to_s, true).utc
-       end
-     end
-   rescue StandardError
-     nil
-   end unless method_defined?(:parse_safely)
- end
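Time.parse_safely trades exceptions for nil and pins bare 14-digit timestamps to UTC. A quick usage sketch, assuming the extension above is loaded:

    Time.parse_safely("Fri, 29 Nov 2013 00:00:00 GMT") # => 2013-11-29 00:00:00 UTC
    Time.parse_safely("20131129000000")                # => 2013-11-29 00:00:00 UTC (14-digit form)
    Time.parse_safely("")                              # => nil (blank input)
    Time.parse_safely("not a date")                    # => nil (parse errors are swallowed)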
data/lib/feedzirra/feed.rb
@@ -1,459 +0,0 @@
- module Feedzirra
-   class Feed
-     USER_AGENT = "feedzirra http://github.com/pauldix/feedzirra/tree/master"
-
-     # Passes raw XML and callbacks to a parser.
-     # === Parameters
-     # [parser<Object>] The parser to pass arguments to - must respond to
-     # `parse` and should return a Feed object.
-     # [xml<String>] The XML that you would like parsed.
-     # === Returns
-     # An instance of the parser feed type.
-     def self.parse_with(parser, xml, &block)
-       parser.parse xml, &block
-     end
-
-     # Takes a raw XML feed and attempts to parse it. If no parser is available a Feedzirra::NoParserAvailable exception is raised.
-     # You can pass a block to be called when there's an error during the parsing.
-     # === Parameters
-     # [xml<String>] The XML that you would like parsed.
-     # === Returns
-     # An instance of the determined feed type. By default, one of these:
-     # * Feedzirra::Parser::RSSFeedBurner
-     # * Feedzirra::Parser::GoogleDocsAtom
-     # * Feedzirra::Parser::AtomFeedBurner
-     # * Feedzirra::Parser::Atom
-     # * Feedzirra::Parser::ITunesRSS
-     # * Feedzirra::Parser::RSS
-     # === Raises
-     # Feedzirra::NoParserAvailable : If no valid parser classes could be found for the feed.
-     def self.parse(xml, &block)
-       if parser = determine_feed_parser_for_xml(xml)
-         parse_with parser, xml, &block
-       else
-         raise NoParserAvailable.new("No valid parser for XML.")
-       end
-     end
-
-     # Determines the correct parser class to use for parsing the feed.
-     #
-     # === Parameters
-     # [xml<String>] The XML that you would like determine the parser for.
-     # === Returns
-     # The class name of the parser that can handle the XML.
-     def self.determine_feed_parser_for_xml(xml)
-       start_of_doc = xml.slice(0, 2000)
-       feed_classes.detect {|klass| klass.able_to_parse?(start_of_doc)}
-     end
-
-     # Adds a new feed parsing class that will be used for parsing.
-     #
-     # === Parameters
-     # [klass<Constant>] The class/constant that you want to register.
-     # === Returns
-     # A updated array of feed parser class names.
-     def self.add_feed_class(klass)
-       feed_classes.unshift klass
-     end
-
-     # Provides a list of registered feed parsing classes.
-     #
-     # === Returns
-     # A array of class names.
-     def self.feed_classes
-       @feed_classes ||= [
-         Feedzirra::Parser::RSSFeedBurner,
-         Feedzirra::Parser::GoogleDocsAtom,
-         Feedzirra::Parser::AtomFeedBurner,
-         Feedzirra::Parser::Atom,
-         Feedzirra::Parser::ITunesRSS,
-         Feedzirra::Parser::RSS
-       ]
-     end
-
-     # Makes all registered feeds types look for the passed in element to parse.
-     # This is actually just a call to element (a SAXMachine call) in the class.
-     #
-     # === Parameters
-     # [element_tag<String>] The element tag
-     # [options<Hash>] Valid keys are same as with SAXMachine
-     def self.add_common_feed_element(element_tag, options = {})
-       feed_classes.each do |k|
-         k.element element_tag, options
-       end
-     end
-
-     # Makes all registered feeds types look for the passed in elements to parse.
-     # This is actually just a call to elements (a SAXMachine call) in the class.
-     #
-     # === Parameters
-     # [element_tag<String>] The element tag
-     # [options<Hash>] Valid keys are same as with SAXMachine
-     def self.add_common_feed_elements(element_tag, options = {})
-       feed_classes.each do |k|
-         k.elements element_tag, options
-       end
-     end
-
-     # Makes all registered entry types look for the passed in element to parse.
-     # This is actually just a call to element (a SAXMachine call) in the class.
-     #
-     # === Parameters
-     # [element_tag<String>]
-     # [options<Hash>] Valid keys are same as with SAXMachine
-     def self.add_common_feed_entry_element(element_tag, options = {})
-       call_on_each_feed_entry :element, element_tag, options
-     end
-
-     # Makes all registered entry types look for the passed in elements to parse.
-     # This is actually just a call to element (a SAXMachine call) in the class.
-     #
-     # === Parameters
-     # [element_tag<String>]
-     # [options<Hash>] Valid keys are same as with SAXMachine
-     def self.add_common_feed_entry_elements(element_tag, options = {})
-       call_on_each_feed_entry :elements, element_tag, options
-     end
-
-     # Call a method on all feed entries classes.
-     #
-     # === Parameters
-     # [method<Symbol>] The method name
-     # [parameters<Array>] The method parameters
-     def self.call_on_each_feed_entry(method, *parameters)
-       feed_classes.each do |k|
-         # iterate on the collections defined in the sax collection
-         k.sax_config.collection_elements.each_value do |vl|
-           # vl is a list of CollectionConfig mapped to an attribute name
-           # we'll look for the one set as 'entries' and add the new element
-           vl.find_all{|v| (v.accessor == 'entries') && (v.data_class.class == Class)}.each do |v|
-             v.data_class.send(method, *parameters)
-           end
-         end
-       end
-     end
-
-     # Setup curl from options.
-     # Possible parameters:
-     # * :user_agent - overrides the default user agent.
-     # * :compress - any value to enable compression
-     # * :enable_cookies - boolean
-     # * :cookiefile - file to read cookies
-     # * :cookies - contents of cookies header
-     # * :http_authentication - array containing username, then password
-     # * :proxy_url - proxy url
-     # * :proxy_port - proxy port
-     # * :max_redirects - max number of redirections
-     # * :timeout - timeout
-     # * :ssl_verify_host - boolean
-     # * :ssl_verify_peer - boolean
-     # * :ssl_version - the ssl version to use, see OpenSSL::SSL::SSLContext::METHODS for options
-     def self.setup_easy(curl, options={})
-       curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
-       curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
-       curl.enable_cookies = options[:enable_cookies] if options.has_key?(:enable_cookies)
-       curl.cookiefile = options[:cookiefile] if options.has_key?(:cookiefile)
-       curl.cookies = options[:cookies] if options.has_key?(:cookies)
-
-       curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
-       curl.proxy_url = options[:proxy_url] if options.has_key?(:proxy_url)
-       curl.proxy_port = options[:proxy_port] if options.has_key?(:proxy_port)
-       curl.max_redirects = options[:max_redirects] if options[:max_redirects]
-       curl.timeout = options[:timeout] if options[:timeout]
-       curl.ssl_verify_host = options[:ssl_verify_host] if options.has_key?(:ssl_verify_host)
-       curl.ssl_verify_peer = options[:ssl_verify_peer] if options.has_key?(:ssl_verify_peer)
-       curl.ssl_version = options[:ssl_version] if options.has_key?(:ssl_version)
-
-       curl.follow_location = true
-     end
-
-     # Fetches and returns the raw XML for each URL provided.
-     #
-     # === Parameters
-     # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
-     # [options<Hash>] Valid keys for this argument as as followed:
-     #  :if_modified_since - Time object representing when the feed was last updated.
-     #  :if_none_match - String that's normally an etag for the request that was stored previously.
-     #  :on_success - Block that gets executed after a successful request.
-     #  :on_failure - Block that gets executed after a failed request.
-     # * all parameters defined in setup_easy
-     # === Returns
-     # A String of XML if a single URL is passed.
-     #
-     # A Hash if multiple URL's are passed. The key will be the URL, and the value the XML.
-     def self.fetch_raw(urls, options = {})
-       url_queue = [*urls]
-       multi = Curl::Multi.new
-       responses = {}
-       url_queue.each do |url|
-         easy = Curl::Easy.new(url) do |curl|
-           setup_easy curl, options
-
-           curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
-           curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
-
-           curl.on_success do |c|
-             responses[url] = decode_content(c)
-           end
-
-           curl.on_complete do |c, err|
-             responses[url] = c.response_code unless responses.has_key?(url)
-           end
-         end
-         multi.add(easy)
-       end
-
-       multi.perform
-       urls.is_a?(String) ? responses.values.first : responses
-     end
-
-     # Fetches and returns the parsed XML for each URL provided.
-     #
-     # === Parameters
-     # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
-     # [options<Hash>] Valid keys for this argument as as followed:
-     # * :user_agent - String that overrides the default user agent.
-     # * :if_modified_since - Time object representing when the feed was last updated.
-     # * :if_none_match - String, an etag for the request that was stored previously.
-     # * :on_success - Block that gets executed after a successful request.
-     # * :on_failure - Block that gets executed after a failed request.
-     # === Returns
-     # A Feed object if a single URL is passed.
-     #
-     # A Hash if multiple URL's are passed. The key will be the URL, and the value the Feed object.
-     def self.fetch_and_parse(urls, options = {})
-       url_queue = [*urls]
-       multi = Curl::Multi.new
-       responses = {}
-
-       # I broke these down so I would only try to do 30 simultaneously because
-       # I was getting weird errors when doing a lot. As one finishes it pops another off the queue.
-       url_queue.slice!(0, 30).each do |url|
-         add_url_to_multi(multi, url, url_queue, responses, options)
-       end
-
-       multi.perform
-       return urls.is_a?(String) ? responses.values.first : responses
-     end
-
-     # Decodes the XML document if it was compressed.
-     #
-     # === Parameters
-     # [curl_request<Curl::Easy>] The Curl::Easy response object from the request.
-     # === Returns
-     # A decoded string of XML.
-     def self.decode_content(c)
-       if c.header_str.match(/Content-Encoding: gzip/i)
-         begin
-           gz = Zlib::GzipReader.new(StringIO.new(c.body_str))
-           xml = gz.read
-           gz.close
-         rescue Zlib::GzipFile::Error
-           # Maybe this is not gzipped?
-           xml = c.body_str
-         end
-       elsif c.header_str.match(/Content-Encoding: deflate/i)
-         xml = Zlib::Inflate.inflate(c.body_str)
-       else
-         xml = c.body_str
-       end
-
-       xml
-     end
-
-     # Updates each feed for each Feed object provided.
-     #
-     # === Parameters
-     # [feeds<Feed> or <Array>] A single feed object, or an array of feed objects.
-     # [options<Hash>] Valid keys for this argument as as followed:
-     # * :on_success - Block that gets executed after a successful request.
-     # * :on_failure - Block that gets executed after a failed request.
-     # * all parameters defined in setup_easy
-     # === Returns
-     # A updated Feed object if a single URL is passed.
-     #
-     # A Hash if multiple Feeds are passed. The key will be the URL, and the value the updated Feed object.
-     def self.update(feeds, options = {})
-       feed_queue = [*feeds]
-       multi = Curl::Multi.new
-       responses = {}
-
-       feed_queue.slice!(0, 30).each do |feed|
-         add_feed_to_multi(multi, feed, feed_queue, responses, options)
-       end
-
-       multi.perform
-       feeds.is_a?(Array) ? responses : responses.values.first
-     end
-
-     # An abstraction for adding a feed by URL to the passed Curb::multi stack.
-     #
-     # === Parameters
-     # [multi<Curl::Multi>] The Curl::Multi object that the request should be added too.
-     # [url<String>] The URL of the feed that you would like to be fetched.
-     # [url_queue<Array>] An array of URLs that are queued for request.
-     # [responses<Hash>] Existing responses that you want the response from the request added to.
-     # [feeds<String> or <Array>] A single feed object, or an array of feed objects.
-     # [options<Hash>] Valid keys for this argument as as followed:
-     # * :on_success - Block that gets executed after a successful request.
-     # * :on_failure - Block that gets executed after a failed request.
-     # * all parameters defined in setup_easy
-     # === Returns
-     # The updated Curl::Multi object with the request details added to it's stack.
-     def self.add_url_to_multi(multi, url, url_queue, responses, options)
-       easy = Curl::Easy.new(url) do |curl|
-         setup_easy curl, options
-         curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
-         curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
-
-         curl.on_success do |c|
-           xml = decode_content(c)
-           klass = determine_feed_parser_for_xml(xml)
-
-           if klass
-             begin
-               feed = parse_with klass, xml, &on_parser_failure(url)
-
-               feed.feed_url = c.last_effective_url
-               feed.etag = etag_from_header(c.header_str)
-               feed.last_modified = last_modified_from_header(c.header_str)
-               responses[url] = feed
-               options[:on_success].call(url, feed) if options.has_key?(:on_success)
-             rescue Exception => e
-               call_on_failure(c, e, options[:on_failure])
-             end
-           else
-             call_on_failure(c, "Can't determine a parser", options[:on_failure])
-           end
-         end
-
-         #
-         # trigger on_failure for 404s
-         #
-         curl.on_complete do |c|
-           add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
-           responses[url] = c.response_code unless responses.has_key?(url)
-         end
-
-         curl.on_redirect do |c|
-           if c.response_code == 304 # it's not modified. this isn't an error condition
-             options[:on_success].call(url, nil) if options.has_key?(:on_success)
-           end
-         end
-
-         curl.on_missing do |c|
-           if c.response_code == 404 && options.has_key?(:on_failure)
-             call_on_failure(c, 'Server returned a 404', options[:on_failure])
-           end
-         end
-
-         curl.on_failure do |c, err|
-           responses[url] = c.response_code
-           call_on_failure(c, err, options[:on_failure])
-         end
-       end
-       multi.add(easy)
-     end
-
-     # An abstraction for adding a feed by a Feed object to the passed Curb::multi stack.
-     #
-     # === Parameters
-     # [multi<Curl::Multi>] The Curl::Multi object that the request should be added too.
-     # [feed<Feed>] A feed object that you would like to be fetched.
-     # [url_queue<Array>] An array of feed objects that are queued for request.
-     # [responses<Hash>] Existing responses that you want the response from the request added to.
-     # [feeds<String>] or <Array> A single feed object, or an array of feed objects.
-     # [options<Hash>] Valid keys for this argument as as followed:
-     # * :on_success - Block that gets executed after a successful request.
-     # * :on_failure - Block that gets executed after a failed request.
-     # * all parameters defined in setup_easy
-     # === Returns
-     # The updated Curl::Multi object with the request details added to it's stack.
-     def self.add_feed_to_multi(multi, feed, feed_queue, responses, options)
-       easy = Curl::Easy.new(feed.feed_url) do |curl|
-         setup_easy curl, options
-         curl.headers["If-Modified-Since"] = feed.last_modified.httpdate if feed.last_modified
-         curl.headers["If-Modified-Since"] = options[:if_modified_since] if options[:if_modified_since] && (!feed.last_modified || (Time.parse(options[:if_modified_since].to_s) > feed.last_modified))
-         curl.headers["If-None-Match"] = feed.etag if feed.etag
-
-         curl.on_success do |c|
-           begin
-             updated_feed = Feed.parse c.body_str, &on_parser_failure(feed.feed_url)
-
-             updated_feed.feed_url = c.last_effective_url
-             updated_feed.etag = etag_from_header(c.header_str)
-             updated_feed.last_modified = last_modified_from_header(c.header_str)
-             feed.update_from_feed(updated_feed)
-             responses[feed.feed_url] = feed
-             options[:on_success].call(feed) if options.has_key?(:on_success)
-           rescue Exception => e
-             call_on_failure(c, e, options[:on_failure])
-           end
-         end
-
-         curl.on_failure do |c, err| # response code 50X
-           responses[feed.feed_url] = c.response_code
-           call_on_failure(c, 'Server returned a 404', options[:on_failure])
-         end
-
-         curl.on_redirect do |c, err| # response code 30X
-           if c.response_code == 304
-             options[:on_success].call(feed) if options.has_key?(:on_success)
-           else
-             responses[feed.feed_url] = c.response_code
-             call_on_failure(c, err, options[:on_failure])
-           end
-         end
-
-         curl.on_complete do |c|
-           add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
-           responses[feed.feed_url] = feed unless responses.has_key?(feed.feed_url)
-         end
-       end
-       multi.add(easy)
-     end
-
-     # Determines the etag from the request headers.
-     #
-     # === Parameters
-     # [header<String>] Raw request header returned from the request
-     # === Returns
-     # A string of the etag or nil if it cannot be found in the headers.
-     def self.etag_from_header(header)
-       header =~ /.*ETag:\s(.*)\r/
-       $1
-     end
-
-     # Determines the last modified date from the request headers.
-     #
-     # === Parameters
-     # [header<String>] Raw request header returned from the request
-     # === Returns
-     # A Time object of the last modified date or nil if it cannot be found in the headers.
-     def self.last_modified_from_header(header)
-       header =~ /.*Last-Modified:\s(.*)\r/
-       Time.parse_safely($1) if $1
-     end
-
-     class << self
-       private
-
-       def on_parser_failure(url)
-         Proc.new { |message| raise "Error while parsing [#{url}] #{message}" }
-       end
-
-       def call_on_failure(c, error, on_failure)
-         if on_failure
-           if on_failure.arity == 4
-             warn 'on_failure proc with deprecated arity 4 should include a fifth parameter containing the error'
-             on_failure.call(c.url, c.response_code, c.header_str, c.body_str)
-           elsif on_failure.arity == 2
-             on_failure.call(c, error)
-           else
-             warn "on_failure proc with invalid parameters number #{on_failure.arity} instead of 2, ignoring it"
-           end
-         end
-       end
-     end
-   end
- end
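The Feed class above was the gem's entire public surface: parse sniffs the first 2000 bytes of the document to choose a parser, fetch_and_parse and update drive curb's Curl::Multi with at most 30 requests in flight, and the :on_success / :on_failure options feed the callbacks shown above. A usage sketch of that 0.7.x API (the URL and file name are placeholders):

    require "feedzirra" # the 0.7.x API removed by this release

    # Parse raw XML; raises Feedzirra::NoParserAvailable when no parser matches.
    feed = Feedzirra::Feed.parse(File.read("sample.xml"))
    feed.entries.each { |entry| puts entry.title }

    # Fetch and parse; a String argument returns a single feed object.
    feed = Feedzirra::Feed.fetch_and_parse(
      "http://example.com/feed.xml",
      timeout:    10,
      on_success: ->(url, parsed) { puts "fetched #{url}" },
      on_failure: ->(curl, error) { warn "failed: #{error}" } # arity 2, per call_on_failure
    )

    # Teach the entry class of every registered parser a new element (a SAXMachine call).
    Feedzirra::Feed.add_common_feed_entry_element("wfw:commentRss", as: :comment_rss)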