feedzirra 0.7.1 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/Gemfile +0 -14
  4. data/README.md +2 -241
  5. data/feedzirra.gemspec +2 -8
  6. data/lib/feedzirra.rb +2 -15
  7. data/lib/feedzirra/version.rb +1 -1
  8. metadata +7 -182
  9. data/.rspec +0 -1
  10. data/.travis.yml +0 -8
  11. data/Guardfile +0 -5
  12. data/Rakefile +0 -6
  13. data/benchmarks/README.md +0 -90
  14. data/benchmarks/basic.rb +0 -31
  15. data/benchmarks/feed_list.txt +0 -10
  16. data/benchmarks/feed_xml/apple.xml +0 -149
  17. data/benchmarks/feed_xml/cnn.xml +0 -278
  18. data/benchmarks/feed_xml/daring_fireball.xml +0 -1697
  19. data/benchmarks/feed_xml/engadget.xml +0 -604
  20. data/benchmarks/feed_xml/feedzirra_commits.xml +0 -370
  21. data/benchmarks/feed_xml/gizmodo.xml +0 -2
  22. data/benchmarks/feed_xml/loop.xml +0 -441
  23. data/benchmarks/feed_xml/rails.xml +0 -1938
  24. data/benchmarks/feed_xml/white_house.xml +0 -951
  25. data/benchmarks/feed_xml/xkcd.xml +0 -2
  26. data/benchmarks/fetching_systems.rb +0 -23
  27. data/benchmarks/other_libraries.rb +0 -73
  28. data/lib/feedzirra/core_ext.rb +0 -3
  29. data/lib/feedzirra/core_ext/date.rb +0 -19
  30. data/lib/feedzirra/core_ext/string.rb +0 -9
  31. data/lib/feedzirra/core_ext/time.rb +0 -31
  32. data/lib/feedzirra/feed.rb +0 -459
  33. data/lib/feedzirra/feed_entry_utilities.rb +0 -66
  34. data/lib/feedzirra/feed_utilities.rb +0 -103
  35. data/lib/feedzirra/parser.rb +0 -20
  36. data/lib/feedzirra/parser/atom.rb +0 -61
  37. data/lib/feedzirra/parser/atom_entry.rb +0 -34
  38. data/lib/feedzirra/parser/atom_feed_burner.rb +0 -22
  39. data/lib/feedzirra/parser/atom_feed_burner_entry.rb +0 -35
  40. data/lib/feedzirra/parser/google_docs_atom.rb +0 -28
  41. data/lib/feedzirra/parser/google_docs_atom_entry.rb +0 -29
  42. data/lib/feedzirra/parser/itunes_rss.rb +0 -50
  43. data/lib/feedzirra/parser/itunes_rss_item.rb +0 -41
  44. data/lib/feedzirra/parser/itunes_rss_owner.rb +0 -12
  45. data/lib/feedzirra/parser/rss.rb +0 -24
  46. data/lib/feedzirra/parser/rss_entry.rb +0 -37
  47. data/lib/feedzirra/parser/rss_feed_burner.rb +0 -23
  48. data/lib/feedzirra/parser/rss_feed_burner_entry.rb +0 -43
  49. data/spec/feedzirra/feed_entry_utilities_spec.rb +0 -62
  50. data/spec/feedzirra/feed_spec.rb +0 -762
  51. data/spec/feedzirra/feed_utilities_spec.rb +0 -273
  52. data/spec/feedzirra/parser/atom_entry_spec.rb +0 -86
  53. data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +0 -47
  54. data/spec/feedzirra/parser/atom_feed_burner_spec.rb +0 -56
  55. data/spec/feedzirra/parser/atom_spec.rb +0 -76
  56. data/spec/feedzirra/parser/google_docs_atom_entry_spec.rb +0 -22
  57. data/spec/feedzirra/parser/google_docs_atom_spec.rb +0 -31
  58. data/spec/feedzirra/parser/itunes_rss_item_spec.rb +0 -63
  59. data/spec/feedzirra/parser/itunes_rss_owner_spec.rb +0 -18
  60. data/spec/feedzirra/parser/itunes_rss_spec.rb +0 -58
  61. data/spec/feedzirra/parser/rss_entry_spec.rb +0 -85
  62. data/spec/feedzirra/parser/rss_feed_burner_entry_spec.rb +0 -85
  63. data/spec/feedzirra/parser/rss_feed_burner_spec.rb +0 -57
  64. data/spec/feedzirra/parser/rss_spec.rb +0 -57
  65. data/spec/sample_feeds/AmazonWebServicesBlog.xml +0 -797
  66. data/spec/sample_feeds/AmazonWebServicesBlogFirstEntryContent.xml +0 -63
  67. data/spec/sample_feeds/AtomFeedWithSpacesAroundEquals.xml +0 -61
  68. data/spec/sample_feeds/FeedBurnerUrlNoAlternate.xml +0 -28
  69. data/spec/sample_feeds/GoogleDocsList.xml +0 -188
  70. data/spec/sample_feeds/HREFConsideredHarmful.xml +0 -314
  71. data/spec/sample_feeds/HREFConsideredHarmfulFirstEntry.xml +0 -22
  72. data/spec/sample_feeds/ITunesWithSpacesInAttributes.xml +0 -63
  73. data/spec/sample_feeds/PaulDixExplainsNothing.xml +0 -175
  74. data/spec/sample_feeds/PaulDixExplainsNothingAlternate.xml +0 -175
  75. data/spec/sample_feeds/PaulDixExplainsNothingFirstEntryContent.xml +0 -19
  76. data/spec/sample_feeds/PaulDixExplainsNothingWFW.xml +0 -174
  77. data/spec/sample_feeds/SamRuby.xml +0 -583
  78. data/spec/sample_feeds/TechCrunch.xml +0 -1515
  79. data/spec/sample_feeds/TechCrunchFirstEntry.xml +0 -9
  80. data/spec/sample_feeds/TechCrunchFirstEntryDescription.xml +0 -3
  81. data/spec/sample_feeds/TenderLovemaking.xml +0 -516
  82. data/spec/sample_feeds/TenderLovemakingFirstEntry.xml +0 -66
  83. data/spec/sample_feeds/TrotterCashionHome.xml +0 -611
  84. data/spec/sample_feeds/TypePadNews.xml +0 -368
  85. data/spec/sample_feeds/atom_with_link_tag_for_url_unmarked.xml +0 -31
  86. data/spec/sample_feeds/itunes.xml +0 -67
  87. data/spec/sample_feeds/pet_atom.xml +0 -497
  88. data/spec/spec_helper.rb +0 -88
@@ -1,2 +0,0 @@
1
- <?xml version="1.0" encoding="utf-8"?>
2
- <feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title>xkcd.com</title><link href="http://xkcd.com/" rel="alternate"></link><id>http://xkcd.com/</id><updated>2013-11-29T00:00:00Z</updated><entry><title>Oort Cloud</title><link href="http://xkcd.com/1297/" rel="alternate"></link><updated>2013-11-29T00:00:00Z</updated><id>http://xkcd.com/1297/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/oort_cloud.png" title="... I wanna try. Hang on, be right back." alt="... I wanna try. Hang on, be right back." /&gt;</summary></entry><entry><title>Git Commit</title><link href="http://xkcd.com/1296/" rel="alternate"></link><updated>2013-11-27T00:00:00Z</updated><id>http://xkcd.com/1296/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/git_commit.png" title="Merge branch 'asdfasjkfdlas/alkdjf' into sdkjfls-final" alt="Merge branch 'asdfasjkfdlas/alkdjf' into sdkjfls-final" /&gt;</summary></entry><entry><title>New Study</title><link href="http://xkcd.com/1295/" rel="alternate"></link><updated>2013-11-25T00:00:00Z</updated><id>http://xkcd.com/1295/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/new_study.png" title="When the results are published, no one will be sure whether to report on them again." alt="When the results are published, no one will be sure whether to report on them again." /&gt;</summary></entry><entry><title>Telescope Names</title><link href="http://xkcd.com/1294/" rel="alternate"></link><updated>2013-11-22T00:00:00Z</updated><id>http://xkcd.com/1294/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/telescope_names.png" title="The Thirty Meter Telescope will be renamed The Flesh-Searing Eye on the Volcano." alt="The Thirty Meter Telescope will be renamed The Flesh-Searing Eye on the Volcano." /&gt;</summary></entry></feed>
@@ -1,23 +0,0 @@
1
- require 'benchmark'
2
- require 'net/http'
3
- require 'curb'
4
-
5
- urls = ['http://www.google.com'] * 100
6
-
7
- Benchmark.bm(11) do |b|
8
- b.report('Net::HTTP') do
9
- urls.each do |url|
10
- Net::HTTP.get URI.parse url
11
- end
12
- end
13
-
14
- b.report('Curl::Easy') do
15
- urls.each do |url|
16
- Curl::Easy.perform url
17
- end
18
- end
19
-
20
- b.report('Curl::Multi') do
21
- Curl::Multi.get urls
22
- end
23
- end
@@ -1,73 +0,0 @@
1
- require 'benchmark'
2
- require 'feedzirra'
3
- require 'simple-rss'
4
- require 'feed-normalizer'
5
- require 'feed_me'
6
-
7
- iterations = 10
8
- urls = File.readlines(File.dirname(__FILE__) + '/feed_list.txt')
9
- files = Dir.glob(File.dirname(__FILE__) + '/feed_xml/*.xml')
10
- xmls = files.map { |file| File.open(file).read }
11
-
12
- # suppress warnings
13
- $VERBOSE = nil
14
-
15
- puts 'Parsing benchmarks'
16
-
17
- Benchmark.bm(15) do |b|
18
- b.report('feedzirra') do
19
- iterations.times do
20
- xmls.each { |xml| Feedzirra::Feed.parse xml }
21
- end
22
- end
23
-
24
- b.report('simple-rss') do
25
- iterations.times do
26
- xmls.each { |xml| SimpleRSS.parse xml }
27
- end
28
- end
29
-
30
- b.report('feed-normalizer') do
31
- iterations.times do
32
- xmls.each { |xml| FeedNormalizer::FeedNormalizer.parse xml }
33
- end
34
- end
35
-
36
- # incompatible with `ruby-feedparser`, same constant used
37
- require 'feed_parser'
38
- b.report('feed_parser') do
39
- iterations.times do
40
- xmls.each { |xml| FeedParser.new(feed_xml: xml).parse }
41
- end
42
- end
43
-
44
- b.report('feed_me') do
45
- iterations.times do
46
- xmls.each { |xml| FeedMe.parse xml }
47
- end
48
- end
49
-
50
- # incompatible with `feed_parser`, same constant used
51
- # require 'feedparser'
52
- # b.report('ruby-feedparser') do
53
- # iterations.times do
54
- # xmls.each { |xml| FeedParser::Feed::new xml }
55
- # end
56
- # end
57
- end
58
-
59
- puts "\nFetch and parse benchmarks"
60
-
61
- Benchmark.bm(15) do |b|
62
- b.report('feedzirra') do
63
- iterations.times { Feedzirra::Feed.fetch_and_parse urls }
64
- end
65
-
66
- # incompatible with `ruby-feedparser`, same constant used
67
- require 'feed_parser'
68
- b.report('feed_parser') do
69
- iterations.times do
70
- urls.each { |url| FeedParser.new(url: url).parse }
71
- end
72
- end
73
- end
@@ -1,3 +0,0 @@
1
- require "feedzirra/core_ext/time"
2
- require "feedzirra/core_ext/date"
3
- require "feedzirra/core_ext/string"
@@ -1,19 +0,0 @@
1
- # Date code pulled and adapted from:
2
- # Ruby Cookbook by Lucas Carlson and Leonard Richardson
3
- # Published by O'Reilly
4
- # ISBN: 0-596-52369-6
5
- class Date
6
- def feed_utils_to_gm_time
7
- feed_utils_to_time(new_offset, :gm)
8
- end
9
-
10
- def feed_utils_to_local_time
11
- feed_utils_to_time(new_offset(DateTime.now.offset-offset), :local)
12
- end
13
-
14
- private
15
- def feed_utils_to_time(dest, method)
16
- Time.send(method, dest.year, dest.month, dest.day, dest.hour, dest.min,
17
- dest.sec, dest.zone)
18
- end
19
- end
@@ -1,9 +0,0 @@
1
- class String
2
- def sanitize!
3
- self.replace(sanitize)
4
- end
5
-
6
- def sanitize
7
- Loofah.scrub_fragment(self, :prune).to_s
8
- end
9
- end
@@ -1,31 +0,0 @@
1
- require "time"
2
- require "date"
3
-
4
- class Time
5
- # Parse a time string and convert it to UTC without raising errors.
6
- # Parses a flattened 14-digit time (YYYYmmddHHMMSS) as UTC.
7
- #
8
- # === Parameters
9
- # [dt<String or Time>] Time definition to be parsed.
10
- #
11
- # === Returns
12
- # A Time instance in UTC or nil if there were errors while parsing.
13
- def self.parse_safely(dt)
14
- if dt
15
- case
16
- when dt.is_a?(Time)
17
- dt.utc
18
- when dt.respond_to?(:empty?) && dt.empty?
19
- nil
20
- when dt.respond_to?(:to_datetime)
21
- dt.to_datetime.utc
22
- when dt.to_s =~ /\A\d{14}\z/
23
- parse("#{dt.to_s}Z", true)
24
- else
25
- parse(dt.to_s, true).utc
26
- end
27
- end
28
- rescue StandardError
29
- nil
30
- end unless method_defined?(:parse_safely)
31
- end
@@ -1,459 +0,0 @@
1
- module Feedzirra
2
- class Feed
3
- USER_AGENT = "feedzirra http://github.com/pauldix/feedzirra/tree/master"
4
-
5
- # Passes raw XML and callbacks to a parser.
6
- # === Parameters
7
- # [parser<Object>] The parser to pass arguments to - must respond to
8
- # `parse` and should return a Feed object.
9
- # [xml<String>] The XML that you would like parsed.
10
- # === Returns
11
- # An instance of the parser feed type.
12
- def self.parse_with(parser, xml, &block)
13
- parser.parse xml, &block
14
- end
15
-
16
- # Takes a raw XML feed and attempts to parse it. If no parser is available a Feedzirra::NoParserAvailable exception is raised.
17
- # You can pass a block to be called when there's an error during the parsing.
18
- # === Parameters
19
- # [xml<String>] The XML that you would like parsed.
20
- # === Returns
21
- # An instance of the determined feed type. By default, one of these:
22
- # * Feedzirra::Parser::RSSFeedBurner
23
- # * Feedzirra::Parser::GoogleDocsAtom
24
- # * Feedzirra::Parser::AtomFeedBurner
25
- # * Feedzirra::Parser::Atom
26
- # * Feedzirra::Parser::ITunesRSS
27
- # * Feedzirra::Parser::RSS
28
- # === Raises
29
- # Feedzirra::NoParserAvailable : If no valid parser classes could be found for the feed.
30
- def self.parse(xml, &block)
31
- if parser = determine_feed_parser_for_xml(xml)
32
- parse_with parser, xml, &block
33
- else
34
- raise NoParserAvailable.new("No valid parser for XML.")
35
- end
36
- end
37
-
38
- # Determines the correct parser class to use for parsing the feed.
39
- #
40
- # === Parameters
41
- # [xml<String>] The XML that you would like determine the parser for.
42
- # === Returns
43
- # The class name of the parser that can handle the XML.
44
- def self.determine_feed_parser_for_xml(xml)
45
- start_of_doc = xml.slice(0, 2000)
46
- feed_classes.detect {|klass| klass.able_to_parse?(start_of_doc)}
47
- end
48
-
49
- # Adds a new feed parsing class that will be used for parsing.
50
- #
51
- # === Parameters
52
- # [klass<Constant>] The class/constant that you want to register.
53
- # === Returns
54
- # A updated array of feed parser class names.
55
- def self.add_feed_class(klass)
56
- feed_classes.unshift klass
57
- end
58
-
59
- # Provides a list of registered feed parsing classes.
60
- #
61
- # === Returns
62
- # A array of class names.
63
- def self.feed_classes
64
- @feed_classes ||= [
65
- Feedzirra::Parser::RSSFeedBurner,
66
- Feedzirra::Parser::GoogleDocsAtom,
67
- Feedzirra::Parser::AtomFeedBurner,
68
- Feedzirra::Parser::Atom,
69
- Feedzirra::Parser::ITunesRSS,
70
- Feedzirra::Parser::RSS
71
- ]
72
- end
73
-
74
- # Makes all registered feeds types look for the passed in element to parse.
75
- # This is actually just a call to element (a SAXMachine call) in the class.
76
- #
77
- # === Parameters
78
- # [element_tag<String>] The element tag
79
- # [options<Hash>] Valid keys are same as with SAXMachine
80
- def self.add_common_feed_element(element_tag, options = {})
81
- feed_classes.each do |k|
82
- k.element element_tag, options
83
- end
84
- end
85
-
86
- # Makes all registered feeds types look for the passed in elements to parse.
87
- # This is actually just a call to elements (a SAXMachine call) in the class.
88
- #
89
- # === Parameters
90
- # [element_tag<String>] The element tag
91
- # [options<Hash>] Valid keys are same as with SAXMachine
92
- def self.add_common_feed_elements(element_tag, options = {})
93
- feed_classes.each do |k|
94
- k.elements element_tag, options
95
- end
96
- end
97
-
98
- # Makes all registered entry types look for the passed in element to parse.
99
- # This is actually just a call to element (a SAXMachine call) in the class.
100
- #
101
- # === Parameters
102
- # [element_tag<String>]
103
- # [options<Hash>] Valid keys are same as with SAXMachine
104
- def self.add_common_feed_entry_element(element_tag, options = {})
105
- call_on_each_feed_entry :element, element_tag, options
106
- end
107
-
108
- # Makes all registered entry types look for the passed in elements to parse.
109
- # This is actually just a call to element (a SAXMachine call) in the class.
110
- #
111
- # === Parameters
112
- # [element_tag<String>]
113
- # [options<Hash>] Valid keys are same as with SAXMachine
114
- def self.add_common_feed_entry_elements(element_tag, options = {})
115
- call_on_each_feed_entry :elements, element_tag, options
116
- end
117
-
118
- # Call a method on all feed entries classes.
119
- #
120
- # === Parameters
121
- # [method<Symbol>] The method name
122
- # [parameters<Array>] The method parameters
123
- def self.call_on_each_feed_entry(method, *parameters)
124
- feed_classes.each do |k|
125
- # iterate on the collections defined in the sax collection
126
- k.sax_config.collection_elements.each_value do |vl|
127
- # vl is a list of CollectionConfig mapped to an attribute name
128
- # we'll look for the one set as 'entries' and add the new element
129
- vl.find_all{|v| (v.accessor == 'entries') && (v.data_class.class == Class)}.each do |v|
130
- v.data_class.send(method, *parameters)
131
- end
132
- end
133
- end
134
- end
135
-
136
- # Setup curl from options.
137
- # Possible parameters:
138
- # * :user_agent - overrides the default user agent.
139
- # * :compress - any value to enable compression
140
- # * :enable_cookies - boolean
141
- # * :cookiefile - file to read cookies
142
- # * :cookies - contents of cookies header
143
- # * :http_authentication - array containing username, then password
144
- # * :proxy_url - proxy url
145
- # * :proxy_port - proxy port
146
- # * :max_redirects - max number of redirections
147
- # * :timeout - timeout
148
- # * :ssl_verify_host - boolean
149
- # * :ssl_verify_peer - boolean
150
- # * :ssl_version - the ssl version to use, see OpenSSL::SSL::SSLContext::METHODS for options
151
- def self.setup_easy(curl, options={})
152
- curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
153
- curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
154
- curl.enable_cookies = options[:enable_cookies] if options.has_key?(:enable_cookies)
155
- curl.cookiefile = options[:cookiefile] if options.has_key?(:cookiefile)
156
- curl.cookies = options[:cookies] if options.has_key?(:cookies)
157
-
158
- curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
159
- curl.proxy_url = options[:proxy_url] if options.has_key?(:proxy_url)
160
- curl.proxy_port = options[:proxy_port] if options.has_key?(:proxy_port)
161
- curl.max_redirects = options[:max_redirects] if options[:max_redirects]
162
- curl.timeout = options[:timeout] if options[:timeout]
163
- curl.ssl_verify_host = options[:ssl_verify_host] if options.has_key?(:ssl_verify_host)
164
- curl.ssl_verify_peer = options[:ssl_verify_peer] if options.has_key?(:ssl_verify_peer)
165
- curl.ssl_version = options[:ssl_version] if options.has_key?(:ssl_version)
166
-
167
- curl.follow_location = true
168
- end
169
-
170
- # Fetches and returns the raw XML for each URL provided.
171
- #
172
- # === Parameters
173
- # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
174
- # [options<Hash>] Valid keys for this argument as as followed:
175
- # :if_modified_since - Time object representing when the feed was last updated.
176
- # :if_none_match - String that's normally an etag for the request that was stored previously.
177
- # :on_success - Block that gets executed after a successful request.
178
- # :on_failure - Block that gets executed after a failed request.
179
- # * all parameters defined in setup_easy
180
- # === Returns
181
- # A String of XML if a single URL is passed.
182
- #
183
- # A Hash if multiple URL's are passed. The key will be the URL, and the value the XML.
184
- def self.fetch_raw(urls, options = {})
185
- url_queue = [*urls]
186
- multi = Curl::Multi.new
187
- responses = {}
188
- url_queue.each do |url|
189
- easy = Curl::Easy.new(url) do |curl|
190
- setup_easy curl, options
191
-
192
- curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
193
- curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
194
-
195
- curl.on_success do |c|
196
- responses[url] = decode_content(c)
197
- end
198
-
199
- curl.on_complete do |c, err|
200
- responses[url] = c.response_code unless responses.has_key?(url)
201
- end
202
- end
203
- multi.add(easy)
204
- end
205
-
206
- multi.perform
207
- urls.is_a?(String) ? responses.values.first : responses
208
- end
209
-
210
- # Fetches and returns the parsed XML for each URL provided.
211
- #
212
- # === Parameters
213
- # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
214
- # [options<Hash>] Valid keys for this argument as as followed:
215
- # * :user_agent - String that overrides the default user agent.
216
- # * :if_modified_since - Time object representing when the feed was last updated.
217
- # * :if_none_match - String, an etag for the request that was stored previously.
218
- # * :on_success - Block that gets executed after a successful request.
219
- # * :on_failure - Block that gets executed after a failed request.
220
- # === Returns
221
- # A Feed object if a single URL is passed.
222
- #
223
- # A Hash if multiple URL's are passed. The key will be the URL, and the value the Feed object.
224
- def self.fetch_and_parse(urls, options = {})
225
- url_queue = [*urls]
226
- multi = Curl::Multi.new
227
- responses = {}
228
-
229
- # I broke these down so I would only try to do 30 simultaneously because
230
- # I was getting weird errors when doing a lot. As one finishes it pops another off the queue.
231
- url_queue.slice!(0, 30).each do |url|
232
- add_url_to_multi(multi, url, url_queue, responses, options)
233
- end
234
-
235
- multi.perform
236
- return urls.is_a?(String) ? responses.values.first : responses
237
- end
238
-
239
- # Decodes the XML document if it was compressed.
240
- #
241
- # === Parameters
242
- # [curl_request<Curl::Easy>] The Curl::Easy response object from the request.
243
- # === Returns
244
- # A decoded string of XML.
245
- def self.decode_content(c)
246
- if c.header_str.match(/Content-Encoding: gzip/i)
247
- begin
248
- gz = Zlib::GzipReader.new(StringIO.new(c.body_str))
249
- xml = gz.read
250
- gz.close
251
- rescue Zlib::GzipFile::Error
252
- # Maybe this is not gzipped?
253
- xml = c.body_str
254
- end
255
- elsif c.header_str.match(/Content-Encoding: deflate/i)
256
- xml = Zlib::Inflate.inflate(c.body_str)
257
- else
258
- xml = c.body_str
259
- end
260
-
261
- xml
262
- end
263
-
264
- # Updates each feed for each Feed object provided.
265
- #
266
- # === Parameters
267
- # [feeds<Feed> or <Array>] A single feed object, or an array of feed objects.
268
- # [options<Hash>] Valid keys for this argument as as followed:
269
- # * :on_success - Block that gets executed after a successful request.
270
- # * :on_failure - Block that gets executed after a failed request.
271
- # * all parameters defined in setup_easy
272
- # === Returns
273
- # A updated Feed object if a single URL is passed.
274
- #
275
- # A Hash if multiple Feeds are passed. The key will be the URL, and the value the updated Feed object.
276
- def self.update(feeds, options = {})
277
- feed_queue = [*feeds]
278
- multi = Curl::Multi.new
279
- responses = {}
280
-
281
- feed_queue.slice!(0, 30).each do |feed|
282
- add_feed_to_multi(multi, feed, feed_queue, responses, options)
283
- end
284
-
285
- multi.perform
286
- feeds.is_a?(Array) ? responses : responses.values.first
287
- end
288
-
289
- # An abstraction for adding a feed by URL to the passed Curb::multi stack.
290
- #
291
- # === Parameters
292
- # [multi<Curl::Multi>] The Curl::Multi object that the request should be added too.
293
- # [url<String>] The URL of the feed that you would like to be fetched.
294
- # [url_queue<Array>] An array of URLs that are queued for request.
295
- # [responses<Hash>] Existing responses that you want the response from the request added to.
296
- # [feeds<String> or <Array>] A single feed object, or an array of feed objects.
297
- # [options<Hash>] Valid keys for this argument as as followed:
298
- # * :on_success - Block that gets executed after a successful request.
299
- # * :on_failure - Block that gets executed after a failed request.
300
- # * all parameters defined in setup_easy
301
- # === Returns
302
- # The updated Curl::Multi object with the request details added to it's stack.
303
- def self.add_url_to_multi(multi, url, url_queue, responses, options)
304
- easy = Curl::Easy.new(url) do |curl|
305
- setup_easy curl, options
306
- curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
307
- curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
308
-
309
- curl.on_success do |c|
310
- xml = decode_content(c)
311
- klass = determine_feed_parser_for_xml(xml)
312
-
313
- if klass
314
- begin
315
- feed = parse_with klass, xml, &on_parser_failure(url)
316
-
317
- feed.feed_url = c.last_effective_url
318
- feed.etag = etag_from_header(c.header_str)
319
- feed.last_modified = last_modified_from_header(c.header_str)
320
- responses[url] = feed
321
- options[:on_success].call(url, feed) if options.has_key?(:on_success)
322
- rescue Exception => e
323
- call_on_failure(c, e, options[:on_failure])
324
- end
325
- else
326
- call_on_failure(c, "Can't determine a parser", options[:on_failure])
327
- end
328
- end
329
-
330
- #
331
- # trigger on_failure for 404s
332
- #
333
- curl.on_complete do |c|
334
- add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
335
- responses[url] = c.response_code unless responses.has_key?(url)
336
- end
337
-
338
- curl.on_redirect do |c|
339
- if c.response_code == 304 # it's not modified. this isn't an error condition
340
- options[:on_success].call(url, nil) if options.has_key?(:on_success)
341
- end
342
- end
343
-
344
- curl.on_missing do |c|
345
- if c.response_code == 404 && options.has_key?(:on_failure)
346
- call_on_failure(c, 'Server returned a 404', options[:on_failure])
347
- end
348
- end
349
-
350
- curl.on_failure do |c, err|
351
- responses[url] = c.response_code
352
- call_on_failure(c, err, options[:on_failure])
353
- end
354
- end
355
- multi.add(easy)
356
- end
357
-
358
- # An abstraction for adding a feed by a Feed object to the passed Curb::multi stack.
359
- #
360
- # === Parameters
361
- # [multi<Curl::Multi>] The Curl::Multi object that the request should be added too.
362
- # [feed<Feed>] A feed object that you would like to be fetched.
363
- # [url_queue<Array>] An array of feed objects that are queued for request.
364
- # [responses<Hash>] Existing responses that you want the response from the request added to.
365
- # [feeds<String>] or <Array> A single feed object, or an array of feed objects.
366
- # [options<Hash>] Valid keys for this argument as as followed:
367
- # * :on_success - Block that gets executed after a successful request.
368
- # * :on_failure - Block that gets executed after a failed request.
369
- # * all parameters defined in setup_easy
370
- # === Returns
371
- # The updated Curl::Multi object with the request details added to it's stack.
372
- def self.add_feed_to_multi(multi, feed, feed_queue, responses, options)
373
- easy = Curl::Easy.new(feed.feed_url) do |curl|
374
- setup_easy curl, options
375
- curl.headers["If-Modified-Since"] = feed.last_modified.httpdate if feed.last_modified
376
- curl.headers["If-Modified-Since"] = options[:if_modified_since] if options[:if_modified_since] && (!feed.last_modified || (Time.parse(options[:if_modified_since].to_s) > feed.last_modified))
377
- curl.headers["If-None-Match"] = feed.etag if feed.etag
378
-
379
- curl.on_success do |c|
380
- begin
381
- updated_feed = Feed.parse c.body_str, &on_parser_failure(feed.feed_url)
382
-
383
- updated_feed.feed_url = c.last_effective_url
384
- updated_feed.etag = etag_from_header(c.header_str)
385
- updated_feed.last_modified = last_modified_from_header(c.header_str)
386
- feed.update_from_feed(updated_feed)
387
- responses[feed.feed_url] = feed
388
- options[:on_success].call(feed) if options.has_key?(:on_success)
389
- rescue Exception => e
390
- call_on_failure(c, e, options[:on_failure])
391
- end
392
- end
393
-
394
- curl.on_failure do |c, err| # response code 50X
395
- responses[feed.feed_url] = c.response_code
396
- call_on_failure(c, 'Server returned a 404', options[:on_failure])
397
- end
398
-
399
- curl.on_redirect do |c, err| # response code 30X
400
- if c.response_code == 304
401
- options[:on_success].call(feed) if options.has_key?(:on_success)
402
- else
403
- responses[feed.feed_url] = c.response_code
404
- call_on_failure(c, err, options[:on_failure])
405
- end
406
- end
407
-
408
- curl.on_complete do |c|
409
- add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
410
- responses[feed.feed_url] = feed unless responses.has_key?(feed.feed_url)
411
- end
412
- end
413
- multi.add(easy)
414
- end
415
-
416
- # Determines the etag from the request headers.
417
- #
418
- # === Parameters
419
- # [header<String>] Raw request header returned from the request
420
- # === Returns
421
- # A string of the etag or nil if it cannot be found in the headers.
422
- def self.etag_from_header(header)
423
- header =~ /.*ETag:\s(.*)\r/
424
- $1
425
- end
426
-
427
- # Determines the last modified date from the request headers.
428
- #
429
- # === Parameters
430
- # [header<String>] Raw request header returned from the request
431
- # === Returns
432
- # A Time object of the last modified date or nil if it cannot be found in the headers.
433
- def self.last_modified_from_header(header)
434
- header =~ /.*Last-Modified:\s(.*)\r/
435
- Time.parse_safely($1) if $1
436
- end
437
-
438
- class << self
439
- private
440
-
441
- def on_parser_failure(url)
442
- Proc.new { |message| raise "Error while parsing [#{url}] #{message}" }
443
- end
444
-
445
- def call_on_failure(c, error, on_failure)
446
- if on_failure
447
- if on_failure.arity == 4
448
- warn 'on_failure proc with deprecated arity 4 should include a fifth parameter containing the error'
449
- on_failure.call(c.url, c.response_code, c.header_str, c.body_str)
450
- elsif on_failure.arity == 2
451
- on_failure.call(c, error)
452
- else
453
- warn "on_failure proc with invalid parameters number #{on_failure.arity} instead of 2, ignoring it"
454
- end
455
- end
456
- end
457
- end
458
- end
459
- end