feedjira 0.9.0

Files changed (89)
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +8 -0
  5. data/CHANGELOG.md +162 -0
  6. data/Gemfile +17 -0
  7. data/Guardfile +5 -0
  8. data/README.md +242 -0
  9. data/Rakefile +6 -0
  10. data/benchmarks/README.md +90 -0
  11. data/benchmarks/basic.rb +31 -0
  12. data/benchmarks/feed_list.txt +10 -0
  13. data/benchmarks/feed_xml/apple.xml +149 -0
  14. data/benchmarks/feed_xml/cnn.xml +278 -0
  15. data/benchmarks/feed_xml/daring_fireball.xml +1697 -0
  16. data/benchmarks/feed_xml/engadget.xml +604 -0
  17. data/benchmarks/feed_xml/feedjira_commits.xml +370 -0
  18. data/benchmarks/feed_xml/gizmodo.xml +2 -0
  19. data/benchmarks/feed_xml/loop.xml +441 -0
  20. data/benchmarks/feed_xml/rails.xml +1938 -0
  21. data/benchmarks/feed_xml/white_house.xml +951 -0
  22. data/benchmarks/feed_xml/xkcd.xml +2 -0
  23. data/benchmarks/fetching_systems.rb +23 -0
  24. data/benchmarks/other_libraries.rb +73 -0
  25. data/feedjira.gemspec +27 -0
  26. data/lib/feedjira.rb +16 -0
  27. data/lib/feedjira/core_ext.rb +3 -0
  28. data/lib/feedjira/core_ext/date.rb +19 -0
  29. data/lib/feedjira/core_ext/string.rb +9 -0
  30. data/lib/feedjira/core_ext/time.rb +31 -0
  31. data/lib/feedjira/feed.rb +459 -0
  32. data/lib/feedjira/feed_entry_utilities.rb +66 -0
  33. data/lib/feedjira/feed_utilities.rb +103 -0
  34. data/lib/feedjira/parser.rb +20 -0
  35. data/lib/feedjira/parser/atom.rb +61 -0
  36. data/lib/feedjira/parser/atom_entry.rb +34 -0
  37. data/lib/feedjira/parser/atom_feed_burner.rb +22 -0
  38. data/lib/feedjira/parser/atom_feed_burner_entry.rb +35 -0
  39. data/lib/feedjira/parser/google_docs_atom.rb +28 -0
  40. data/lib/feedjira/parser/google_docs_atom_entry.rb +29 -0
  41. data/lib/feedjira/parser/itunes_rss.rb +50 -0
  42. data/lib/feedjira/parser/itunes_rss_item.rb +41 -0
  43. data/lib/feedjira/parser/itunes_rss_owner.rb +12 -0
  44. data/lib/feedjira/parser/rss.rb +24 -0
  45. data/lib/feedjira/parser/rss_entry.rb +37 -0
  46. data/lib/feedjira/parser/rss_feed_burner.rb +23 -0
  47. data/lib/feedjira/parser/rss_feed_burner_entry.rb +43 -0
  48. data/lib/feedjira/version.rb +3 -0
  49. data/spec/feedjira/feed_entry_utilities_spec.rb +62 -0
  50. data/spec/feedjira/feed_spec.rb +762 -0
  51. data/spec/feedjira/feed_utilities_spec.rb +273 -0
  52. data/spec/feedjira/parser/atom_entry_spec.rb +86 -0
  53. data/spec/feedjira/parser/atom_feed_burner_entry_spec.rb +47 -0
  54. data/spec/feedjira/parser/atom_feed_burner_spec.rb +56 -0
  55. data/spec/feedjira/parser/atom_spec.rb +76 -0
  56. data/spec/feedjira/parser/google_docs_atom_entry_spec.rb +22 -0
  57. data/spec/feedjira/parser/google_docs_atom_spec.rb +31 -0
  58. data/spec/feedjira/parser/itunes_rss_item_spec.rb +63 -0
  59. data/spec/feedjira/parser/itunes_rss_owner_spec.rb +18 -0
  60. data/spec/feedjira/parser/itunes_rss_spec.rb +58 -0
  61. data/spec/feedjira/parser/rss_entry_spec.rb +85 -0
  62. data/spec/feedjira/parser/rss_feed_burner_entry_spec.rb +85 -0
  63. data/spec/feedjira/parser/rss_feed_burner_spec.rb +57 -0
  64. data/spec/feedjira/parser/rss_spec.rb +57 -0
  65. data/spec/sample_feeds/AmazonWebServicesBlog.xml +797 -0
  66. data/spec/sample_feeds/AmazonWebServicesBlogFirstEntryContent.xml +63 -0
  67. data/spec/sample_feeds/AtomFeedWithSpacesAroundEquals.xml +61 -0
  68. data/spec/sample_feeds/FeedBurnerUrlNoAlternate.xml +28 -0
  69. data/spec/sample_feeds/GoogleDocsList.xml +188 -0
  70. data/spec/sample_feeds/HREFConsideredHarmful.xml +314 -0
  71. data/spec/sample_feeds/HREFConsideredHarmfulFirstEntry.xml +22 -0
  72. data/spec/sample_feeds/ITunesWithSpacesInAttributes.xml +63 -0
  73. data/spec/sample_feeds/PaulDixExplainsNothing.xml +175 -0
  74. data/spec/sample_feeds/PaulDixExplainsNothingAlternate.xml +175 -0
  75. data/spec/sample_feeds/PaulDixExplainsNothingFirstEntryContent.xml +19 -0
  76. data/spec/sample_feeds/PaulDixExplainsNothingWFW.xml +174 -0
  77. data/spec/sample_feeds/SamRuby.xml +583 -0
  78. data/spec/sample_feeds/TechCrunch.xml +1515 -0
  79. data/spec/sample_feeds/TechCrunchFirstEntry.xml +9 -0
  80. data/spec/sample_feeds/TechCrunchFirstEntryDescription.xml +3 -0
  81. data/spec/sample_feeds/TenderLovemaking.xml +516 -0
  82. data/spec/sample_feeds/TenderLovemakingFirstEntry.xml +66 -0
  83. data/spec/sample_feeds/TrotterCashionHome.xml +611 -0
  84. data/spec/sample_feeds/TypePadNews.xml +368 -0
  85. data/spec/sample_feeds/atom_with_link_tag_for_url_unmarked.xml +31 -0
  86. data/spec/sample_feeds/itunes.xml +67 -0
  87. data/spec/sample_feeds/pet_atom.xml +497 -0
  88. data/spec/spec_helper.rb +88 -0
  89. metadata +229 -0
data/benchmarks/feed_xml/xkcd.xml ADDED
@@ -0,0 +1,2 @@
+ <?xml version="1.0" encoding="utf-8"?>
+ <feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title>xkcd.com</title><link href="http://xkcd.com/" rel="alternate"></link><id>http://xkcd.com/</id><updated>2013-11-29T00:00:00Z</updated><entry><title>Oort Cloud</title><link href="http://xkcd.com/1297/" rel="alternate"></link><updated>2013-11-29T00:00:00Z</updated><id>http://xkcd.com/1297/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/oort_cloud.png" title="... I wanna try. Hang on, be right back." alt="... I wanna try. Hang on, be right back." /&gt;</summary></entry><entry><title>Git Commit</title><link href="http://xkcd.com/1296/" rel="alternate"></link><updated>2013-11-27T00:00:00Z</updated><id>http://xkcd.com/1296/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/git_commit.png" title="Merge branch 'asdfasjkfdlas/alkdjf' into sdkjfls-final" alt="Merge branch 'asdfasjkfdlas/alkdjf' into sdkjfls-final" /&gt;</summary></entry><entry><title>New Study</title><link href="http://xkcd.com/1295/" rel="alternate"></link><updated>2013-11-25T00:00:00Z</updated><id>http://xkcd.com/1295/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/new_study.png" title="When the results are published, no one will be sure whether to report on them again." alt="When the results are published, no one will be sure whether to report on them again." /&gt;</summary></entry><entry><title>Telescope Names</title><link href="http://xkcd.com/1294/" rel="alternate"></link><updated>2013-11-22T00:00:00Z</updated><id>http://xkcd.com/1294/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/telescope_names.png" title="The Thirty Meter Telescope will be renamed The Flesh-Searing Eye on the Volcano." alt="The Thirty Meter Telescope will be renamed The Flesh-Searing Eye on the Volcano." /&gt;</summary></entry></feed>
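As a quick check that this fixture exercises the Atom parser, a minimal sketch (the relative path assumes you run from the repository root):

    require 'feedjira'

    xml = File.read('benchmarks/feed_xml/xkcd.xml')
    feed = Feedjira::Feed.parse(xml)  # detected as Feedjira::Parser::Atom

    feed.title                # => "xkcd.com"
    feed.entries.first.title  # => "Oort Cloud"
    feed.entries.first.url    # => "http://xkcd.com/1297/"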
data/benchmarks/fetching_systems.rb ADDED
@@ -0,0 +1,23 @@
+ require 'benchmark'
+ require 'net/http'
+ require 'curb'
+
+ urls = ['http://www.google.com'] * 100
+
+ Benchmark.bm(11) do |b|
+   b.report('Net::HTTP') do
+     urls.each do |url|
+       Net::HTTP.get URI.parse url
+     end
+   end
+
+   b.report('Curl::Easy') do
+     urls.each do |url|
+       Curl::Easy.perform url
+     end
+   end
+
+   b.report('Curl::Multi') do
+     Curl::Multi.get urls
+   end
+ end
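Curl::Multi drives all transfers on a single multi handle instead of issuing one blocking request at a time, which is why it wins this benchmark and why feedjira does its own fetching through curb.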
data/benchmarks/other_libraries.rb ADDED
@@ -0,0 +1,73 @@
+ require 'benchmark'
+ require 'feedjira'
+ require 'simple-rss'
+ require 'feed-normalizer'
+ require 'feed_me'
+
+ iterations = 10
+ urls = File.readlines(File.dirname(__FILE__) + '/feed_list.txt')
+ files = Dir.glob(File.dirname(__FILE__) + '/feed_xml/*.xml')
+ xmls = files.map { |file| File.open(file).read }
+
+ # suppress warnings
+ $VERBOSE = nil
+
+ puts 'Parsing benchmarks'
+
+ Benchmark.bm(15) do |b|
+   b.report('feedjira') do
+     iterations.times do
+       xmls.each { |xml| Feedjira::Feed.parse xml }
+     end
+   end
+
+   b.report('simple-rss') do
+     iterations.times do
+       xmls.each { |xml| SimpleRSS.parse xml }
+     end
+   end
+
+   b.report('feed-normalizer') do
+     iterations.times do
+       xmls.each { |xml| FeedNormalizer::FeedNormalizer.parse xml }
+     end
+   end
+
+   # incompatible with `ruby-feedparser`, same constant used
+   require 'feed_parser'
+   b.report('feed_parser') do
+     iterations.times do
+       xmls.each { |xml| FeedParser.new(feed_xml: xml).parse }
+     end
+   end
+
+   b.report('feed_me') do
+     iterations.times do
+       xmls.each { |xml| FeedMe.parse xml }
+     end
+   end
+
+   # incompatible with `feed_parser`, same constant used
+   # require 'feedparser'
+   # b.report('ruby-feedparser') do
+   #   iterations.times do
+   #     xmls.each { |xml| FeedParser::Feed::new xml }
+   #   end
+   # end
+ end
+
+ puts "\nFetch and parse benchmarks"
+
+ Benchmark.bm(15) do |b|
+   b.report('feedjira') do
+     iterations.times { Feedjira::Feed.fetch_and_parse urls }
+   end
+
+   # incompatible with `ruby-feedparser`, same constant used
+   require 'feed_parser'
+   b.report('feed_parser') do
+     iterations.times do
+       urls.each { |url| FeedParser.new(url: url).parse }
+     end
+   end
+ end
data/feedjira.gemspec ADDED
@@ -0,0 +1,27 @@
+ # -*- encoding: utf-8 -*-
+ require File.expand_path('../lib/feedjira/version', __FILE__)
+
+ Gem::Specification.new do |s|
+   s.name = 'feedjira'
+   s.version = Feedjira::VERSION
+   s.license = 'MIT'
+
+   s.authors = ['Paul Dix', 'Julien Kirch', 'Ezekiel Templin', 'Jon Allured']
+   s.email = 'feedjira@gmail.com'
+   s.homepage = 'http://feedjira.com'
+
+   s.summary = 'A feed fetching and parsing library'
+   s.description = 'A library designed to retrieve and parse feeds as quickly as possible'
+
+   s.files = `git ls-files`.split("\n")
+   s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+   s.require_paths = ['lib']
+
+   s.platform = Gem::Platform::RUBY
+
+   s.add_dependency 'sax-machine', '~> 0.2.1'
+   s.add_dependency 'curb', '~> 0.8.1'
+   s.add_dependency 'loofah', '~> 1.2.1'
+
+   s.add_development_dependency 'rspec', '~> 2.14.0'
+ end
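To consume this release, a Gemfile entry pinned to this version is enough (the exact pin is illustrative):

    # Gemfile
    source 'https://rubygems.org'

    gem 'feedjira', '0.9.0'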
data/lib/feedjira.rb ADDED
@@ -0,0 +1,16 @@
+ require 'zlib'
+ require 'curb'
+ require 'sax-machine'
+ require 'loofah'
+
+ require 'feedjira/core_ext'
+ require 'feedjira/version'
+
+ module Feedjira
+   autoload :FeedEntryUtilities, 'feedjira/feed_entry_utilities'
+   autoload :FeedUtilities, 'feedjira/feed_utilities'
+   autoload :Feed, 'feedjira/feed'
+   autoload :Parser, 'feedjira/parser'
+
+   class NoParserAvailable < StandardError; end
+ end
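The parsers are autoloaded, so a single require is enough; Feedjira::NoParserAvailable is what to rescue when the XML matches no registered parser. A minimal sketch:

    require 'feedjira'

    begin
      Feedjira::Feed.parse('<html>not a feed</html>')
    rescue Feedjira::NoParserAvailable => e
      puts e.message  # => "No valid parser for XML."
    end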
data/lib/feedjira/core_ext.rb ADDED
@@ -0,0 +1,3 @@
+ require "feedjira/core_ext/time"
+ require "feedjira/core_ext/date"
+ require "feedjira/core_ext/string"
data/lib/feedjira/core_ext/date.rb ADDED
@@ -0,0 +1,19 @@
+ # Date code pulled and adapted from:
+ # Ruby Cookbook by Lucas Carlson and Leonard Richardson
+ # Published by O'Reilly
+ # ISBN: 0-596-52369-6
+ class Date
+   def feed_utils_to_gm_time
+     feed_utils_to_time(new_offset, :gm)
+   end
+
+   def feed_utils_to_local_time
+     feed_utils_to_time(new_offset(DateTime.now.offset - offset), :local)
+   end
+
+   private
+   def feed_utils_to_time(dest, method)
+     Time.send(method, dest.year, dest.month, dest.day, dest.hour, dest.min,
+       dest.sec, dest.zone)
+   end
+ end
data/lib/feedjira/core_ext/string.rb ADDED
@@ -0,0 +1,9 @@
+ class String
+   def sanitize!
+     self.replace(sanitize)
+   end
+
+   def sanitize
+     Loofah.scrub_fragment(self, :prune).to_s
+   end
+ end
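String#sanitize delegates to Loofah's :prune scrubber, which removes unsafe nodes together with their children; sanitize! does the same in place via String#replace. A small sketch:

    require 'feedjira'

    html = '<p>Hello</p><script>alert("xss")</script>'
    html.sanitize   # => "<p>Hello</p>"
    html.sanitize!  # the receiver itself now holds the scrubbed markup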
data/lib/feedjira/core_ext/time.rb ADDED
@@ -0,0 +1,31 @@
+ require "time"
+ require "date"
+
+ class Time
+   # Parse a time string and convert it to UTC without raising errors.
+   # Parses a flattened 14-digit time (YYYYmmddHHMMSS) as UTC.
+   #
+   # === Parameters
+   # [dt<String or Time>] Time definition to be parsed.
+   #
+   # === Returns
+   # A Time instance in UTC or nil if there were errors while parsing.
+   def self.parse_safely(dt)
+     if dt
+       case
+       when dt.is_a?(Time)
+         dt.utc
+       when dt.respond_to?(:empty?) && dt.empty?
+         nil
+       when dt.respond_to?(:to_datetime)
+         dt.to_datetime.utc
+       when dt.to_s =~ /\A\d{14}\z/
+         parse("#{dt.to_s}Z", true)
+       else
+         parse(dt.to_s, true).utc
+       end
+     end
+   rescue StandardError
+     nil
+   end unless method_defined?(:parse_safely)
+ end
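Time.parse_safely is the forgiving entry point the parsers rely on: it returns nil instead of raising and reads a bare 14-digit stamp as UTC. A small sketch:

    require 'feedjira'

    Time.parse_safely('Fri, 29 Nov 2013 00:00:00 GMT')  # => 2013-11-29 00:00:00 UTC
    Time.parse_safely('20131129000000')                 # 14-digit form, read as UTC
    Time.parse_safely('not a date')                     # => nil
    Time.parse_safely('')                               # => nil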
data/lib/feedjira/feed.rb ADDED
@@ -0,0 +1,459 @@
+ module Feedjira
+   class Feed
+     USER_AGENT = 'feedjira http://feedjira.com'
+
+     # Passes raw XML and callbacks to a parser.
+     # === Parameters
+     # [parser<Object>] The parser to pass arguments to - must respond to
+     # `parse` and should return a Feed object.
+     # [xml<String>] The XML that you would like parsed.
+     # === Returns
+     # An instance of the parser feed type.
+     def self.parse_with(parser, xml, &block)
+       parser.parse xml, &block
+     end
+
+     # Takes a raw XML feed and attempts to parse it. If no parser is available a Feedjira::NoParserAvailable exception is raised.
+     # You can pass a block to be called when there's an error during the parsing.
+     # === Parameters
+     # [xml<String>] The XML that you would like parsed.
+     # === Returns
+     # An instance of the determined feed type. By default, one of these:
+     # * Feedjira::Parser::RSSFeedBurner
+     # * Feedjira::Parser::GoogleDocsAtom
+     # * Feedjira::Parser::AtomFeedBurner
+     # * Feedjira::Parser::Atom
+     # * Feedjira::Parser::ITunesRSS
+     # * Feedjira::Parser::RSS
+     # === Raises
+     # Feedjira::NoParserAvailable : If no valid parser classes could be found for the feed.
+     def self.parse(xml, &block)
+       if parser = determine_feed_parser_for_xml(xml)
+         parse_with parser, xml, &block
+       else
+         raise NoParserAvailable.new("No valid parser for XML.")
+       end
+     end
+
+     # Determines the correct parser class to use for parsing the feed.
+     #
+     # === Parameters
+     # [xml<String>] The XML that you would like to determine the parser for.
+     # === Returns
+     # The class name of the parser that can handle the XML.
+     def self.determine_feed_parser_for_xml(xml)
+       start_of_doc = xml.slice(0, 2000)
+       feed_classes.detect { |klass| klass.able_to_parse?(start_of_doc) }
+     end
+
+     # Adds a new feed parsing class that will be used for parsing.
+     #
+     # === Parameters
+     # [klass<Constant>] The class/constant that you want to register.
+     # === Returns
+     # An updated array of feed parser class names.
+     def self.add_feed_class(klass)
+       feed_classes.unshift klass
+     end
+
+     # Provides a list of registered feed parsing classes.
+     #
+     # === Returns
+     # An array of class names.
+     def self.feed_classes
+       @feed_classes ||= [
+         Feedjira::Parser::RSSFeedBurner,
+         Feedjira::Parser::GoogleDocsAtom,
+         Feedjira::Parser::AtomFeedBurner,
+         Feedjira::Parser::Atom,
+         Feedjira::Parser::ITunesRSS,
+         Feedjira::Parser::RSS
+       ]
+     end
+
+     # Makes all registered feed types look for the passed in element to parse.
+     # This is actually just a call to element (a SAXMachine call) in the class.
+     #
+     # === Parameters
+     # [element_tag<String>] The element tag
+     # [options<Hash>] Valid keys are same as with SAXMachine
+     def self.add_common_feed_element(element_tag, options = {})
+       feed_classes.each do |k|
+         k.element element_tag, options
+       end
+     end
+
+     # Makes all registered feed types look for the passed in elements to parse.
+     # This is actually just a call to elements (a SAXMachine call) in the class.
+     #
+     # === Parameters
+     # [element_tag<String>] The element tag
+     # [options<Hash>] Valid keys are same as with SAXMachine
+     def self.add_common_feed_elements(element_tag, options = {})
+       feed_classes.each do |k|
+         k.elements element_tag, options
+       end
+     end
+
+     # Makes all registered entry types look for the passed in element to parse.
+     # This is actually just a call to element (a SAXMachine call) in the class.
+     #
+     # === Parameters
+     # [element_tag<String>]
+     # [options<Hash>] Valid keys are same as with SAXMachine
+     def self.add_common_feed_entry_element(element_tag, options = {})
+       call_on_each_feed_entry :element, element_tag, options
+     end
+
+     # Makes all registered entry types look for the passed in elements to parse.
+     # This is actually just a call to elements (a SAXMachine call) in the class.
+     #
+     # === Parameters
+     # [element_tag<String>]
+     # [options<Hash>] Valid keys are same as with SAXMachine
+     def self.add_common_feed_entry_elements(element_tag, options = {})
+       call_on_each_feed_entry :elements, element_tag, options
+     end
+
+     # Call a method on all feed entry classes.
+     #
+     # === Parameters
+     # [method<Symbol>] The method name
+     # [parameters<Array>] The method parameters
+     def self.call_on_each_feed_entry(method, *parameters)
+       feed_classes.each do |k|
+         # iterate on the collections defined in the sax collection
+         k.sax_config.collection_elements.each_value do |vl|
+           # vl is a list of CollectionConfig mapped to an attribute name
+           # we'll look for the one set as 'entries' and add the new element
+           vl.find_all { |v| (v.accessor == 'entries') && (v.data_class.class == Class) }.each do |v|
+             v.data_class.send(method, *parameters)
+           end
+         end
+       end
+     end
+
+     # Sets up curl from options.
+     # Possible parameters:
+     # * :user_agent - overrides the default user agent.
+     # * :compress - any value to enable compression
+     # * :enable_cookies - boolean
+     # * :cookiefile - file to read cookies
+     # * :cookies - contents of cookies header
+     # * :http_authentication - array containing username, then password
+     # * :proxy_url - proxy url
+     # * :proxy_port - proxy port
+     # * :max_redirects - max number of redirections
+     # * :timeout - timeout
+     # * :ssl_verify_host - boolean
+     # * :ssl_verify_peer - boolean
+     # * :ssl_version - the ssl version to use, see OpenSSL::SSL::SSLContext::METHODS for options
+     def self.setup_easy(curl, options = {})
+       curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
+       curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
+       curl.enable_cookies = options[:enable_cookies] if options.has_key?(:enable_cookies)
+       curl.cookiefile = options[:cookiefile] if options.has_key?(:cookiefile)
+       curl.cookies = options[:cookies] if options.has_key?(:cookies)
+
+       curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
+       curl.proxy_url = options[:proxy_url] if options.has_key?(:proxy_url)
+       curl.proxy_port = options[:proxy_port] if options.has_key?(:proxy_port)
+       curl.max_redirects = options[:max_redirects] if options[:max_redirects]
+       curl.timeout = options[:timeout] if options[:timeout]
+       curl.ssl_verify_host = options[:ssl_verify_host] if options.has_key?(:ssl_verify_host)
+       curl.ssl_verify_peer = options[:ssl_verify_peer] if options.has_key?(:ssl_verify_peer)
+       curl.ssl_version = options[:ssl_version] if options.has_key?(:ssl_version)
+
+       curl.follow_location = true
+     end
+
+     # Fetches and returns the raw XML for each URL provided.
+     #
+     # === Parameters
+     # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
+     # [options<Hash>] Valid keys for this argument are as follows:
+     # * :if_modified_since - Time object representing when the feed was last updated.
+     # * :if_none_match - String that's normally an etag for the request that was stored previously.
+     # * :on_success - Block that gets executed after a successful request.
+     # * :on_failure - Block that gets executed after a failed request.
+     # * all parameters defined in setup_easy
+     # === Returns
+     # A String of XML if a single URL is passed.
+     #
+     # A Hash if multiple URLs are passed. The key will be the URL, and the value the XML.
+     def self.fetch_raw(urls, options = {})
+       url_queue = [*urls]
+       multi = Curl::Multi.new
+       responses = {}
+       url_queue.each do |url|
+         easy = Curl::Easy.new(url) do |curl|
+           setup_easy curl, options
+
+           curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
+           curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
+
+           curl.on_success do |c|
+             responses[url] = decode_content(c)
+           end
+
+           curl.on_complete do |c, err|
+             responses[url] = c.response_code unless responses.has_key?(url)
+           end
+         end
+         multi.add(easy)
+       end
+
+       multi.perform
+       urls.is_a?(String) ? responses.values.first : responses
+     end
+
+     # Fetches and returns the parsed XML for each URL provided.
+     #
+     # === Parameters
+     # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
+     # [options<Hash>] Valid keys for this argument are as follows:
+     # * :user_agent - String that overrides the default user agent.
+     # * :if_modified_since - Time object representing when the feed was last updated.
+     # * :if_none_match - String, an etag for the request that was stored previously.
+     # * :on_success - Block that gets executed after a successful request.
+     # * :on_failure - Block that gets executed after a failed request.
+     # === Returns
+     # A Feed object if a single URL is passed.
+     #
+     # A Hash if multiple URLs are passed. The key will be the URL, and the value the Feed object.
+     def self.fetch_and_parse(urls, options = {})
+       url_queue = [*urls]
+       multi = Curl::Multi.new
+       responses = {}
+
+       # I broke these down so I would only try to do 30 simultaneously because
+       # I was getting weird errors when doing a lot. As one finishes it pops another off the queue.
+       url_queue.slice!(0, 30).each do |url|
+         add_url_to_multi(multi, url, url_queue, responses, options)
+       end
+
+       multi.perform
+       return urls.is_a?(String) ? responses.values.first : responses
+     end
+
+     # Decodes the XML document if it was compressed.
+     #
+     # === Parameters
+     # [c<Curl::Easy>] The Curl::Easy response object from the request.
+     # === Returns
+     # A decoded string of XML.
+     def self.decode_content(c)
+       if c.header_str.match(/Content-Encoding: gzip/i)
+         begin
+           gz = Zlib::GzipReader.new(StringIO.new(c.body_str))
+           xml = gz.read
+           gz.close
+         rescue Zlib::GzipFile::Error
+           # Maybe this is not gzipped?
+           xml = c.body_str
+         end
+       elsif c.header_str.match(/Content-Encoding: deflate/i)
+         xml = Zlib::Inflate.inflate(c.body_str)
+       else
+         xml = c.body_str
+       end
+
+       xml
+     end
+
+     # Updates each feed for each Feed object provided.
+     #
+     # === Parameters
+     # [feeds<Feed> or <Array>] A single feed object, or an array of feed objects.
+     # [options<Hash>] Valid keys for this argument are as follows:
+     # * :on_success - Block that gets executed after a successful request.
+     # * :on_failure - Block that gets executed after a failed request.
+     # * all parameters defined in setup_easy
+     # === Returns
+     # An updated Feed object if a single URL is passed.
+     #
+     # A Hash if multiple Feeds are passed. The key will be the URL, and the value the updated Feed object.
+     def self.update(feeds, options = {})
+       feed_queue = [*feeds]
+       multi = Curl::Multi.new
+       responses = {}
+
+       feed_queue.slice!(0, 30).each do |feed|
+         add_feed_to_multi(multi, feed, feed_queue, responses, options)
+       end
+
+       multi.perform
+       feeds.is_a?(Array) ? responses : responses.values.first
+     end
+
+     # An abstraction for adding a feed by URL to the passed Curl::Multi stack.
+     #
+     # === Parameters
+     # [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
+     # [url<String>] The URL of the feed that you would like to be fetched.
+     # [url_queue<Array>] An array of URLs that are queued for request.
+     # [responses<Hash>] Existing responses that you want the response from the request added to.
+     # [options<Hash>] Valid keys for this argument are as follows:
+     # * :on_success - Block that gets executed after a successful request.
+     # * :on_failure - Block that gets executed after a failed request.
+     # * all parameters defined in setup_easy
+     # === Returns
+     # The updated Curl::Multi object with the request details added to its stack.
+     def self.add_url_to_multi(multi, url, url_queue, responses, options)
+       easy = Curl::Easy.new(url) do |curl|
+         setup_easy curl, options
+         curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
+         curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
+
+         curl.on_success do |c|
+           xml = decode_content(c)
+           klass = determine_feed_parser_for_xml(xml)
+
+           if klass
+             begin
+               feed = parse_with klass, xml, &on_parser_failure(url)
+
+               feed.feed_url = c.last_effective_url
+               feed.etag = etag_from_header(c.header_str)
+               feed.last_modified = last_modified_from_header(c.header_str)
+               responses[url] = feed
+               options[:on_success].call(url, feed) if options.has_key?(:on_success)
+             rescue Exception => e
+               call_on_failure(c, e, options[:on_failure])
+             end
+           else
+             call_on_failure(c, "Can't determine a parser", options[:on_failure])
+           end
+         end
+
+         #
+         # trigger on_failure for 404s
+         #
+         curl.on_complete do |c|
+           add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
+           responses[url] = c.response_code unless responses.has_key?(url)
+         end
+
+         curl.on_redirect do |c|
+           if c.response_code == 304 # it's not modified. this isn't an error condition
+             options[:on_success].call(url, nil) if options.has_key?(:on_success)
+           end
+         end
+
+         curl.on_missing do |c|
+           if c.response_code == 404 && options.has_key?(:on_failure)
+             call_on_failure(c, 'Server returned a 404', options[:on_failure])
+           end
+         end
+
+         curl.on_failure do |c, err|
+           responses[url] = c.response_code
+           call_on_failure(c, err, options[:on_failure])
+         end
+       end
+       multi.add(easy)
+     end
+
+     # An abstraction for adding a feed by a Feed object to the passed Curl::Multi stack.
+     #
+     # === Parameters
+     # [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
+     # [feed<Feed>] A feed object that you would like to be fetched.
+     # [feed_queue<Array>] An array of feed objects that are queued for request.
+     # [responses<Hash>] Existing responses that you want the response from the request added to.
+     # [options<Hash>] Valid keys for this argument are as follows:
+     # * :on_success - Block that gets executed after a successful request.
+     # * :on_failure - Block that gets executed after a failed request.
+     # * all parameters defined in setup_easy
+     # === Returns
+     # The updated Curl::Multi object with the request details added to its stack.
+     def self.add_feed_to_multi(multi, feed, feed_queue, responses, options)
+       easy = Curl::Easy.new(feed.feed_url) do |curl|
+         setup_easy curl, options
+         curl.headers["If-Modified-Since"] = feed.last_modified.httpdate if feed.last_modified
+         curl.headers["If-Modified-Since"] = options[:if_modified_since] if options[:if_modified_since] && (!feed.last_modified || (Time.parse(options[:if_modified_since].to_s) > feed.last_modified))
+         curl.headers["If-None-Match"] = feed.etag if feed.etag
+
+         curl.on_success do |c|
+           begin
+             updated_feed = Feed.parse c.body_str, &on_parser_failure(feed.feed_url)
+
+             updated_feed.feed_url = c.last_effective_url
+             updated_feed.etag = etag_from_header(c.header_str)
+             updated_feed.last_modified = last_modified_from_header(c.header_str)
+             feed.update_from_feed(updated_feed)
+             responses[feed.feed_url] = feed
+             options[:on_success].call(feed) if options.has_key?(:on_success)
+           rescue Exception => e
+             call_on_failure(c, e, options[:on_failure])
+           end
+         end
+
+         curl.on_failure do |c, err| # response code 50X
+           responses[feed.feed_url] = c.response_code
+           call_on_failure(c, 'Server returned a 404', options[:on_failure])
+         end
+
+         curl.on_redirect do |c, err| # response code 30X
+           if c.response_code == 304
+             options[:on_success].call(feed) if options.has_key?(:on_success)
+           else
+             responses[feed.feed_url] = c.response_code
+             call_on_failure(c, err, options[:on_failure])
+           end
+         end
+
+         curl.on_complete do |c|
+           add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
+           responses[feed.feed_url] = feed unless responses.has_key?(feed.feed_url)
+         end
+       end
+       multi.add(easy)
+     end
+
+     # Determines the etag from the request headers.
+     #
+     # === Parameters
+     # [header<String>] Raw request header returned from the request
+     # === Returns
+     # A string of the etag or nil if it cannot be found in the headers.
+     def self.etag_from_header(header)
+       header =~ /.*ETag:\s(.*)\r/
+       $1
+     end
+
+     # Determines the last modified date from the request headers.
+     #
+     # === Parameters
+     # [header<String>] Raw request header returned from the request
+     # === Returns
+     # A Time object of the last modified date or nil if it cannot be found in the headers.
+     def self.last_modified_from_header(header)
+       header =~ /.*Last-Modified:\s(.*)\r/
+       Time.parse_safely($1) if $1
+     end
+
+     class << self
+       private
+
+       def on_parser_failure(url)
+         Proc.new { |message| raise "Error while parsing [#{url}] #{message}" }
+       end
+
+       def call_on_failure(c, error, on_failure)
+         if on_failure
+           if on_failure.arity == 4
+             warn 'on_failure proc with deprecated arity 4 should include a fifth parameter containing the error'
+             on_failure.call(c.url, c.response_code, c.header_str, c.body_str)
+           elsif on_failure.arity == 2
+             on_failure.call(c, error)
+           else
+             warn "on_failure proc with invalid parameters number #{on_failure.arity} instead of 2, ignoring it"
+           end
+         end
+       end
+     end
+   end
+ end
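Fetching is callback-driven end to end: :on_success receives the URL and the parsed feed, while :on_failure receives the Curl::Easy handle and the error. A minimal sketch (the feed URLs are illustrative):

    require 'feedjira'

    urls = ['http://xkcd.com/atom.xml', 'http://daringfireball.net/feeds/main']

    Feedjira::Feed.fetch_and_parse(urls,
      timeout: 10,
      compress: true,
      on_success: lambda { |url, feed| puts "#{url}: #{feed.entries.size} entries" },
      on_failure: lambda { |curl, err| warn "#{curl.url} failed: #{err}" })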