feedzirra 0.7.1 → 0.8.0
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Gemfile +0 -14
- data/README.md +2 -241
- data/feedzirra.gemspec +2 -8
- data/lib/feedzirra.rb +2 -15
- data/lib/feedzirra/version.rb +1 -1
- metadata +7 -182
- data/.rspec +0 -1
- data/.travis.yml +0 -8
- data/Guardfile +0 -5
- data/Rakefile +0 -6
- data/benchmarks/README.md +0 -90
- data/benchmarks/basic.rb +0 -31
- data/benchmarks/feed_list.txt +0 -10
- data/benchmarks/feed_xml/apple.xml +0 -149
- data/benchmarks/feed_xml/cnn.xml +0 -278
- data/benchmarks/feed_xml/daring_fireball.xml +0 -1697
- data/benchmarks/feed_xml/engadget.xml +0 -604
- data/benchmarks/feed_xml/feedzirra_commits.xml +0 -370
- data/benchmarks/feed_xml/gizmodo.xml +0 -2
- data/benchmarks/feed_xml/loop.xml +0 -441
- data/benchmarks/feed_xml/rails.xml +0 -1938
- data/benchmarks/feed_xml/white_house.xml +0 -951
- data/benchmarks/feed_xml/xkcd.xml +0 -2
- data/benchmarks/fetching_systems.rb +0 -23
- data/benchmarks/other_libraries.rb +0 -73
- data/lib/feedzirra/core_ext.rb +0 -3
- data/lib/feedzirra/core_ext/date.rb +0 -19
- data/lib/feedzirra/core_ext/string.rb +0 -9
- data/lib/feedzirra/core_ext/time.rb +0 -31
- data/lib/feedzirra/feed.rb +0 -459
- data/lib/feedzirra/feed_entry_utilities.rb +0 -66
- data/lib/feedzirra/feed_utilities.rb +0 -103
- data/lib/feedzirra/parser.rb +0 -20
- data/lib/feedzirra/parser/atom.rb +0 -61
- data/lib/feedzirra/parser/atom_entry.rb +0 -34
- data/lib/feedzirra/parser/atom_feed_burner.rb +0 -22
- data/lib/feedzirra/parser/atom_feed_burner_entry.rb +0 -35
- data/lib/feedzirra/parser/google_docs_atom.rb +0 -28
- data/lib/feedzirra/parser/google_docs_atom_entry.rb +0 -29
- data/lib/feedzirra/parser/itunes_rss.rb +0 -50
- data/lib/feedzirra/parser/itunes_rss_item.rb +0 -41
- data/lib/feedzirra/parser/itunes_rss_owner.rb +0 -12
- data/lib/feedzirra/parser/rss.rb +0 -24
- data/lib/feedzirra/parser/rss_entry.rb +0 -37
- data/lib/feedzirra/parser/rss_feed_burner.rb +0 -23
- data/lib/feedzirra/parser/rss_feed_burner_entry.rb +0 -43
- data/spec/feedzirra/feed_entry_utilities_spec.rb +0 -62
- data/spec/feedzirra/feed_spec.rb +0 -762
- data/spec/feedzirra/feed_utilities_spec.rb +0 -273
- data/spec/feedzirra/parser/atom_entry_spec.rb +0 -86
- data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +0 -47
- data/spec/feedzirra/parser/atom_feed_burner_spec.rb +0 -56
- data/spec/feedzirra/parser/atom_spec.rb +0 -76
- data/spec/feedzirra/parser/google_docs_atom_entry_spec.rb +0 -22
- data/spec/feedzirra/parser/google_docs_atom_spec.rb +0 -31
- data/spec/feedzirra/parser/itunes_rss_item_spec.rb +0 -63
- data/spec/feedzirra/parser/itunes_rss_owner_spec.rb +0 -18
- data/spec/feedzirra/parser/itunes_rss_spec.rb +0 -58
- data/spec/feedzirra/parser/rss_entry_spec.rb +0 -85
- data/spec/feedzirra/parser/rss_feed_burner_entry_spec.rb +0 -85
- data/spec/feedzirra/parser/rss_feed_burner_spec.rb +0 -57
- data/spec/feedzirra/parser/rss_spec.rb +0 -57
- data/spec/sample_feeds/AmazonWebServicesBlog.xml +0 -797
- data/spec/sample_feeds/AmazonWebServicesBlogFirstEntryContent.xml +0 -63
- data/spec/sample_feeds/AtomFeedWithSpacesAroundEquals.xml +0 -61
- data/spec/sample_feeds/FeedBurnerUrlNoAlternate.xml +0 -28
- data/spec/sample_feeds/GoogleDocsList.xml +0 -188
- data/spec/sample_feeds/HREFConsideredHarmful.xml +0 -314
- data/spec/sample_feeds/HREFConsideredHarmfulFirstEntry.xml +0 -22
- data/spec/sample_feeds/ITunesWithSpacesInAttributes.xml +0 -63
- data/spec/sample_feeds/PaulDixExplainsNothing.xml +0 -175
- data/spec/sample_feeds/PaulDixExplainsNothingAlternate.xml +0 -175
- data/spec/sample_feeds/PaulDixExplainsNothingFirstEntryContent.xml +0 -19
- data/spec/sample_feeds/PaulDixExplainsNothingWFW.xml +0 -174
- data/spec/sample_feeds/SamRuby.xml +0 -583
- data/spec/sample_feeds/TechCrunch.xml +0 -1515
- data/spec/sample_feeds/TechCrunchFirstEntry.xml +0 -9
- data/spec/sample_feeds/TechCrunchFirstEntryDescription.xml +0 -3
- data/spec/sample_feeds/TenderLovemaking.xml +0 -516
- data/spec/sample_feeds/TenderLovemakingFirstEntry.xml +0 -66
- data/spec/sample_feeds/TrotterCashionHome.xml +0 -611
- data/spec/sample_feeds/TypePadNews.xml +0 -368
- data/spec/sample_feeds/atom_with_link_tag_for_url_unmarked.xml +0 -31
- data/spec/sample_feeds/itunes.xml +0 -67
- data/spec/sample_feeds/pet_atom.xml +0 -497
- data/spec/spec_helper.rb +0 -88
data/benchmarks/feed_xml/xkcd.xml
DELETED
@@ -1,2 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title>xkcd.com</title><link href="http://xkcd.com/" rel="alternate"></link><id>http://xkcd.com/</id><updated>2013-11-29T00:00:00Z</updated><entry><title>Oort Cloud</title><link href="http://xkcd.com/1297/" rel="alternate"></link><updated>2013-11-29T00:00:00Z</updated><id>http://xkcd.com/1297/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/oort_cloud.png" title="... I wanna try. Hang on, be right back." alt="... I wanna try. Hang on, be right back." /&gt;</summary></entry><entry><title>Git Commit</title><link href="http://xkcd.com/1296/" rel="alternate"></link><updated>2013-11-27T00:00:00Z</updated><id>http://xkcd.com/1296/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/git_commit.png" title="Merge branch 'asdfasjkfdlas/alkdjf' into sdkjfls-final" alt="Merge branch 'asdfasjkfdlas/alkdjf' into sdkjfls-final" /&gt;</summary></entry><entry><title>New Study</title><link href="http://xkcd.com/1295/" rel="alternate"></link><updated>2013-11-25T00:00:00Z</updated><id>http://xkcd.com/1295/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/new_study.png" title="When the results are published, no one will be sure whether to report on them again." alt="When the results are published, no one will be sure whether to report on them again." /&gt;</summary></entry><entry><title>Telescope Names</title><link href="http://xkcd.com/1294/" rel="alternate"></link><updated>2013-11-22T00:00:00Z</updated><id>http://xkcd.com/1294/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/telescope_names.png" title="The Thirty Meter Telescope will be renamed The Flesh-Searing Eye on the Volcano." alt="The Thirty Meter Telescope will be renamed The Flesh-Searing Eye on the Volcano." /&gt;</summary></entry></feed>
data/benchmarks/fetching_systems.rb
DELETED
@@ -1,23 +0,0 @@
-require 'benchmark'
-require 'net/http'
-require 'curb'
-
-urls = ['http://www.google.com'] * 100
-
-Benchmark.bm(11) do |b|
-  b.report('Net::HTTP') do
-    urls.each do |url|
-      Net::HTTP.get URI.parse url
-    end
-  end
-
-  b.report('Curl::Easy') do
-    urls.each do |url|
-      Curl::Easy.perform url
-    end
-  end
-
-  b.report('Curl::Multi') do
-    Curl::Multi.get urls
-  end
-end
data/benchmarks/other_libraries.rb
DELETED
@@ -1,73 +0,0 @@
-require 'benchmark'
-require 'feedzirra'
-require 'simple-rss'
-require 'feed-normalizer'
-require 'feed_me'
-
-iterations = 10
-urls = File.readlines(File.dirname(__FILE__) + '/feed_list.txt')
-files = Dir.glob(File.dirname(__FILE__) + '/feed_xml/*.xml')
-xmls = files.map { |file| File.open(file).read }
-
-# suppress warnings
-$VERBOSE = nil
-
-puts 'Parsing benchmarks'
-
-Benchmark.bm(15) do |b|
-  b.report('feedzirra') do
-    iterations.times do
-      xmls.each { |xml| Feedzirra::Feed.parse xml }
-    end
-  end
-
-  b.report('simple-rss') do
-    iterations.times do
-      xmls.each { |xml| SimpleRSS.parse xml }
-    end
-  end
-
-  b.report('feed-normalizer') do
-    iterations.times do
-      xmls.each { |xml| FeedNormalizer::FeedNormalizer.parse xml }
-    end
-  end
-
-  # incompatible with `ruby-feedparser`, same constant used
-  require 'feed_parser'
-  b.report('feed_parser') do
-    iterations.times do
-      xmls.each { |xml| FeedParser.new(feed_xml: xml).parse }
-    end
-  end
-
-  b.report('feed_me') do
-    iterations.times do
-      xmls.each { |xml| FeedMe.parse xml }
-    end
-  end
-
-  # incompatible with `feed_parser`, same constant used
-  # require 'feedparser'
-  # b.report('ruby-feedparser') do
-  #   iterations.times do
-  #     xmls.each { |xml| FeedParser::Feed::new xml }
-  #   end
-  # end
-end
-
-puts "\nFetch and parse benchmarks"
-
-Benchmark.bm(15) do |b|
-  b.report('feedzirra') do
-    iterations.times { Feedzirra::Feed.fetch_and_parse urls }
-  end
-
-  # incompatible with `ruby-feedparser`, same constant used
-  require 'feed_parser'
-  b.report('feed_parser') do
-    iterations.times do
-      urls.each { |url| FeedParser.new(url: url).parse }
-    end
-  end
-end
data/lib/feedzirra/core_ext/date.rb
DELETED
@@ -1,19 +0,0 @@
-# Date code pulled and adapted from:
-# Ruby Cookbook by Lucas Carlson and Leonard Richardson
-# Published by O'Reilly
-# ISBN: 0-596-52369-6
-class Date
-  def feed_utils_to_gm_time
-    feed_utils_to_time(new_offset, :gm)
-  end
-
-  def feed_utils_to_local_time
-    feed_utils_to_time(new_offset(DateTime.now.offset - offset), :local)
-  end
-
-  private
-  def feed_utils_to_time(dest, method)
-    Time.send(method, dest.year, dest.month, dest.day, dest.hour, dest.min,
-              dest.sec, dest.zone)
-  end
-end
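
The removed `Date#feed_utils_to_*` helpers converted a parsed `DateTime` into a `Time`. A minimal usage sketch, assuming the 0.7.1 load path (`feedzirra/core_ext/date`) is available:

```ruby
require 'date'
require 'feedzirra/core_ext/date' # assumed 0.7.1 load path

published = DateTime.parse('2013-11-29T00:00:00-05:00')
published.feed_utils_to_gm_time    # the same instant as a Time, shifted to GMT
published.feed_utils_to_local_time # the same instant as a Time in the local zone
```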
data/lib/feedzirra/core_ext/time.rb
DELETED
@@ -1,31 +0,0 @@
-require "time"
-require "date"
-
-class Time
-  # Parse a time string and convert it to UTC without raising errors.
-  # Parses a flattened 14-digit time (YYYYmmddHHMMSS) as UTC.
-  #
-  # === Parameters
-  # [dt<String or Time>] Time definition to be parsed.
-  #
-  # === Returns
-  # A Time instance in UTC or nil if there were errors while parsing.
-  def self.parse_safely(dt)
-    if dt
-      case
-      when dt.is_a?(Time)
-        dt.utc
-      when dt.respond_to?(:empty?) && dt.empty?
-        nil
-      when dt.respond_to?(:to_datetime)
-        dt.to_datetime.utc
-      when dt.to_s =~ /\A\d{14}\z/
-        parse("#{dt.to_s}Z", true)
-      else
-        parse(dt.to_s, true).utc
-      end
-    end
-  rescue StandardError
-    nil
-  end unless method_defined?(:parse_safely)
-end
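
Following the branches in the removed method above, `Time.parse_safely` always returned a UTC time or nil, and swallowed parse errors. A hedged usage sketch, assuming the 0.7.1 load path:

```ruby
require 'feedzirra/core_ext/time' # assumed 0.7.1 load path

Time.parse_safely(Time.now)                        # a Time is returned converted to UTC
Time.parse_safely('')                              # empty string => nil
Time.parse_safely('20131129000000')                # flattened 14-digit time, parsed as UTC
Time.parse_safely('Fri, 29 Nov 2013 00:00:00 GMT') # anything else goes through Time.parse(...).utc
Time.parse_safely('not a date')                    # parse errors are rescued => nil
```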
data/lib/feedzirra/feed.rb
DELETED
@@ -1,459 +0,0 @@
-module Feedzirra
-  class Feed
-    USER_AGENT = "feedzirra http://github.com/pauldix/feedzirra/tree/master"
-
-    # Passes raw XML and callbacks to a parser.
-    # === Parameters
-    # [parser<Object>] The parser to pass arguments to - must respond to
-    #   `parse` and should return a Feed object.
-    # [xml<String>] The XML that you would like parsed.
-    # === Returns
-    # An instance of the parser feed type.
-    def self.parse_with(parser, xml, &block)
-      parser.parse xml, &block
-    end
-
-    # Takes a raw XML feed and attempts to parse it. If no parser is available a Feedzirra::NoParserAvailable exception is raised.
-    # You can pass a block to be called when there's an error during the parsing.
-    # === Parameters
-    # [xml<String>] The XML that you would like parsed.
-    # === Returns
-    # An instance of the determined feed type. By default, one of these:
-    # * Feedzirra::Parser::RSSFeedBurner
-    # * Feedzirra::Parser::GoogleDocsAtom
-    # * Feedzirra::Parser::AtomFeedBurner
-    # * Feedzirra::Parser::Atom
-    # * Feedzirra::Parser::ITunesRSS
-    # * Feedzirra::Parser::RSS
-    # === Raises
-    # Feedzirra::NoParserAvailable : If no valid parser classes could be found for the feed.
-    def self.parse(xml, &block)
-      if parser = determine_feed_parser_for_xml(xml)
-        parse_with parser, xml, &block
-      else
-        raise NoParserAvailable.new("No valid parser for XML.")
-      end
-    end
-
-    # Determines the correct parser class to use for parsing the feed.
-    #
-    # === Parameters
-    # [xml<String>] The XML that you would like to determine the parser for.
-    # === Returns
-    # The class name of the parser that can handle the XML.
-    def self.determine_feed_parser_for_xml(xml)
-      start_of_doc = xml.slice(0, 2000)
-      feed_classes.detect { |klass| klass.able_to_parse?(start_of_doc) }
-    end
-
-    # Adds a new feed parsing class that will be used for parsing.
-    #
-    # === Parameters
-    # [klass<Constant>] The class/constant that you want to register.
-    # === Returns
-    # An updated array of feed parser class names.
-    def self.add_feed_class(klass)
-      feed_classes.unshift klass
-    end
-
-    # Provides a list of registered feed parsing classes.
-    #
-    # === Returns
-    # An array of class names.
-    def self.feed_classes
-      @feed_classes ||= [
-        Feedzirra::Parser::RSSFeedBurner,
-        Feedzirra::Parser::GoogleDocsAtom,
-        Feedzirra::Parser::AtomFeedBurner,
-        Feedzirra::Parser::Atom,
-        Feedzirra::Parser::ITunesRSS,
-        Feedzirra::Parser::RSS
-      ]
-    end
-
-    # Makes all registered feed types look for the passed in element to parse.
-    # This is actually just a call to element (a SAXMachine call) in the class.
-    #
-    # === Parameters
-    # [element_tag<String>] The element tag
-    # [options<Hash>] Valid keys are same as with SAXMachine
-    def self.add_common_feed_element(element_tag, options = {})
-      feed_classes.each do |k|
-        k.element element_tag, options
-      end
-    end
-
-    # Makes all registered feed types look for the passed in elements to parse.
-    # This is actually just a call to elements (a SAXMachine call) in the class.
-    #
-    # === Parameters
-    # [element_tag<String>] The element tag
-    # [options<Hash>] Valid keys are same as with SAXMachine
-    def self.add_common_feed_elements(element_tag, options = {})
-      feed_classes.each do |k|
-        k.elements element_tag, options
-      end
-    end
-
-    # Makes all registered entry types look for the passed in element to parse.
-    # This is actually just a call to element (a SAXMachine call) in the class.
-    #
-    # === Parameters
-    # [element_tag<String>]
-    # [options<Hash>] Valid keys are same as with SAXMachine
-    def self.add_common_feed_entry_element(element_tag, options = {})
-      call_on_each_feed_entry :element, element_tag, options
-    end
-
-    # Makes all registered entry types look for the passed in elements to parse.
-    # This is actually just a call to elements (a SAXMachine call) in the class.
-    #
-    # === Parameters
-    # [element_tag<String>]
-    # [options<Hash>] Valid keys are same as with SAXMachine
-    def self.add_common_feed_entry_elements(element_tag, options = {})
-      call_on_each_feed_entry :elements, element_tag, options
-    end
-
-    # Call a method on all feed entry classes.
-    #
-    # === Parameters
-    # [method<Symbol>] The method name
-    # [parameters<Array>] The method parameters
-    def self.call_on_each_feed_entry(method, *parameters)
-      feed_classes.each do |k|
-        # iterate on the collections defined in the sax collection
-        k.sax_config.collection_elements.each_value do |vl|
-          # vl is a list of CollectionConfig mapped to an attribute name
-          # we'll look for the one set as 'entries' and add the new element
-          vl.find_all { |v| (v.accessor == 'entries') && (v.data_class.class == Class) }.each do |v|
-            v.data_class.send(method, *parameters)
-          end
-        end
-      end
-    end
-
-    # Setup curl from options.
-    # Possible parameters:
-    # * :user_agent - overrides the default user agent.
-    # * :compress - any value to enable compression
-    # * :enable_cookies - boolean
-    # * :cookiefile - file to read cookies
-    # * :cookies - contents of cookies header
-    # * :http_authentication - array containing username, then password
-    # * :proxy_url - proxy url
-    # * :proxy_port - proxy port
-    # * :max_redirects - max number of redirections
-    # * :timeout - timeout
-    # * :ssl_verify_host - boolean
-    # * :ssl_verify_peer - boolean
-    # * :ssl_version - the ssl version to use, see OpenSSL::SSL::SSLContext::METHODS for options
-    def self.setup_easy(curl, options = {})
-      curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
-      curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
-      curl.enable_cookies = options[:enable_cookies] if options.has_key?(:enable_cookies)
-      curl.cookiefile = options[:cookiefile] if options.has_key?(:cookiefile)
-      curl.cookies = options[:cookies] if options.has_key?(:cookies)
-
-      curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
-      curl.proxy_url = options[:proxy_url] if options.has_key?(:proxy_url)
-      curl.proxy_port = options[:proxy_port] if options.has_key?(:proxy_port)
-      curl.max_redirects = options[:max_redirects] if options[:max_redirects]
-      curl.timeout = options[:timeout] if options[:timeout]
-      curl.ssl_verify_host = options[:ssl_verify_host] if options.has_key?(:ssl_verify_host)
-      curl.ssl_verify_peer = options[:ssl_verify_peer] if options.has_key?(:ssl_verify_peer)
-      curl.ssl_version = options[:ssl_version] if options.has_key?(:ssl_version)
-
-      curl.follow_location = true
-    end
-
-    # Fetches and returns the raw XML for each URL provided.
-    #
-    # === Parameters
-    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
-    # [options<Hash>] Valid keys for this argument are as follows:
-    # * :if_modified_since - Time object representing when the feed was last updated.
-    # * :if_none_match - String that's normally an etag for the request that was stored previously.
-    # * :on_success - Block that gets executed after a successful request.
-    # * :on_failure - Block that gets executed after a failed request.
-    # * all parameters defined in setup_easy
-    # === Returns
-    # A String of XML if a single URL is passed.
-    #
-    # A Hash if multiple URLs are passed. The key will be the URL, and the value the XML.
-    def self.fetch_raw(urls, options = {})
-      url_queue = [*urls]
-      multi = Curl::Multi.new
-      responses = {}
-      url_queue.each do |url|
-        easy = Curl::Easy.new(url) do |curl|
-          setup_easy curl, options
-
-          curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
-          curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
-
-          curl.on_success do |c|
-            responses[url] = decode_content(c)
-          end
-
-          curl.on_complete do |c, err|
-            responses[url] = c.response_code unless responses.has_key?(url)
-          end
-        end
-        multi.add(easy)
-      end
-
-      multi.perform
-      urls.is_a?(String) ? responses.values.first : responses
-    end
-
-    # Fetches and returns the parsed XML for each URL provided.
-    #
-    # === Parameters
-    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
-    # [options<Hash>] Valid keys for this argument are as follows:
-    # * :user_agent - String that overrides the default user agent.
-    # * :if_modified_since - Time object representing when the feed was last updated.
-    # * :if_none_match - String, an etag for the request that was stored previously.
-    # * :on_success - Block that gets executed after a successful request.
-    # * :on_failure - Block that gets executed after a failed request.
-    # === Returns
-    # A Feed object if a single URL is passed.
-    #
-    # A Hash if multiple URLs are passed. The key will be the URL, and the value the Feed object.
-    def self.fetch_and_parse(urls, options = {})
-      url_queue = [*urls]
-      multi = Curl::Multi.new
-      responses = {}
-
-      # I broke these down so I would only try to do 30 simultaneously because
-      # I was getting weird errors when doing a lot. As one finishes it pops another off the queue.
-      url_queue.slice!(0, 30).each do |url|
-        add_url_to_multi(multi, url, url_queue, responses, options)
-      end
-
-      multi.perform
-      return urls.is_a?(String) ? responses.values.first : responses
-    end
-
-    # Decodes the XML document if it was compressed.
-    #
-    # === Parameters
-    # [curl_request<Curl::Easy>] The Curl::Easy response object from the request.
-    # === Returns
-    # A decoded string of XML.
-    def self.decode_content(c)
-      if c.header_str.match(/Content-Encoding: gzip/i)
-        begin
-          gz = Zlib::GzipReader.new(StringIO.new(c.body_str))
-          xml = gz.read
-          gz.close
-        rescue Zlib::GzipFile::Error
-          # Maybe this is not gzipped?
-          xml = c.body_str
-        end
-      elsif c.header_str.match(/Content-Encoding: deflate/i)
-        xml = Zlib::Inflate.inflate(c.body_str)
-      else
-        xml = c.body_str
-      end
-
-      xml
-    end
-
-    # Updates each feed for each Feed object provided.
-    #
-    # === Parameters
-    # [feeds<Feed> or <Array>] A single feed object, or an array of feed objects.
-    # [options<Hash>] Valid keys for this argument are as follows:
-    # * :on_success - Block that gets executed after a successful request.
-    # * :on_failure - Block that gets executed after a failed request.
-    # * all parameters defined in setup_easy
-    # === Returns
-    # An updated Feed object if a single URL is passed.
-    #
-    # A Hash if multiple Feeds are passed. The key will be the URL, and the value the updated Feed object.
-    def self.update(feeds, options = {})
-      feed_queue = [*feeds]
-      multi = Curl::Multi.new
-      responses = {}
-
-      feed_queue.slice!(0, 30).each do |feed|
-        add_feed_to_multi(multi, feed, feed_queue, responses, options)
-      end
-
-      multi.perform
-      feeds.is_a?(Array) ? responses : responses.values.first
-    end
-
-    # An abstraction for adding a feed by URL to the passed Curl::Multi stack.
-    #
-    # === Parameters
-    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
-    # [url<String>] The URL of the feed that you would like to be fetched.
-    # [url_queue<Array>] An array of URLs that are queued for request.
-    # [responses<Hash>] Existing responses that you want the response from the request added to.
-    # [feeds<String> or <Array>] A single feed object, or an array of feed objects.
-    # [options<Hash>] Valid keys for this argument are as follows:
-    # * :on_success - Block that gets executed after a successful request.
-    # * :on_failure - Block that gets executed after a failed request.
-    # * all parameters defined in setup_easy
-    # === Returns
-    # The updated Curl::Multi object with the request details added to its stack.
-    def self.add_url_to_multi(multi, url, url_queue, responses, options)
-      easy = Curl::Easy.new(url) do |curl|
-        setup_easy curl, options
-        curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
-        curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
-
-        curl.on_success do |c|
-          xml = decode_content(c)
-          klass = determine_feed_parser_for_xml(xml)
-
-          if klass
-            begin
-              feed = parse_with klass, xml, &on_parser_failure(url)
-
-              feed.feed_url = c.last_effective_url
-              feed.etag = etag_from_header(c.header_str)
-              feed.last_modified = last_modified_from_header(c.header_str)
-              responses[url] = feed
-              options[:on_success].call(url, feed) if options.has_key?(:on_success)
-            rescue Exception => e
-              call_on_failure(c, e, options[:on_failure])
-            end
-          else
-            call_on_failure(c, "Can't determine a parser", options[:on_failure])
-          end
-        end
-
-        #
-        # trigger on_failure for 404s
-        #
-        curl.on_complete do |c|
-          add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
-          responses[url] = c.response_code unless responses.has_key?(url)
-        end
-
-        curl.on_redirect do |c|
-          if c.response_code == 304 # it's not modified. this isn't an error condition
-            options[:on_success].call(url, nil) if options.has_key?(:on_success)
-          end
-        end
-
-        curl.on_missing do |c|
-          if c.response_code == 404 && options.has_key?(:on_failure)
-            call_on_failure(c, 'Server returned a 404', options[:on_failure])
-          end
-        end
-
-        curl.on_failure do |c, err|
-          responses[url] = c.response_code
-          call_on_failure(c, err, options[:on_failure])
-        end
-      end
-      multi.add(easy)
-    end
-
-    # An abstraction for adding a feed by a Feed object to the passed Curl::Multi stack.
-    #
-    # === Parameters
-    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
-    # [feed<Feed>] A feed object that you would like to be fetched.
-    # [feed_queue<Array>] An array of feed objects that are queued for request.
-    # [responses<Hash>] Existing responses that you want the response from the request added to.
-    # [feeds<String> or <Array>] A single feed object, or an array of feed objects.
-    # [options<Hash>] Valid keys for this argument are as follows:
-    # * :on_success - Block that gets executed after a successful request.
-    # * :on_failure - Block that gets executed after a failed request.
-    # * all parameters defined in setup_easy
-    # === Returns
-    # The updated Curl::Multi object with the request details added to its stack.
-    def self.add_feed_to_multi(multi, feed, feed_queue, responses, options)
-      easy = Curl::Easy.new(feed.feed_url) do |curl|
-        setup_easy curl, options
-        curl.headers["If-Modified-Since"] = feed.last_modified.httpdate if feed.last_modified
-        curl.headers["If-Modified-Since"] = options[:if_modified_since] if options[:if_modified_since] && (!feed.last_modified || (Time.parse(options[:if_modified_since].to_s) > feed.last_modified))
-        curl.headers["If-None-Match"] = feed.etag if feed.etag
-
-        curl.on_success do |c|
-          begin
-            updated_feed = Feed.parse c.body_str, &on_parser_failure(feed.feed_url)
-
-            updated_feed.feed_url = c.last_effective_url
-            updated_feed.etag = etag_from_header(c.header_str)
-            updated_feed.last_modified = last_modified_from_header(c.header_str)
-            feed.update_from_feed(updated_feed)
-            responses[feed.feed_url] = feed
-            options[:on_success].call(feed) if options.has_key?(:on_success)
-          rescue Exception => e
-            call_on_failure(c, e, options[:on_failure])
-          end
-        end
-
-        curl.on_failure do |c, err| # response code 50X
-          responses[feed.feed_url] = c.response_code
-          call_on_failure(c, 'Server returned a 404', options[:on_failure])
-        end
-
-        curl.on_redirect do |c, err| # response code 30X
-          if c.response_code == 304
-            options[:on_success].call(feed) if options.has_key?(:on_success)
-          else
-            responses[feed.feed_url] = c.response_code
-            call_on_failure(c, err, options[:on_failure])
-          end
-        end
-
-        curl.on_complete do |c|
-          add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
-          responses[feed.feed_url] = feed unless responses.has_key?(feed.feed_url)
-        end
-      end
-      multi.add(easy)
-    end
-
-    # Determines the etag from the request headers.
-    #
-    # === Parameters
-    # [header<String>] Raw header string returned from the request
-    # === Returns
-    # A string of the etag or nil if it cannot be found in the headers.
-    def self.etag_from_header(header)
-      header =~ /.*ETag:\s(.*)\r/
-      $1
-    end
-
-    # Determines the last modified date from the request headers.
-    #
-    # === Parameters
-    # [header<String>] Raw header string returned from the request
-    # === Returns
-    # A Time object of the last modified date or nil if it cannot be found in the headers.
-    def self.last_modified_from_header(header)
-      header =~ /.*Last-Modified:\s(.*)\r/
-      Time.parse_safely($1) if $1
-    end
-
-    class << self
-      private
-
-      def on_parser_failure(url)
-        Proc.new { |message| raise "Error while parsing [#{url}] #{message}" }
-      end
-
-      def call_on_failure(c, error, on_failure)
-        if on_failure
-          if on_failure.arity == 4
-            warn 'on_failure proc with deprecated arity 4 should include a fifth parameter containing the error'
-            on_failure.call(c.url, c.response_code, c.header_str, c.body_str)
-          elsif on_failure.arity == 2
-            on_failure.call(c, error)
-          else
-            warn "on_failure proc with invalid parameters number #{on_failure.arity} instead of 2, ignoring it"
-          end
-        end
-      end
-    end
-  end
-end
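
For context, the class removed above was the gem's main entry point. A brief sketch of that interface, pieced together from the doc comments in the deleted code (the feed URLs and file name are only illustrative):

```ruby
require 'feedzirra'

# Parse raw XML; raises Feedzirra::NoParserAvailable when no registered
# parser claims the document.
xml = File.read('feed.xml') # any feed XML on disk
feed = Feedzirra::Feed.parse(xml)
feed.entries.each { |entry| puts entry.title }

# Fetch and parse: a single URL returns a Feed, an array returns a Hash
# of URL => Feed, with optional success/failure callbacks.
Feedzirra::Feed.fetch_and_parse(
  ['http://xkcd.com/atom.xml', 'http://daringfireball.net/index.xml'],
  on_success: ->(url, parsed) { puts "fetched #{url}" },
  on_failure: ->(curl, error) { warn error.to_s }
)
```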