feedjira 0.9.0
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +1 -0
- data/.travis.yml +8 -0
- data/CHANGELOG.md +162 -0
- data/Gemfile +17 -0
- data/Guardfile +5 -0
- data/README.md +242 -0
- data/Rakefile +6 -0
- data/benchmarks/README.md +90 -0
- data/benchmarks/basic.rb +31 -0
- data/benchmarks/feed_list.txt +10 -0
- data/benchmarks/feed_xml/apple.xml +149 -0
- data/benchmarks/feed_xml/cnn.xml +278 -0
- data/benchmarks/feed_xml/daring_fireball.xml +1697 -0
- data/benchmarks/feed_xml/engadget.xml +604 -0
- data/benchmarks/feed_xml/feedjira_commits.xml +370 -0
- data/benchmarks/feed_xml/gizmodo.xml +2 -0
- data/benchmarks/feed_xml/loop.xml +441 -0
- data/benchmarks/feed_xml/rails.xml +1938 -0
- data/benchmarks/feed_xml/white_house.xml +951 -0
- data/benchmarks/feed_xml/xkcd.xml +2 -0
- data/benchmarks/fetching_systems.rb +23 -0
- data/benchmarks/other_libraries.rb +73 -0
- data/feedjira.gemspec +27 -0
- data/lib/feedjira.rb +16 -0
- data/lib/feedjira/core_ext.rb +3 -0
- data/lib/feedjira/core_ext/date.rb +19 -0
- data/lib/feedjira/core_ext/string.rb +9 -0
- data/lib/feedjira/core_ext/time.rb +31 -0
- data/lib/feedjira/feed.rb +459 -0
- data/lib/feedjira/feed_entry_utilities.rb +66 -0
- data/lib/feedjira/feed_utilities.rb +103 -0
- data/lib/feedjira/parser.rb +20 -0
- data/lib/feedjira/parser/atom.rb +61 -0
- data/lib/feedjira/parser/atom_entry.rb +34 -0
- data/lib/feedjira/parser/atom_feed_burner.rb +22 -0
- data/lib/feedjira/parser/atom_feed_burner_entry.rb +35 -0
- data/lib/feedjira/parser/google_docs_atom.rb +28 -0
- data/lib/feedjira/parser/google_docs_atom_entry.rb +29 -0
- data/lib/feedjira/parser/itunes_rss.rb +50 -0
- data/lib/feedjira/parser/itunes_rss_item.rb +41 -0
- data/lib/feedjira/parser/itunes_rss_owner.rb +12 -0
- data/lib/feedjira/parser/rss.rb +24 -0
- data/lib/feedjira/parser/rss_entry.rb +37 -0
- data/lib/feedjira/parser/rss_feed_burner.rb +23 -0
- data/lib/feedjira/parser/rss_feed_burner_entry.rb +43 -0
- data/lib/feedjira/version.rb +3 -0
- data/spec/feedjira/feed_entry_utilities_spec.rb +62 -0
- data/spec/feedjira/feed_spec.rb +762 -0
- data/spec/feedjira/feed_utilities_spec.rb +273 -0
- data/spec/feedjira/parser/atom_entry_spec.rb +86 -0
- data/spec/feedjira/parser/atom_feed_burner_entry_spec.rb +47 -0
- data/spec/feedjira/parser/atom_feed_burner_spec.rb +56 -0
- data/spec/feedjira/parser/atom_spec.rb +76 -0
- data/spec/feedjira/parser/google_docs_atom_entry_spec.rb +22 -0
- data/spec/feedjira/parser/google_docs_atom_spec.rb +31 -0
- data/spec/feedjira/parser/itunes_rss_item_spec.rb +63 -0
- data/spec/feedjira/parser/itunes_rss_owner_spec.rb +18 -0
- data/spec/feedjira/parser/itunes_rss_spec.rb +58 -0
- data/spec/feedjira/parser/rss_entry_spec.rb +85 -0
- data/spec/feedjira/parser/rss_feed_burner_entry_spec.rb +85 -0
- data/spec/feedjira/parser/rss_feed_burner_spec.rb +57 -0
- data/spec/feedjira/parser/rss_spec.rb +57 -0
- data/spec/sample_feeds/AmazonWebServicesBlog.xml +797 -0
- data/spec/sample_feeds/AmazonWebServicesBlogFirstEntryContent.xml +63 -0
- data/spec/sample_feeds/AtomFeedWithSpacesAroundEquals.xml +61 -0
- data/spec/sample_feeds/FeedBurnerUrlNoAlternate.xml +28 -0
- data/spec/sample_feeds/GoogleDocsList.xml +188 -0
- data/spec/sample_feeds/HREFConsideredHarmful.xml +314 -0
- data/spec/sample_feeds/HREFConsideredHarmfulFirstEntry.xml +22 -0
- data/spec/sample_feeds/ITunesWithSpacesInAttributes.xml +63 -0
- data/spec/sample_feeds/PaulDixExplainsNothing.xml +175 -0
- data/spec/sample_feeds/PaulDixExplainsNothingAlternate.xml +175 -0
- data/spec/sample_feeds/PaulDixExplainsNothingFirstEntryContent.xml +19 -0
- data/spec/sample_feeds/PaulDixExplainsNothingWFW.xml +174 -0
- data/spec/sample_feeds/SamRuby.xml +583 -0
- data/spec/sample_feeds/TechCrunch.xml +1515 -0
- data/spec/sample_feeds/TechCrunchFirstEntry.xml +9 -0
- data/spec/sample_feeds/TechCrunchFirstEntryDescription.xml +3 -0
- data/spec/sample_feeds/TenderLovemaking.xml +516 -0
- data/spec/sample_feeds/TenderLovemakingFirstEntry.xml +66 -0
- data/spec/sample_feeds/TrotterCashionHome.xml +611 -0
- data/spec/sample_feeds/TypePadNews.xml +368 -0
- data/spec/sample_feeds/atom_with_link_tag_for_url_unmarked.xml +31 -0
- data/spec/sample_feeds/itunes.xml +67 -0
- data/spec/sample_feeds/pet_atom.xml +497 -0
- data/spec/spec_helper.rb +88 -0
- metadata +229 -0
data/benchmarks/feed_xml/xkcd.xml
ADDED
@@ -0,0 +1,2 @@
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en"><title>xkcd.com</title><link href="http://xkcd.com/" rel="alternate"></link><id>http://xkcd.com/</id><updated>2013-11-29T00:00:00Z</updated><entry><title>Oort Cloud</title><link href="http://xkcd.com/1297/" rel="alternate"></link><updated>2013-11-29T00:00:00Z</updated><id>http://xkcd.com/1297/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/oort_cloud.png" title="... I wanna try. Hang on, be right back." alt="... I wanna try. Hang on, be right back." /&gt;</summary></entry><entry><title>Git Commit</title><link href="http://xkcd.com/1296/" rel="alternate"></link><updated>2013-11-27T00:00:00Z</updated><id>http://xkcd.com/1296/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/git_commit.png" title="Merge branch 'asdfasjkfdlas/alkdjf' into sdkjfls-final" alt="Merge branch 'asdfasjkfdlas/alkdjf' into sdkjfls-final" /&gt;</summary></entry><entry><title>New Study</title><link href="http://xkcd.com/1295/" rel="alternate"></link><updated>2013-11-25T00:00:00Z</updated><id>http://xkcd.com/1295/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/new_study.png" title="When the results are published, no one will be sure whether to report on them again." alt="When the results are published, no one will be sure whether to report on them again." /&gt;</summary></entry><entry><title>Telescope Names</title><link href="http://xkcd.com/1294/" rel="alternate"></link><updated>2013-11-22T00:00:00Z</updated><id>http://xkcd.com/1294/</id><summary type="html">&lt;img src="http://imgs.xkcd.com/comics/telescope_names.png" title="The Thirty Meter Telescope will be renamed The Flesh-Searing Eye on the Volcano." alt="The Thirty Meter Telescope will be renamed The Flesh-Searing Eye on the Volcano." /&gt;</summary></entry></feed>
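For orientation, this sample is one of the fixtures the parsing benchmarks below consume. A minimal sketch of parsing it with this release (the field accessors are assumed from the Atom parser shipped in this diff):

    require 'feedjira'

    xml  = File.read('data/benchmarks/feed_xml/xkcd.xml')
    feed = Feedjira::Feed.parse(xml)  # detected as Feedjira::Parser::Atom

    feed.title                # => "xkcd.com"
    feed.entries.first.title  # => "Oort Cloud"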
data/benchmarks/fetching_systems.rb
ADDED
@@ -0,0 +1,23 @@
require 'benchmark'
require 'net/http'
require 'curb'

urls = ['http://www.google.com'] * 100

Benchmark.bm(11) do |b|
  b.report('Net::HTTP') do
    urls.each do |url|
      Net::HTTP.get(URI.parse(url))
    end
  end

  b.report('Curl::Easy') do
    urls.each do |url|
      Curl::Easy.perform url
    end
  end

  b.report('Curl::Multi') do
    Curl::Multi.get urls
  end
end
data/benchmarks/other_libraries.rb
ADDED
@@ -0,0 +1,73 @@
require 'benchmark'
require 'feedjira'
require 'simple-rss'
require 'feed-normalizer'
require 'feed_me'

iterations = 10
urls = File.readlines(File.dirname(__FILE__) + '/feed_list.txt')
files = Dir.glob(File.dirname(__FILE__) + '/feed_xml/*.xml')
xmls = files.map { |file| File.open(file).read }

# suppress warnings
$VERBOSE = nil

puts 'Parsing benchmarks'

Benchmark.bm(15) do |b|
  b.report('feedjira') do
    iterations.times do
      xmls.each { |xml| Feedjira::Feed.parse xml }
    end
  end

  b.report('simple-rss') do
    iterations.times do
      xmls.each { |xml| SimpleRSS.parse xml }
    end
  end

  b.report('feed-normalizer') do
    iterations.times do
      xmls.each { |xml| FeedNormalizer::FeedNormalizer.parse xml }
    end
  end

  # incompatible with `ruby-feedparser`, same constant used
  require 'feed_parser'
  b.report('feed_parser') do
    iterations.times do
      xmls.each { |xml| FeedParser.new(feed_xml: xml).parse }
    end
  end

  b.report('feed_me') do
    iterations.times do
      xmls.each { |xml| FeedMe.parse xml }
    end
  end

  # incompatible with `feed_parser`, same constant used
  # require 'feedparser'
  # b.report('ruby-feedparser') do
  #   iterations.times do
  #     xmls.each { |xml| FeedParser::Feed::new xml }
  #   end
  # end
end

puts "\nFetch and parse benchmarks"

Benchmark.bm(15) do |b|
  b.report('feedjira') do
    iterations.times { Feedjira::Feed.fetch_and_parse urls }
  end

  # incompatible with `ruby-feedparser`, same constant used
  require 'feed_parser'
  b.report('feed_parser') do
    iterations.times do
      urls.each { |url| FeedParser.new(url: url).parse }
    end
  end
end
data/feedjira.gemspec
ADDED
@@ -0,0 +1,27 @@
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/feedjira/version', __FILE__)

Gem::Specification.new do |s|
  s.name    = 'feedjira'
  s.version = Feedjira::VERSION
  s.license = 'MIT'

  s.authors  = ['Paul Dix', 'Julien Kirch', 'Ezekiel Templin', 'Jon Allured']
  s.email    = 'feedjira@gmail.com'
  s.homepage = 'http://feedjira.com'

  s.summary     = 'A feed fetching and parsing library'
  s.description = 'A library designed to retrieve and parse feeds as quickly as possible'

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.require_paths = ['lib']

  s.platform = Gem::Platform::RUBY

  s.add_dependency 'sax-machine', '~> 0.2.1'
  s.add_dependency 'curb', '~> 0.8.1'
  s.add_dependency 'loofah', '~> 1.2.1'

  s.add_development_dependency 'rspec', '~> 2.14.0'
end
data/lib/feedjira.rb
ADDED
@@ -0,0 +1,16 @@
require 'zlib'
require 'curb'
require 'sax-machine'
require 'loofah'

require 'feedjira/core_ext'
require 'feedjira/version'

module Feedjira
  autoload :FeedEntryUtilities, 'feedjira/feed_entry_utilities'
  autoload :FeedUtilities,      'feedjira/feed_utilities'
  autoload :Feed,               'feedjira/feed'
  autoload :Parser,             'feedjira/parser'

  class NoParserAvailable < StandardError; end
end
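Since Feedjira::Feed.parse raises the NoParserAvailable error defined here when no registered parser recognizes the input, callers typically rescue it; a minimal sketch:

    require 'feedjira'

    begin
      Feedjira::Feed.parse('<not-a-feed/>')
    rescue Feedjira::NoParserAvailable => e
      warn "No parser for this document: #{e.message}"
    end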
data/lib/feedjira/core_ext/date.rb
ADDED
@@ -0,0 +1,19 @@
# Date code pulled and adapted from:
# Ruby Cookbook by Lucas Carlson and Leonard Richardson
# Published by O'Reilly
# ISBN: 0-596-52369-6
class Date
  def feed_utils_to_gm_time
    feed_utils_to_time(new_offset, :gm)
  end

  def feed_utils_to_local_time
    feed_utils_to_time(new_offset(DateTime.now.offset - offset), :local)
  end

  private
  def feed_utils_to_time(dest, method)
    Time.send(method, dest.year, dest.month, dest.day, dest.hour, dest.min,
              dest.sec, dest.zone)
  end
end
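A brief usage sketch: these helpers are intended for DateTime values (new_offset is a DateTime method, and DateTime inherits from Date), converting parsed feed dates into Time objects:

    require 'feedjira'

    published = DateTime.parse('2013-11-29T10:00:00+02:00')
    published.feed_utils_to_gm_time     # Time in GMT
    published.feed_utils_to_local_time  # Time in the local zone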
data/lib/feedjira/core_ext/time.rb
ADDED
@@ -0,0 +1,31 @@
require "time"
require "date"

class Time
  # Parse a time string and convert it to UTC without raising errors.
  # Parses a flattened 14-digit time (YYYYmmddHHMMSS) as UTC.
  #
  # === Parameters
  # [dt<String or Time>] Time definition to be parsed.
  #
  # === Returns
  # A Time instance in UTC, or nil if there were errors while parsing.
  def self.parse_safely(dt)
    if dt
      case
      when dt.is_a?(Time)
        dt.utc
      when dt.respond_to?(:empty?) && dt.empty?
        nil
      when dt.respond_to?(:to_datetime)
        dt.to_datetime.utc
      when dt.to_s =~ /\A\d{14}\z/
        parse("#{dt.to_s}Z", true)
      else
        parse(dt.to_s, true).utc
      end
    end
  rescue StandardError
    nil
  end unless method_defined?(:parse_safely)
end
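A minimal sketch of parse_safely's behavior on the input shapes handled above:

    require 'feedjira'

    Time.parse_safely('Fri, 29 Nov 2013 00:00:00 GMT')  # => 2013-11-29 00:00:00 UTC
    Time.parse_safely('20131129000000')                 # 14-digit form, treated as UTC
    Time.parse_safely('')                               # => nil
    Time.parse_safely('not a timestamp')                # => nil, errors are swallowed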
data/lib/feedjira/feed.rb
ADDED
@@ -0,0 +1,459 @@
module Feedjira
  class Feed
    USER_AGENT = 'feedjira http://feedjira.com'

    # Passes raw XML and callbacks to a parser.
    # === Parameters
    # [parser<Object>] The parser to pass arguments to - must respond to
    # `parse` and should return a Feed object.
    # [xml<String>] The XML that you would like parsed.
    # === Returns
    # An instance of the parser feed type.
    def self.parse_with(parser, xml, &block)
      parser.parse xml, &block
    end

    # Takes a raw XML feed and attempts to parse it. If no parser is available,
    # a Feedjira::NoParserAvailable exception is raised. You can pass a block
    # to be called when there's an error during the parsing.
    # === Parameters
    # [xml<String>] The XML that you would like parsed.
    # === Returns
    # An instance of the determined feed type. By default, one of these:
    # * Feedjira::Parser::RSSFeedBurner
    # * Feedjira::Parser::GoogleDocsAtom
    # * Feedjira::Parser::AtomFeedBurner
    # * Feedjira::Parser::Atom
    # * Feedjira::Parser::ITunesRSS
    # * Feedjira::Parser::RSS
    # === Raises
    # Feedjira::NoParserAvailable : If no valid parser classes could be found for the feed.
    def self.parse(xml, &block)
      if parser = determine_feed_parser_for_xml(xml)
        parse_with parser, xml, &block
      else
        raise NoParserAvailable.new("No valid parser for XML.")
      end
    end

    # Determines the correct parser class to use for parsing the feed.
    #
    # === Parameters
    # [xml<String>] The XML for which you would like to determine the parser.
    # === Returns
    # The class name of the parser that can handle the XML.
    def self.determine_feed_parser_for_xml(xml)
      start_of_doc = xml.slice(0, 2000)
      feed_classes.detect { |klass| klass.able_to_parse?(start_of_doc) }
    end

    # Adds a new feed parsing class that will be used for parsing.
    #
    # === Parameters
    # [klass<Constant>] The class/constant that you want to register.
    # === Returns
    # An updated array of feed parser class names.
    def self.add_feed_class(klass)
      feed_classes.unshift klass
    end

    # Provides a list of registered feed parsing classes.
    #
    # === Returns
    # An array of class names.
    def self.feed_classes
      @feed_classes ||= [
        Feedjira::Parser::RSSFeedBurner,
        Feedjira::Parser::GoogleDocsAtom,
        Feedjira::Parser::AtomFeedBurner,
        Feedjira::Parser::Atom,
        Feedjira::Parser::ITunesRSS,
        Feedjira::Parser::RSS
      ]
    end

    # Makes all registered feed types look for the passed-in element to parse.
    # This is actually just a call to element (a SAXMachine call) in the class.
    #
    # === Parameters
    # [element_tag<String>] The element tag
    # [options<Hash>] Valid keys are the same as with SAXMachine
    def self.add_common_feed_element(element_tag, options = {})
      feed_classes.each do |k|
        k.element element_tag, options
      end
    end

    # Makes all registered feed types look for the passed-in elements to parse.
    # This is actually just a call to elements (a SAXMachine call) in the class.
    #
    # === Parameters
    # [element_tag<String>] The element tag
    # [options<Hash>] Valid keys are the same as with SAXMachine
    def self.add_common_feed_elements(element_tag, options = {})
      feed_classes.each do |k|
        k.elements element_tag, options
      end
    end

    # Makes all registered entry types look for the passed-in element to parse.
    # This is actually just a call to element (a SAXMachine call) in the class.
    #
    # === Parameters
    # [element_tag<String>]
    # [options<Hash>] Valid keys are the same as with SAXMachine
    def self.add_common_feed_entry_element(element_tag, options = {})
      call_on_each_feed_entry :element, element_tag, options
    end

    # Makes all registered entry types look for the passed-in elements to parse.
    # This is actually just a call to elements (a SAXMachine call) in the class.
    #
    # === Parameters
    # [element_tag<String>]
    # [options<Hash>] Valid keys are the same as with SAXMachine
    def self.add_common_feed_entry_elements(element_tag, options = {})
      call_on_each_feed_entry :elements, element_tag, options
    end

    # Calls a method on all feed entry classes.
    #
    # === Parameters
    # [method<Symbol>] The method name
    # [parameters<Array>] The method parameters
    def self.call_on_each_feed_entry(method, *parameters)
      feed_classes.each do |k|
        # iterate over the collections defined in the sax config
        k.sax_config.collection_elements.each_value do |vl|
          # vl is a list of CollectionConfig objects mapped to an attribute name;
          # we look for the one set as 'entries' and add the new element to it
          vl.find_all { |v| (v.accessor == 'entries') && (v.data_class.class == Class) }.each do |v|
            v.data_class.send(method, *parameters)
          end
        end
      end
    end

    # Sets up a curl handle from the given options.
    # Possible parameters:
    # * :user_agent - overrides the default user agent
    # * :compress - any value to enable compression
    # * :enable_cookies - boolean
    # * :cookiefile - file to read cookies from
    # * :cookies - contents of the cookies header
    # * :http_authentication - array containing the username, then the password
    # * :proxy_url - proxy URL
    # * :proxy_port - proxy port
    # * :max_redirects - maximum number of redirections
    # * :timeout - request timeout
    # * :ssl_verify_host - boolean
    # * :ssl_verify_peer - boolean
    # * :ssl_version - the SSL version to use; see OpenSSL::SSL::SSLContext::METHODS for options
    def self.setup_easy(curl, options = {})
      curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
      curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
      curl.enable_cookies = options[:enable_cookies] if options.has_key?(:enable_cookies)
      curl.cookiefile = options[:cookiefile] if options.has_key?(:cookiefile)
      curl.cookies = options[:cookies] if options.has_key?(:cookies)

      curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
      curl.proxy_url = options[:proxy_url] if options.has_key?(:proxy_url)
      curl.proxy_port = options[:proxy_port] if options.has_key?(:proxy_port)
      curl.max_redirects = options[:max_redirects] if options[:max_redirects]
      curl.timeout = options[:timeout] if options[:timeout]
      curl.ssl_verify_host = options[:ssl_verify_host] if options.has_key?(:ssl_verify_host)
      curl.ssl_verify_peer = options[:ssl_verify_peer] if options.has_key?(:ssl_verify_peer)
      curl.ssl_version = options[:ssl_version] if options.has_key?(:ssl_version)

      curl.follow_location = true
    end

    # Fetches and returns the raw XML for each URL provided.
    #
    # === Parameters
    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
    # [options<Hash>] Valid keys for this argument are as follows:
    # * :if_modified_since - Time object representing when the feed was last updated.
    # * :if_none_match - String that's normally an etag for the request that was stored previously.
    # * :on_success - Block that gets executed after a successful request.
    # * :on_failure - Block that gets executed after a failed request.
    # * all parameters defined in setup_easy
    # === Returns
    # A String of XML if a single URL is passed.
    #
    # A Hash if multiple URLs are passed. The key will be the URL, and the value the XML.
    def self.fetch_raw(urls, options = {})
      url_queue = [*urls]
      multi = Curl::Multi.new
      responses = {}
      url_queue.each do |url|
        easy = Curl::Easy.new(url) do |curl|
          setup_easy curl, options

          curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
          curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)

          curl.on_success do |c|
            responses[url] = decode_content(c)
          end

          curl.on_complete do |c, err|
            responses[url] = c.response_code unless responses.has_key?(url)
          end
        end
        multi.add(easy)
      end

      multi.perform
      urls.is_a?(String) ? responses.values.first : responses
    end

    # Fetches and returns the parsed feed for each URL provided.
    #
    # === Parameters
    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
    # [options<Hash>] Valid keys for this argument are as follows:
    # * :user_agent - String that overrides the default user agent.
    # * :if_modified_since - Time object representing when the feed was last updated.
    # * :if_none_match - String, an etag for the request that was stored previously.
    # * :on_success - Block that gets executed after a successful request.
    # * :on_failure - Block that gets executed after a failed request.
    # === Returns
    # A Feed object if a single URL is passed.
    #
    # A Hash if multiple URLs are passed. The key will be the URL, and the value the Feed object.
    def self.fetch_and_parse(urls, options = {})
      url_queue = [*urls]
      multi = Curl::Multi.new
      responses = {}

      # Request at most 30 URLs simultaneously; larger batches caused odd
      # errors. As each request finishes, the next URL is popped off the queue.
      url_queue.slice!(0, 30).each do |url|
        add_url_to_multi(multi, url, url_queue, responses, options)
      end

      multi.perform
      urls.is_a?(String) ? responses.values.first : responses
    end

    # Decodes the XML document if it was compressed.
    #
    # === Parameters
    # [curl_request<Curl::Easy>] The Curl::Easy response object from the request.
    # === Returns
    # A decoded string of XML.
    def self.decode_content(c)
      if c.header_str.match(/Content-Encoding: gzip/i)
        begin
          gz = Zlib::GzipReader.new(StringIO.new(c.body_str))
          xml = gz.read
          gz.close
        rescue Zlib::GzipFile::Error
          # Maybe this is not gzipped?
          xml = c.body_str
        end
      elsif c.header_str.match(/Content-Encoding: deflate/i)
        xml = Zlib::Inflate.inflate(c.body_str)
      else
        xml = c.body_str
      end

      xml
    end

    # Updates each feed for each Feed object provided.
    #
    # === Parameters
    # [feeds<Feed> or <Array>] A single feed object, or an array of feed objects.
    # [options<Hash>] Valid keys for this argument are as follows:
    # * :on_success - Block that gets executed after a successful request.
    # * :on_failure - Block that gets executed after a failed request.
    # * all parameters defined in setup_easy
    # === Returns
    # An updated Feed object if a single feed is passed.
    #
    # A Hash if multiple feeds are passed. The key will be the URL, and the value the updated Feed object.
    def self.update(feeds, options = {})
      feed_queue = [*feeds]
      multi = Curl::Multi.new
      responses = {}

      feed_queue.slice!(0, 30).each do |feed|
        add_feed_to_multi(multi, feed, feed_queue, responses, options)
      end

      multi.perform
      feeds.is_a?(Array) ? responses : responses.values.first
    end

    # An abstraction for adding a feed by URL to the passed Curl::Multi stack.
    #
    # === Parameters
    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
    # [url<String>] The URL of the feed that you would like to be fetched.
    # [url_queue<Array>] An array of URLs that are queued for request.
    # [responses<Hash>] Existing responses that you want the response from the request added to.
    # [options<Hash>] Valid keys for this argument are as follows:
    # * :on_success - Block that gets executed after a successful request.
    # * :on_failure - Block that gets executed after a failed request.
    # * all parameters defined in setup_easy
    # === Returns
    # The updated Curl::Multi object with the request details added to its stack.
    def self.add_url_to_multi(multi, url, url_queue, responses, options)
      easy = Curl::Easy.new(url) do |curl|
        setup_easy curl, options
        curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
        curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)

        curl.on_success do |c|
          xml = decode_content(c)
          klass = determine_feed_parser_for_xml(xml)

          if klass
            begin
              feed = parse_with klass, xml, &on_parser_failure(url)

              feed.feed_url = c.last_effective_url
              feed.etag = etag_from_header(c.header_str)
              feed.last_modified = last_modified_from_header(c.header_str)
              responses[url] = feed
              options[:on_success].call(url, feed) if options.has_key?(:on_success)
            rescue Exception => e
              call_on_failure(c, e, options[:on_failure])
            end
          else
            call_on_failure(c, "Can't determine a parser", options[:on_failure])
          end
        end

        # trigger on_failure for 404s
        curl.on_complete do |c|
          add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
          responses[url] = c.response_code unless responses.has_key?(url)
        end

        curl.on_redirect do |c|
          if c.response_code == 304 # it's not modified; this isn't an error condition
            options[:on_success].call(url, nil) if options.has_key?(:on_success)
          end
        end

        curl.on_missing do |c|
          if c.response_code == 404 && options.has_key?(:on_failure)
            call_on_failure(c, 'Server returned a 404', options[:on_failure])
          end
        end

        curl.on_failure do |c, err|
          responses[url] = c.response_code
          call_on_failure(c, err, options[:on_failure])
        end
      end
      multi.add(easy)
    end

    # An abstraction for adding a feed by a Feed object to the passed Curl::Multi stack.
    #
    # === Parameters
    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
    # [feed<Feed>] A feed object that you would like to be fetched.
    # [feed_queue<Array>] An array of feed objects that are queued for request.
    # [responses<Hash>] Existing responses that you want the response from the request added to.
    # [options<Hash>] Valid keys for this argument are as follows:
    # * :on_success - Block that gets executed after a successful request.
    # * :on_failure - Block that gets executed after a failed request.
    # * all parameters defined in setup_easy
    # === Returns
    # The updated Curl::Multi object with the request details added to its stack.
    def self.add_feed_to_multi(multi, feed, feed_queue, responses, options)
      easy = Curl::Easy.new(feed.feed_url) do |curl|
        setup_easy curl, options
        curl.headers["If-Modified-Since"] = feed.last_modified.httpdate if feed.last_modified
        curl.headers["If-Modified-Since"] = options[:if_modified_since] if options[:if_modified_since] && (!feed.last_modified || (Time.parse(options[:if_modified_since].to_s) > feed.last_modified))
        curl.headers["If-None-Match"] = feed.etag if feed.etag

        curl.on_success do |c|
          begin
            updated_feed = Feed.parse c.body_str, &on_parser_failure(feed.feed_url)

            updated_feed.feed_url = c.last_effective_url
            updated_feed.etag = etag_from_header(c.header_str)
            updated_feed.last_modified = last_modified_from_header(c.header_str)
            feed.update_from_feed(updated_feed)
            responses[feed.feed_url] = feed
            options[:on_success].call(feed) if options.has_key?(:on_success)
          rescue Exception => e
            call_on_failure(c, e, options[:on_failure])
          end
        end

        curl.on_failure do |c, err| # response code 50X
          responses[feed.feed_url] = c.response_code
          call_on_failure(c, err, options[:on_failure])
        end

        curl.on_redirect do |c, err| # response code 30X
          if c.response_code == 304
            options[:on_success].call(feed) if options.has_key?(:on_success)
          else
            responses[feed.feed_url] = c.response_code
            call_on_failure(c, err, options[:on_failure])
          end
        end

        curl.on_complete do |c|
          add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
          responses[feed.feed_url] = feed unless responses.has_key?(feed.feed_url)
        end
      end
      multi.add(easy)
    end

    # Determines the etag from the request headers.
    #
    # === Parameters
    # [header<String>] Raw request header returned from the request
    # === Returns
    # A string of the etag, or nil if it cannot be found in the headers.
    def self.etag_from_header(header)
      header =~ /.*ETag:\s(.*)\r/
      $1
    end

    # Determines the last modified date from the request headers.
    #
    # === Parameters
    # [header<String>] Raw request header returned from the request
    # === Returns
    # A Time object of the last modified date, or nil if it cannot be found in the headers.
    def self.last_modified_from_header(header)
      header =~ /.*Last-Modified:\s(.*)\r/
      Time.parse_safely($1) if $1
    end

    class << self
      private

      def on_parser_failure(url)
        Proc.new { |message| raise "Error while parsing [#{url}] #{message}" }
      end

      def call_on_failure(c, error, on_failure)
        if on_failure
          if on_failure.arity == 4
            warn 'on_failure proc with deprecated arity 4 should include a fifth parameter containing the error'
            on_failure.call(c.url, c.response_code, c.header_str, c.body_str)
          elsif on_failure.arity == 2
            on_failure.call(c, error)
          else
            warn "on_failure proc has invalid arity #{on_failure.arity} instead of 2, ignoring it"
          end
        end
      end
    end
  end
end
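To tie the option hashes above together, a hedged sketch of fetching several feeds with callbacks (the URLs are placeholders; on_success receives the URL and the parsed feed, or nil on a 304, while an arity-2 on_failure receives the Curl::Easy handle and the error):

    require 'feedjira'

    urls = ['http://xkcd.com/atom.xml', 'http://example.com/feed.rss']

    responses = Feedjira::Feed.fetch_and_parse(urls,
      if_modified_since: Time.now - 3600,
      on_success: ->(url, feed) { puts "#{url}: #{feed.entries.size} entries" if feed },
      on_failure: ->(curl, err) { warn "#{curl.url} failed: #{err}" }
    )
    # responses maps each URL to a Feed object, or to the HTTP response code on failure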