codders-feedzirra 0.2.0.rc2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.gitignore +12 -0
  2. data/.rspec +1 -0
  3. data/.travis.yml +9 -0
  4. data/Gemfile +10 -0
  5. data/Guardfile +6 -0
  6. data/HISTORY.md +25 -0
  7. data/README.md +179 -0
  8. data/Rakefile +6 -0
  9. data/feedzirra.gemspec +28 -0
  10. data/lib/feedzirra.rb +17 -0
  11. data/lib/feedzirra/core_ext.rb +3 -0
  12. data/lib/feedzirra/core_ext/date.rb +19 -0
  13. data/lib/feedzirra/core_ext/string.rb +9 -0
  14. data/lib/feedzirra/core_ext/time.rb +29 -0
  15. data/lib/feedzirra/feed.rb +382 -0
  16. data/lib/feedzirra/feed_entry_utilities.rb +65 -0
  17. data/lib/feedzirra/feed_utilities.rb +72 -0
  18. data/lib/feedzirra/parser.rb +20 -0
  19. data/lib/feedzirra/parser/atom.rb +29 -0
  20. data/lib/feedzirra/parser/atom_entry.rb +30 -0
  21. data/lib/feedzirra/parser/atom_feed_burner.rb +21 -0
  22. data/lib/feedzirra/parser/atom_feed_burner_entry.rb +31 -0
  23. data/lib/feedzirra/parser/google_docs_atom.rb +28 -0
  24. data/lib/feedzirra/parser/google_docs_atom_entry.rb +29 -0
  25. data/lib/feedzirra/parser/itunes_rss.rb +50 -0
  26. data/lib/feedzirra/parser/itunes_rss_item.rb +32 -0
  27. data/lib/feedzirra/parser/itunes_rss_owner.rb +12 -0
  28. data/lib/feedzirra/parser/rss.rb +22 -0
  29. data/lib/feedzirra/parser/rss_entry.rb +34 -0
  30. data/lib/feedzirra/parser/rss_feed_burner.rb +22 -0
  31. data/lib/feedzirra/parser/rss_feed_burner_entry.rb +40 -0
  32. data/lib/feedzirra/version.rb +3 -0
  33. data/spec/benchmarks/feed_benchmarks.rb +98 -0
  34. data/spec/benchmarks/feedzirra_benchmarks.rb +40 -0
  35. data/spec/benchmarks/fetching_benchmarks.rb +28 -0
  36. data/spec/benchmarks/parsing_benchmark.rb +30 -0
  37. data/spec/benchmarks/updating_benchmarks.rb +33 -0
  38. data/spec/feedzirra/feed_entry_utilities_spec.rb +52 -0
  39. data/spec/feedzirra/feed_spec.rb +599 -0
  40. data/spec/feedzirra/feed_utilities_spec.rb +150 -0
  41. data/spec/feedzirra/parser/atom_entry_spec.rb +86 -0
  42. data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +47 -0
  43. data/spec/feedzirra/parser/atom_feed_burner_spec.rb +47 -0
  44. data/spec/feedzirra/parser/atom_spec.rb +51 -0
  45. data/spec/feedzirra/parser/google_docs_atom_entry_spec.rb +22 -0
  46. data/spec/feedzirra/parser/google_docs_atom_spec.rb +31 -0
  47. data/spec/feedzirra/parser/itunes_rss_item_spec.rb +48 -0
  48. data/spec/feedzirra/parser/itunes_rss_owner_spec.rb +18 -0
  49. data/spec/feedzirra/parser/itunes_rss_spec.rb +54 -0
  50. data/spec/feedzirra/parser/rss_entry_spec.rb +85 -0
  51. data/spec/feedzirra/parser/rss_feed_burner_entry_spec.rb +85 -0
  52. data/spec/feedzirra/parser/rss_feed_burner_spec.rb +52 -0
  53. data/spec/feedzirra/parser/rss_spec.rb +49 -0
  54. data/spec/sample_feeds/AmazonWebServicesBlog.xml +796 -0
  55. data/spec/sample_feeds/AmazonWebServicesBlogFirstEntryContent.xml +63 -0
  56. data/spec/sample_feeds/FeedBurnerUrlNoAlternate.xml +27 -0
  57. data/spec/sample_feeds/GoogleDocsList.xml +187 -0
  58. data/spec/sample_feeds/HREFConsideredHarmful.xml +313 -0
  59. data/spec/sample_feeds/HREFConsideredHarmfulFirstEntry.xml +22 -0
  60. data/spec/sample_feeds/PaulDixExplainsNothing.xml +174 -0
  61. data/spec/sample_feeds/PaulDixExplainsNothingAlternate.xml +174 -0
  62. data/spec/sample_feeds/PaulDixExplainsNothingFirstEntryContent.xml +19 -0
  63. data/spec/sample_feeds/PaulDixExplainsNothingWFW.xml +174 -0
  64. data/spec/sample_feeds/TechCrunch.xml +1514 -0
  65. data/spec/sample_feeds/TechCrunchFirstEntry.xml +9 -0
  66. data/spec/sample_feeds/TechCrunchFirstEntryDescription.xml +3 -0
  67. data/spec/sample_feeds/TenderLovemaking.xml +515 -0
  68. data/spec/sample_feeds/TenderLovemakingFirstEntry.xml +66 -0
  69. data/spec/sample_feeds/TrotterCashionHome.xml +610 -0
  70. data/spec/sample_feeds/atom_with_link_tag_for_url_unmarked.xml +30 -0
  71. data/spec/sample_feeds/itunes.xml +60 -0
  72. data/spec/sample_feeds/run_against_sample.rb +20 -0
  73. data/spec/sample_feeds/top5kfeeds.dat +2170 -0
  74. data/spec/sample_feeds/trouble_feeds.txt +16 -0
  75. data/spec/spec_helper.rb +75 -0
  76. metadata +203 -0
@@ -0,0 +1,12 @@
1
+ .DS_Store
2
+ .rvmrc
3
+ TODO
4
+ Gemfile.lock
5
+ rdoc/
6
+ doc/
7
+ pkg/
8
+ coverage/
9
+ .rbx
10
+ .bundle
11
+ *.swp
12
+ *.swo
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,9 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.8.7
4
+ - 1.9.2
5
+ - 1.9.3
6
+ - rbx-18mode
7
+ - rbx-19mode
8
+ notifications:
9
+ irc: "irc.freenode.org#feedzirra"
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ source :rubygems
2
+
3
+ gemspec
4
+
5
+ group :development, :test do
6
+ gem 'rake'
7
+ gem 'guard-rspec'
8
+ gem 'growl', :require => false
9
+ gem 'simplecov', :require => false, :platforms => :mri_19
10
+ end
@@ -0,0 +1,6 @@
1
+ guard 'rspec', :rvm => ['1.9.3@feedzirra', 'rbx-head@feedzirra'] do
2
+ watch(%r{^spec/.+_spec\.rb$})
3
+ watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
4
+ watch('spec/spec_helper.rb') { "spec" }
5
+ end
6
+
@@ -0,0 +1,25 @@
1
+ # HEAD
2
+ * Remove gorillib dependency [[#113](https://github.com/pauldix/feedzirra/pull/113)].
3
+
4
+ # 0.2.0.rc2
5
+ * Bump sax-machine to `v0.2.0.rc1`, fixes encoding issues [[#76](https://github.com/pauldix/feedzirra/issues/76)].
6
+
7
+ # 0.2.0.rc1
8
+ * Remove ActiveSupport dependency
9
+ * No longer tethered to any version of Rails!
10
+ * Update curb (v0.8.0) and rspec (v2.10.0)
11
+ * Revert [3008ceb](https://github.com/pauldix/feedzirra/commit/3008ceb338df1f4c37a211d0aab8a6ad4f584dbc)
12
+ * Add Travis-CI integration
13
+ * General repository and gem maintenance
14
+
15
+ # 0.1.3
16
+ * ?
17
+
18
+ # 0.1.2
19
+ * ?
20
+
21
+ # 0.1.1
22
+ * make FeedEntries enumerable (patch by Daniel Gregoire)
23
+
24
+ # 0.1.0
25
+ * lower builder requirement to make it rails-3 friendly
@@ -0,0 +1,179 @@
1
+ # Feedzirra [![Build Status](https://secure.travis-ci.org/pauldix/feedzirra.png)](http://travis-ci.org/pauldix/feedzirra)
2
+
3
+ I'd like feedback on the api and any bugs encountered on feeds in the wild. I've set up a [google group here](http://groups.google.com/group/feedzirra).
4
+
5
+ ## Description
6
+
7
+ Feedzirra is a feed library that is designed to get and update many feeds as quickly as possible. This includes using libcurl-multi through the [curb](https://github.com/taf2/curb) gem for faster http gets, and libxml through [nokogiri](https://github.com/tenderlove/nokogiri) and [sax-machine](https://github.com/pauldix/sax-machine) for faster parsing.
8
+
9
+ Once you have fetched feeds using Feedzirra, they can be updated using the feed objects. Feedzirra automatically inserts etag and last-modified information from the http response headers to lower bandwidth usage, eliminate unnecessary parsing, and make things speedier in general.
10
+
11
+ Another feature present in Feedzirra is the ability to create callback functions that get called "on success" and "on failure" when getting a feed. This makes it easy to do things like log errors or update data stores.
12
+
13
+ The fetching and parsing logic have been decoupled so that either of them can be used in isolation if you'd prefer not to use everything that Feedzirra offers. However, the code examples below use helper methods in the Feed class that put everything together to make things as simple as possible.
14
+
15
+ The final feature of Feedzirra is the ability to define custom parsing classes. In truth, Feedzirra could be used to parse much more than feeds. Microformats, page scraping, and almost anything else are fair game.
16
+
17
+ ## Speedup date parsing
18
+
19
+ In MRI the date parsing code is written in ruby and is optimized for readability over speed, to speed up this part you can install the [home_run](https://github.com/jeremyevans/home_run) gem to replace it with an optimized C version.
20
+
21
+ ## Usage
22
+
23
+ [A gist of the following code](http://gist.github.com/57285)
24
+
25
+ require 'feedzirra'
26
+
27
+ # fetching a single feed
28
+ feed = Feedzirra::Feed.fetch_and_parse("http://feeds.feedburner.com/PaulDixExplainsNothing")
29
+
30
+ # feed and entries accessors
31
+ feed.title # => "Paul Dix Explains Nothing"
32
+ feed.url # => "http://www.pauldix.net"
33
+ feed.feed_url # => "http://feeds.feedburner.com/PaulDixExplainsNothing"
34
+ feed.etag # => "GunxqnEP4NeYhrqq9TyVKTuDnh0"
35
+ feed.last_modified # => Sat Jan 31 17:58:16 -0500 2009 # it's a Time object
36
+
37
+ entry = feed.entries.first
38
+ entry.title # => "Ruby Http Client Library Performance"
39
+ entry.url # => "http://www.pauldix.net/2009/01/ruby-http-client-library-performance.html"
40
+ entry.author # => "Paul Dix"
41
+ entry.summary # => "..."
42
+ entry.content # => "..."
43
+ entry.published # => Thu Jan 29 17:00:19 UTC 2009 # it's a Time object
44
+ entry.categories # => ["...", "..."]
45
+
46
+ # sanitizing an entry's content
47
+ entry.title.sanitize # => returns the title with harmful stuff escaped
48
+ entry.author.sanitize # => returns the author with harmful stuff escaped
49
+ entry.content.sanitize # => returns the content with harmful stuff escaped
50
+ entry.content.sanitize! # => returns content with harmful stuff escaped and replaces original (also exists for author and title)
51
+ entry.sanitize! # => sanitizes the entry's title, author, and content in place (as in, it changes the value to clean versions)
52
+ feed.sanitize_entries! # => sanitizes all entries in place
53
+
54
+ # updating a single feed
55
+ updated_feed = Feedzirra::Feed.update(feed)
56
+
57
+ # an updated feed has the following extra accessors
58
+ updated_feed.updated? # returns true if any of the feed attributes have been modified. will return false if only new entries
59
+ updated_feed.new_entries # a collection of the entry objects that are newer than the latest in the feed before update
60
+
61
+ # fetching multiple feeds
62
+ feed_urls = ["http://feeds.feedburner.com/PaulDixExplainsNothing", "http://feeds.feedburner.com/trottercashion"]
63
+ feeds = Feedzirra::Feed.fetch_and_parse(feed_urls)
64
+
65
+ # feeds is now a hash with the feed_urls as keys and the parsed feed objects as values. If an error was thrown
66
+ # there will be a Fixnum of the http response code instead of a feed object
67
+
68
+ # updating multiple feeds. it expects a collection of feed objects
69
+ updated_feeds = Feedzirra::Feed.update(feeds.values)
70
+
71
+ # defining custom behavior on failure or success. note that a return status of 304 (not updated) will call the on_success handler
72
+ feed = Feedzirra::Feed.fetch_and_parse("http://feeds.feedburner.com/PaulDixExplainsNothing",
73
+ :on_success => lambda {|url, feed| puts feed.title },
74
+ :on_failure => lambda {|url, response_code, response_header, response_body| puts response_body })
75
+ # if a collection was passed into fetch_and_parse, the handlers will be called for each one
76
+
77
+ # the behavior for the handlers when using Feedzirra::Feed.update is slightly different. The feed passed into on_success will be
78
+ # the updated feed with the standard updated accessors. on failure it will be the original feed object passed into update
79
+
80
+ # fetching a feed via a proxy (optional)
81
+ feed = Feedzirra::Feed.fetch_and_parse("http://feeds.feedburner.com/PaulDixExplainsNothing", {:proxy_url => '10.0.0.1', :proxy_port => 3084})
82
+
83
+ ## Extending
84
+
85
+ ### Adding a feed parsing class
86
+
87
+ # Adds a new feed parsing class, this class will be used first
88
+ Feedzirra::Feed.add_feed_class MyFeedClass
89
+
90
+ ### Adding attributes to all feeds types / all entries types
91
+
92
+ # Add the generator attribute to all feed types
93
+ Feedzirra::Feed.add_common_feed_element('generator')
94
+ Feedzirra::Feed.fetch_and_parse("http://www.pauldix.net/atom.xml").generator # => 'TypePad'
95
+
96
+ # Add some GeoRss information
97
+ Feedzirra::Feed.add_common_feed_entry_element('geo:lat', :as => :lat)
98
+ Feedzirra::Feed.fetch_and_parse("http://www.earthpublisher.com/georss.php").entries.each do |e|
99
+ p "lat: #{e.lat}, long: #{e.long}"
100
+ end
101
+
102
+ ### Adding attributes to only one class
103
+
104
+ If you want to add attributes for only one class you simply have to declare them in the class
105
+
106
+ # Add some GeoRss information
107
+ require 'lib/feedzirra/parser/rss_entry'
108
+
109
+ class Feedzirra::Parser::RSSEntry
110
+ element 'geo:lat', :as => :lat
111
+ element 'geo:long', :as => :long
112
+ end
113
+
114
+ # Fetch a feed containing GeoRss info and print them
115
+ Feedzirra::Feed.fetch_and_parse("http://www.earthpublisher.com/georss.php").entries.each do |e|
116
+ p "lat: #{e.lat}, long: #{e.long}"
117
+ end
118
+
119
+ ## Benchmarks
120
+
121
+ One of the goals of Feedzirra is speed. This includes not only parsing, but fetching multiple feeds as quickly as possible. I ran a benchmark getting 20 feeds 10 times using Feedzirra, rFeedParser, and FeedNormalizer. For more details the [benchmark code can be found in the project in spec/benchmarks/feedzirra_benchmarks.rb](https://github.com/pauldix/feedzirra/blob/7fb5634c5c16e9c6ec971767b462c6518cd55f5d/spec/benchmarks/feedzirra_benchmarks.rb)
122
+
123
+ feedzirra 5.170000 1.290000 6.460000 ( 18.917796)
124
+ rfeedparser 104.260000 12.220000 116.480000 (244.799063)
125
+ feed-normalizer 66.250000 4.010000 70.260000 (191.589862)
126
+
127
+ The result of that benchmark is a bit sketchy because of the network variability. Running 10 times against the same 20 feeds was meant to smooth some of that out. However, there is also a [benchmark comparing parsing speed in spec/benchmarks/parsing_benchmark.rb](https://github.com/pauldix/feedzirra/blob/7fb5634c5c16e9c6ec971767b462c6518cd55f5d/spec/benchmarks/parsing_benchmark.rb) on an atom feed.
128
+
129
+ feedzirra 0.500000 0.030000 0.530000 ( 0.658744)
130
+ rfeedparser 8.400000 1.110000 9.510000 ( 11.839827)
131
+ feed-normalizer 5.980000 0.160000 6.140000 ( 7.576140)
132
+
133
+ There's also a [benchmark that shows the results of using Feedzirra to perform updates on feeds](https://github.com/pauldix/feedzirra/blob/45d64319544c61a4c9eb9f7f825c73b9f9030cb3/spec/benchmarks/updating_benchmarks.rb) you've already pulled in. I tested against 179 feeds. The first is the initial pull and the second is an update 65 seconds later. I'm not sure how many of them support etag and last-modified, so performance may be better or worse depending on what feeds you're requesting.
134
+
135
+ feedzirra fetch and parse 4.010000 0.710000 4.720000 ( 15.110101)
136
+ feedzirra update 0.660000 0.280000 0.940000 ( 5.152709)
137
+
138
+ ## TODO
139
+
140
+ This thing needs to hammer on many different feeds in the wild. I'm sure there will be bugs. I want to find them and crush them. I didn't bother using the test suite for feedparser. i wanted to start fresh.
141
+
142
+ Here are some more specific TODOs.
143
+ * Make a feedzirra-rails gem to integrate feedzirra seamlessly with Rails and ActiveRecord.
144
+ * Add support for authenticated feeds.
145
+ * Create a super sweet DSL for defining new parsers.
146
+ * Test against Ruby 1.9.1 and fix any bugs.
147
+ * I'm not keeping track of modified on entries. Should I add this?
148
+ * Clean up the fetching code inside feed.rb so it doesn't suck so hard.
149
+ * Make the feed_spec actually mock stuff out so it doesn't hit the net.
150
+ * Readdress how feeds determine if they can parse a document. Maybe I should use namespaces instead?
151
+
152
+ ## LICENSE
153
+
154
+ (The MIT License)
155
+
156
+ Copyright (c) 2009-2012:
157
+
158
+ - [Paul Dix](http://pauldix.net)
159
+ - [Julien Kirch](http://archiloque.net/)
160
+ - [Ezekiel Templin](http://zeke.templ.in/)
161
+
162
+ Permission is hereby granted, free of charge, to any person obtaining
163
+ a copy of this software and associated documentation files (the
164
+ 'Software'), to deal in the Software without restriction, including
165
+ without limitation the rights to use, copy, modify, merge, publish,
166
+ distribute, sublicense, and/or sell copies of the Software, and to
167
+ permit persons to whom the Software is furnished to do so, subject to
168
+ the following conditions:
169
+
170
+ The above copyright notice and this permission notice shall be
171
+ included in all copies or substantial portions of the Software.
172
+
173
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
174
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
175
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
176
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
177
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
178
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
179
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+ require 'rspec/core/rake_task'
4
+ RSpec::Core::RakeTask.new(:spec)
5
+ task :test => :spec
6
+ task :default => :test
@@ -0,0 +1,28 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/feedzirra/version', __FILE__)
3
+
4
+ Gem::Specification.new do |s|
5
+ s.name = 'codders-feedzirra'
6
+ s.version = Feedzirra::VERSION
7
+
8
+ s.authors = ['Paul Dix', 'Julien Kirch', "Ezekiel Templin"]
9
+ s.date = Date.today
10
+ s.email = 'feedzirra@googlegroups.com'
11
+ s.homepage = 'http://github.com/codders/feedzirra'
12
+
13
+ s.summary = 'A feed fetching and parsing library'
14
+ s.description = 'A feed fetching and parsing library that treats the internet like Godzilla treats Japan: it dominates and eats all.'
15
+
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
18
+ s.require_paths = ['lib']
19
+
20
+ s.platform = Gem::Platform::RUBY
21
+
22
+ s.add_dependency 'nokogiri', '~> 1.5.3'
23
+ s.add_dependency 'sax-machine', '~> 0.2.0.rc1'
24
+ s.add_dependency 'codders-curb', '~> 0.8.0'
25
+ s.add_dependency 'loofah', '~> 1.2.1'
26
+
27
+ s.add_development_dependency 'rspec', '~> 2.10.0'
28
+ end
@@ -0,0 +1,17 @@
1
+ require 'zlib'
2
+ require 'curb'
3
+ require 'sax-machine'
4
+ require 'loofah'
5
+ require 'uri'
6
+
7
+ require 'feedzirra/core_ext'
8
+ require 'feedzirra/version'
9
+
10
+ module Feedzirra
11
+ autoload :FeedEntryUtilities, 'feedzirra/feed_entry_utilities'
12
+ autoload :FeedUtilities, 'feedzirra/feed_utilities'
13
+ autoload :Feed, 'feedzirra/feed'
14
+ autoload :Parser, 'feedzirra/parser'
15
+
16
+ class NoParserAvailable < StandardError; end
17
+ end
@@ -0,0 +1,3 @@
1
+ require "feedzirra/core_ext/time"
2
+ require "feedzirra/core_ext/date"
3
+ require "feedzirra/core_ext/string"
@@ -0,0 +1,19 @@
1
+ # Date code pulled and adapted from:
2
+ # Ruby Cookbook by Lucas Carlson and Leonard Richardson
3
+ # Published by O'Reilly
4
+ # ISBN: 0-596-52369-6
5
+ class Date
6
+ def feed_utils_to_gm_time
7
+ feed_utils_to_time(new_offset, :gm)
8
+ end
9
+
10
+ def feed_utils_to_local_time
11
+ feed_utils_to_time(new_offset(DateTime.now.offset-offset), :local)
12
+ end
13
+
14
+ private
15
+ def feed_utils_to_time(dest, method)
16
+ Time.send(method, dest.year, dest.month, dest.day, dest.hour, dest.min,
17
+ dest.sec, dest.zone)
18
+ end
19
+ end
@@ -0,0 +1,9 @@
1
+ class String
2
+ def sanitize!
3
+ self.replace(sanitize)
4
+ end
5
+
6
+ def sanitize
7
+ Loofah.scrub_fragment(self, :prune).to_s
8
+ end
9
+ end
@@ -0,0 +1,29 @@
1
+ require "time"
2
+ require "date"
3
+
4
+ class Time
5
+ # Parse a time string and convert it to UTC without raising errors.
6
+ # Parses a flattened 14-digit time (YYYYmmddHHMMSS) as UTC.
7
+ #
8
+ # === Parameters
9
+ # [dt<String or Time>] Time definition to be parsed.
10
+ #
11
+ # === Returns
12
+ # A Time instance in UTC or nil if there were errors while parsing.
13
+ def self.parse_safely(dt)
14
+ if dt
15
+ case
16
+ when dt.is_a?(Time)
17
+ dt.utc
18
+ when dt.respond_to?(:empty?) && dt.empty?
19
+ nil
20
+ when dt.to_s =~ /\A\d{14}\z/
21
+ parse("#{dt.to_s}Z", true)
22
+ else
23
+ parse(dt.to_s, true).utc
24
+ end
25
+ end
26
+ rescue StandardError
27
+ nil
28
+ end unless method_defined?(:parse_safely)
29
+ end
@@ -0,0 +1,382 @@
1
+ module Feedzirra
2
+ class Feed
3
+ USER_AGENT = "feedzirra http://github.com/pauldix/feedzirra/tree/master"
4
+
5
+ # Takes a raw XML feed and attempts to parse it. If no parser is available a Feedzirra::NoParserAvailable exception is raised.
6
+ # You can pass a block to be called when there's an error during the parsing.
7
+ # === Parameters
8
+ # [xml<String>] The XML that you would like parsed.
9
+ # === Returns
10
+ # An instance of the determined feed type. By default a Feedzirra::Atom, Feedzirra::AtomFeedBurner, Feedzirra::RDF, or Feedzirra::RSS object.
11
+ # === Raises
12
+ # Feedzirra::NoParserAvailable : If no valid parser classes could be found for the feed.
13
+ def self.parse(xml, &block)
14
+ if parser = determine_feed_parser_for_xml(xml)
15
+ parser.parse(xml, block)
16
+ else
17
+ raise NoParserAvailable.new("No valid parser for XML.")
18
+ end
19
+ end
20
+
21
+ # Determines the correct parser class to use for parsing the feed.
22
+ #
23
+ # === Parameters
24
+ # [xml<String>] The XML that you would like determine the parser for.
25
+ # === Returns
26
+ # The class name of the parser that can handle the XML.
27
+ def self.determine_feed_parser_for_xml(xml)
28
+ start_of_doc = xml.slice(0, 2000)
29
+ feed_classes.detect {|klass| klass.able_to_parse?(start_of_doc)}
30
+ end
31
+
32
+ # Adds a new feed parsing class that will be used for parsing.
33
+ #
34
+ # === Parameters
35
+ # [klass<Constant>] The class/constant that you want to register.
36
+ # === Returns
37
+ # A updated array of feed parser class names.
38
+ def self.add_feed_class(klass)
39
+ feed_classes.unshift klass
40
+ end
41
+
42
+ # Provides a list of registered feed parsing classes.
43
+ #
44
+ # === Returns
45
+ # A array of class names.
46
+ def self.feed_classes
47
+ @feed_classes ||= [Feedzirra::Parser::RSSFeedBurner, Feedzirra::Parser::RSS, Feedzirra::Parser::GoogleDocsAtom, Feedzirra::Parser::AtomFeedBurner, Feedzirra::Parser::Atom, Feedzirra::Parser::ITunesRSS]
48
+ end
49
+
50
+ # Makes all registered feeds types look for the passed in element to parse.
51
+ # This is actually just a call to element (a SAXMachine call) in the class.
52
+ #
53
+ # === Parameters
54
+ # [element_tag<String>] The element tag
55
+ # [options<Hash>] Valid keys are same as with SAXMachine
56
+ def self.add_common_feed_element(element_tag, options = {})
57
+ feed_classes.each do |k|
58
+ k.element element_tag, options
59
+ end
60
+ end
61
+
62
+ # Makes all registered feeds types look for the passed in elements to parse.
63
+ # This is actually just a call to elements (a SAXMachine call) in the class.
64
+ #
65
+ # === Parameters
66
+ # [element_tag<String>] The element tag
67
+ # [options<Hash>] Valid keys are same as with SAXMachine
68
+ def self.add_common_feed_elements(element_tag, options = {})
69
+ feed_classes.each do |k|
70
+ k.elements element_tag, options
71
+ end
72
+ end
73
+
74
+ # Makes all registered entry types look for the passed in element to parse.
75
+ # This is actually just a call to element (a SAXMachine call) in the class.
76
+ #
77
+ # === Parameters
78
+ # [element_tag<String>]
79
+ # [options<Hash>] Valid keys are same as with SAXMachine
80
+ def self.add_common_feed_entry_element(element_tag, options = {})
81
+ call_on_each_feed_entry :element, element_tag, options
82
+ end
83
+
84
+ # Makes all registered entry types look for the passed in elements to parse.
85
+ # This is actually just a call to element (a SAXMachine call) in the class.
86
+ #
87
+ # === Parameters
88
+ # [element_tag<String>]
89
+ # [options<Hash>] Valid keys are same as with SAXMachine
90
+ def self.add_common_feed_entry_elements(element_tag, options = {})
91
+ call_on_each_feed_entry :elements, element_tag, options
92
+ end
93
+
94
+ # Call a method on all feed entries classes.
95
+ #
96
+ # === Parameters
97
+ # [method<Symbol>] The method name
98
+ # [parameters<Array>] The method parameters
99
+ def self.call_on_each_feed_entry(method, *parameters)
100
+ feed_classes.each do |k|
101
+ # iterate on the collections defined in the sax collection
102
+ k.sax_config.collection_elements.each_value do |vl|
103
+ # vl is a list of CollectionConfig mapped to an attribute name
104
+ # we'll look for the one set as 'entries' and add the new element
105
+ vl.find_all{|v| (v.accessor == 'entries') && (v.data_class.class == Class)}.each do |v|
106
+ v.data_class.send(method, *parameters)
107
+ end
108
+ end
109
+ end
110
+ end
111
+
112
+ # Setup curl from options.
113
+ # Possible parameters:
114
+ # * :user_agent - overrides the default user agent.
115
+ # * :compress - any value to enable compression
116
+ # * :http_authentication - array containing http authentication parameters
117
+ # * :proxy_url - proxy url
118
+ # * :proxy_port - proxy port
119
+ # * :max_redirects - max number of redirections
120
+ # * :timeout - timeout
121
+ def self.setup_easy curl, options
122
+ curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
123
+ curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
124
+
125
+ curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
126
+ curl.proxy_url = options[:proxy_url] if options.has_key?(:proxy_url)
127
+ curl.proxy_port = options[:proxy_port] if options.has_key?(:proxy_port)
128
+ curl.max_redirects = options[:max_redirects] if options[:max_redirects]
129
+ curl.timeout = options[:timeout] if options[:timeout]
130
+ curl.ssl_verify_host = options[:ssl_verify_host] if options.has_key?(:ssl_verify_host)
131
+
132
+ curl.follow_location = true
133
+ end
134
+
135
+ # Fetches and returns the raw XML for each URL provided.
136
+ #
137
+ # === Parameters
138
+ # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
139
+ # [options<Hash>] Valid keys for this argument are as follows:
140
+ # :if_modified_since - Time object representing when the feed was last updated.
141
+ # :if_none_match - String that's normally an etag for the request that was stored previously.
142
+ # :on_success - Block that gets executed after a successful request.
143
+ # :on_failure - Block that gets executed after a failed request.
144
+ # * all parameters defined in setup_easy
145
+ # === Returns
146
+ # A String of XML if a single URL is passed.
147
+ #
148
+ # A Hash if multiple URL's are passed. The key will be the URL, and the value the XML.
149
+ def self.fetch_raw(urls, options = {})
150
+ url_queue = [*urls]
151
+ multi = Curl::Multi.new
152
+ responses = {}
153
+ url_queue.each do |url|
154
+ easy = Curl::Easy.new(url) do |curl|
155
+ setup_easy curl, options
156
+
157
+ curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
158
+ curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
159
+
160
+ curl.on_success do |c|
161
+ responses[url] = decode_content(c)
162
+ end
163
+ curl.on_failure do |c, err|
164
+ responses[url] = c.response_code
165
+ end
166
+ end
167
+ multi.add(easy)
168
+ end
169
+
170
+ multi.perform
171
+ urls.is_a?(String) ? responses.values.first : responses
172
+ end
173
+
174
+ # Fetches and returns the parsed XML for each URL provided.
175
+ #
176
+ # === Parameters
177
+ # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
178
+ # [options<Hash>] Valid keys for this argument are as follows:
179
+ # * :user_agent - String that overrides the default user agent.
180
+ # * :if_modified_since - Time object representing when the feed was last updated.
181
+ # * :if_none_match - String, an etag for the request that was stored previously.
182
+ # * :on_success - Block that gets executed after a successful request.
183
+ # * :on_failure - Block that gets executed after a failed request.
184
+ # === Returns
185
+ # A Feed object if a single URL is passed.
186
+ #
187
+ # A Hash if multiple URL's are passed. The key will be the URL, and the value the Feed object.
188
+ def self.fetch_and_parse(urls, options = {})
189
+ url_queue = [*urls]
190
+ multi = Curl::Multi.new
191
+ responses = {}
192
+
193
+ # I broke these down so I would only try to do 30 simultaneously because
194
+ # I was getting weird errors when doing a lot. As one finishes it pops another off the queue.
195
+ url_queue.slice!(0, 30).each do |url|
196
+ add_url_to_multi(multi, url, url_queue, responses, options)
197
+ end
198
+
199
+ multi.perform
200
+ return urls.is_a?(String) ? responses.values.first : responses
201
+ end
202
+
203
+ # Decodes the XML document if it was compressed.
204
+ #
205
+ # === Parameters
206
+ # [curl_request<Curl::Easy>] The Curl::Easy response object from the request.
207
+ # === Returns
208
+ # A decoded string of XML.
209
+ def self.decode_content(c)
210
+ if c.header_str.match(/Content-Encoding: gzip/i)
211
+ begin
212
+ gz = Zlib::GzipReader.new(StringIO.new(c.body_str))
213
+ xml = gz.read
214
+ gz.close
215
+ rescue Zlib::GzipFile::Error
216
+ # Maybe this is not gzipped?
217
+ xml = c.body_str
218
+ end
219
+ elsif c.header_str.match(/Content-Encoding: deflate/i)
220
+ xml = Zlib::Inflate.inflate(c.body_str)
221
+ else
222
+ xml = c.body_str
223
+ end
224
+
225
+ xml
226
+ end
227
+
228
+ # Updates each feed for each Feed object provided.
229
+ #
230
+ # === Parameters
231
+ # [feeds<Feed> or <Array>] A single feed object, or an array of feed objects.
232
+ # [options<Hash>] Valid keys for this argument are as follows:
233
+ # * :on_success - Block that gets executed after a successful request.
234
+ # * :on_failure - Block that gets executed after a failed request.
235
+ # * all parameters defined in setup_easy
236
+ # === Returns
237
+ # A updated Feed object if a single URL is passed.
238
+ #
239
+ # A Hash if multiple Feeds are passed. The key will be the URL, and the value the updated Feed object.
240
+ def self.update(feeds, options = {})
241
+ feed_queue = [*feeds]
242
+ multi = Curl::Multi.new
243
+ responses = {}
244
+
245
+ feed_queue.slice!(0, 30).each do |feed|
246
+ add_feed_to_multi(multi, feed, feed_queue, responses, options)
247
+ end
248
+
249
+ multi.perform
250
+ responses.is_a?(Array)? responses.values : responses.values.first
251
+ end
252
+
253
+ # An abstraction for adding a feed by URL to the passed Curb::multi stack.
254
+ #
255
+ # === Parameters
256
+ # [multi<Curl::Multi>] The Curl::Multi object that the request should be added too.
257
+ # [url<String>] The URL of the feed that you would like to be fetched.
258
+ # [url_queue<Array>] An array of URLs that are queued for request.
259
+ # [responses<Hash>] Existing responses that you want the response from the request added to.
260
+ # [feeds<String> or <Array>] A single feed object, or an array of feed objects.
261
+ # [options<Hash>] Valid keys for this argument are as follows:
262
+ # * :on_success - Block that gets executed after a successful request.
263
+ # * :on_failure - Block that gets executed after a failed request.
264
+ # * all parameters defined in setup_easy
265
+ # === Returns
266
+ # The updated Curl::Multi object with the request details added to it's stack.
267
+ def self.add_url_to_multi(multi, url, url_queue, responses, options)
268
+ easy = Curl::Easy.new(url) do |curl|
269
+ setup_easy curl, options
270
+ curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
271
+ curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
272
+
273
+ curl.on_success do |c|
274
+ add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
275
+ xml = decode_content(c)
276
+ klass = determine_feed_parser_for_xml(xml)
277
+
278
+ if klass
279
+ begin
280
+ feed = klass.parse(xml, Proc.new{|message| warn "Error while parsing [#{url}] #{message}" })
281
+ feed.feed_url = c.last_effective_url
282
+ feed.etag = etag_from_header(c.header_str)
283
+ feed.last_modified = last_modified_from_header(c.header_str)
284
+ responses[url] = feed
285
+ options[:on_success].call(url, feed) if options.has_key?(:on_success)
286
+ rescue Exception => e
287
+ options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
288
+ end
289
+ else
290
+ # puts "Error determining parser for #{url} - #{c.last_effective_url}"
291
+ # raise NoParserAvailable.new("no valid parser for content.") (this would unfortunately fail the whole 'multi', so it's not really usable)
292
+ options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
293
+ end
294
+ end
295
+
296
+ curl.on_failure do |c, err|
297
+ add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
298
+ responses[url] = c.response_code
299
+ if c.response_code == 304 # it's not modified. this isn't an error condition
300
+ options[:on_success].call(url, nil) if options.has_key?(:on_success)
301
+ else
302
+ options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
303
+ end
304
+ end
305
+ end
306
+ multi.add(easy)
307
+ end
308
+
309
+ # An abstraction for adding a feed by a Feed object to the passed Curb::multi stack.
310
+ #
311
+ # === Parameters
312
+ # [multi<Curl::Multi>] The Curl::Multi object that the request should be added too.
313
+ # [feed<Feed>] A feed object that you would like to be fetched.
314
+ # [url_queue<Array>] An array of feed objects that are queued for request.
315
+ # [responses<Hash>] Existing responses that you want the response from the request added to.
316
+ # [feeds<String>] or <Array> A single feed object, or an array of feed objects.
317
+ # [options<Hash>] Valid keys for this argument as as followed:
318
+ # * :on_success - Block that gets executed after a successful request.
319
+ # * :on_failure - Block that gets executed after a failed request.
320
+ # * all parameters defined in setup_easy
321
+ # === Returns
322
+ # The updated Curl::Multi object with the request details added to it's stack.
323
+ def self.add_feed_to_multi(multi, feed, feed_queue, responses, options)
324
+ easy = Curl::Easy.new(feed.feed_url) do |curl|
325
+ setup_easy curl, options
326
+ curl.headers["If-Modified-Since"] = feed.last_modified.httpdate if feed.last_modified
327
+ curl.headers["If-Modified-Since"] = options[:if_modified_since] if options[:if_modified_since] && (!feed.last_modified || (Time.parse(options[:if_modified_since].to_s) > feed.last_modified))
328
+ curl.headers["If-None-Match"] = feed.etag if feed.etag
329
+
330
+ curl.on_success do |c|
331
+ begin
332
+ add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
333
+ updated_feed = Feed.parse(c.body_str){ |message| warn "Error while parsing [#{feed.feed_url}] #{message}" }
334
+ updated_feed.feed_url = c.last_effective_url
335
+ updated_feed.etag = etag_from_header(c.header_str)
336
+ updated_feed.last_modified = last_modified_from_header(c.header_str)
337
+ feed.update_from_feed(updated_feed)
338
+ responses[feed.feed_url] = feed
339
+ options[:on_success].call(feed) if options.has_key?(:on_success)
340
+ rescue Exception => e
341
+ options[:on_failure].call(feed, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
342
+ end
343
+ end
344
+
345
+ curl.on_failure do |c, err|
346
+ add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
347
+ response_code = c.response_code
348
+ if response_code == 304 # it's not modified. this isn't an error condition
349
+ responses[feed.feed_url] = feed
350
+ options[:on_success].call(feed) if options.has_key?(:on_success)
351
+ else
352
+ responses[feed.url] = c.response_code
353
+ options[:on_failure].call(feed, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
354
+ end
355
+ end
356
+ end
357
+ multi.add(easy)
358
+ end
359
+
360
+ # Determines the etag from the request headers.
361
+ #
362
+ # === Parameters
363
+ # [header<String>] Raw request header returned from the request
364
+ # === Returns
365
+ # A string of the etag or nil if it cannot be found in the headers.
366
+ def self.etag_from_header(header)
367
+ header =~ /.*ETag:\s(.*)\r/
368
+ $1
369
+ end
370
+
371
+ # Determines the last modified date from the request headers.
372
+ #
373
+ # === Parameters
374
+ # [header<String>] Raw request header returned from the request
375
+ # === Returns
376
+ # A Time object of the last modified date or nil if it cannot be found in the headers.
377
+ def self.last_modified_from_header(header)
378
+ header =~ /.*Last-Modified:\s(.*)\r/
379
+ Time.parse($1) if $1
380
+ end
381
+ end
382
+ end