feedzirra 0.0.24 → 0.0.30

Files changed (38)
  1. data/.rspec +1 -0
  2. data/README.rdoc +207 -0
  3. data/Rakefile +19 -24
  4. data/lib/feedzirra.rb +7 -28
  5. data/lib/feedzirra/core_ext.rb +3 -0
  6. data/lib/{core_ext → feedzirra/core_ext}/date.rb +2 -4
  7. data/lib/{core_ext → feedzirra/core_ext}/string.rb +0 -0
  8. data/lib/feedzirra/feed.rb +99 -41
  9. data/lib/feedzirra/feed_entry_utilities.rb +12 -11
  10. data/lib/feedzirra/parser.rb +15 -0
  11. data/lib/feedzirra/parser/atom.rb +7 -13
  12. data/lib/feedzirra/parser/atom_entry.rb +4 -14
  13. data/lib/feedzirra/parser/atom_feed_burner.rb +4 -10
  14. data/lib/feedzirra/parser/atom_feed_burner_entry.rb +8 -13
  15. data/lib/feedzirra/parser/itunes_rss.rb +4 -4
  16. data/lib/feedzirra/parser/itunes_rss_item.rb +1 -1
  17. data/lib/feedzirra/parser/rss.rb +4 -10
  18. data/lib/feedzirra/parser/rss_entry.rb +2 -12
  19. data/lib/feedzirra/version.rb +3 -0
  20. data/spec/benchmarks/feed_benchmarks.rb +98 -0
  21. data/spec/benchmarks/feedzirra_benchmarks.rb +40 -0
  22. data/spec/benchmarks/fetching_benchmarks.rb +28 -0
  23. data/spec/benchmarks/parsing_benchmark.rb +30 -0
  24. data/spec/benchmarks/updating_benchmarks.rb +33 -0
  25. data/spec/feedzirra/feed_entry_utilities_spec.rb +1 -1
  26. data/spec/feedzirra/feed_spec.rb +38 -5
  27. data/spec/feedzirra/feed_utilities_spec.rb +7 -4
  28. data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +5 -0
  29. data/spec/feedzirra/parser/atom_feed_burner_spec.rb +5 -1
  30. data/spec/feedzirra/parser/atom_spec.rb +5 -1
  31. data/spec/feedzirra/parser/itunes_rss_item_spec.rb +1 -1
  32. data/spec/feedzirra/parser/rss_entry_spec.rb +2 -1
  33. data/spec/feedzirra/parser/rss_spec.rb +5 -1
  34. data/spec/sample_feeds/run_against_sample.rb +20 -0
  35. data/spec/spec_helper.rb +10 -2
  36. metadata +141 -59
  37. data/README.textile +0 -208
  38. data/spec/spec.opts +0 -2
data/lib/feedzirra/feed_entry_utilities.rb
@@ -1,34 +1,35 @@
 module Feedzirra
   module FeedEntryUtilities
+
     def published
-      @published || @updated
+      @published ||= @updated
     end

     def parse_datetime(string)
       begin
         DateTime.parse(string).feed_utils_to_gm_time
       rescue
-        puts "DATE CAN'T BE PARSED: #{string}"
+        puts "DATE CAN'T BE PARSED: [#{string}]"
         nil
       end
     end

     ##
     # Returns the id of the entry or its url if not id is present, as some formats don't support it
-    def id
-      @entry_id || @url
+    def id
+      @entry_id ||= @url
     end
-
+
     ##
-    # Writter for published. By default, we keep the "oldest" publish time found.
-    def published=(val)
+    # Writer for published. By default, we keep the "oldest" publish time found.
+    def published=(val)
       parsed = parse_datetime(val)
-      @published = parsed if !@published || parsed < @published
+      @published = parsed if !@published || parsed < @published
     end
-
+
     ##
-    # Writter for udapted. By default, we keep the most recenet update time found.
-    def updated=(val)
+    # Writer for updated. By default, we keep the most recent update time found.
+    def updated=(val)
       parsed = parse_datetime(val)
       @updated = parsed if !@updated || parsed > @updated
     end
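A quick note on the writer semantics touched above: `published=` keeps the oldest value it has seen and `updated=` keeps the newest. The following usage sketch is illustrative only; the entry object and timestamps are assumptions, not taken from this diff.

# Illustrative sketch, not part of the gem's changes: the writers above keep
# the oldest publish time and the most recent update time across assignments.
require 'feedzirra'

entry = Feedzirra::Parser::AtomEntry.new

entry.published = "2009-01-20T09:00:00Z"
entry.published = "2009-01-10T09:00:00Z"   # earlier time wins for published
entry.published                            # => 2009-01-10 09:00:00 UTC

entry.updated = "2009-01-10T09:00:00Z"
entry.updated = "2009-01-20T09:00:00Z"     # later time wins for updated
entry.updated                              # => 2009-01-20 09:00:00 UTC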
data/lib/feedzirra/parser.rb
@@ -0,0 +1,15 @@
+module Feedzirra
+  module Parser
+    autoload :RSS, 'feedzirra/parser/rss'
+    autoload :RSSEntry, 'feedzirra/parser/rss_entry'
+
+    autoload :ITunesRSS, 'feedzirra/parser/itunes_rss'
+    autoload :ITunesRSSItem, 'feedzirra/parser/itunes_rss_item'
+    autoload :ITunesRSSOwner, 'feedzirra/parser/itunes_rss_owner'
+
+    autoload :Atom, 'feedzirra/parser/atom'
+    autoload :AtomEntry, 'feedzirra/parser/atom_entry'
+    autoload :AtomFeedBurner, 'feedzirra/parser/atom_feed_burner'
+    autoload :AtomFeedBurnerEntry, 'feedzirra/parser/atom_feed_burner_entry'
+  end
+end
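The new Feedzirra::Parser module only registers autoloads, so each parser file is required lazily the first time its constant is referenced. A minimal sketch of the effect (assumed usage, not part of the diff):

# Assumed usage sketch: referencing a constant under Feedzirra::Parser
# triggers the corresponding require registered in the autoload table above.
require 'feedzirra'

Feedzirra::Parser::Atom      # first reference loads feedzirra/parser/atom
Feedzirra::Parser::RSSEntry  # first reference loads feedzirra/parser/rss_entry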
data/lib/feedzirra/parser/atom.rb
@@ -1,35 +1,29 @@
 module Feedzirra
-
+
   module Parser
-    # == Summary
     # Parser for dealing with Atom feeds.
-    #
-    # == Attributes
-    # * title
-    # * feed_url
-    # * url
-    # * entries
     class Atom
       include SAXMachine
       include FeedUtilities
       element :title
+      element :subtitle, :as => :description
       element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
       element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
       elements :link, :as => :links, :value => :href
       elements :entry, :as => :entries, :class => AtomEntry

       def self.able_to_parse?(xml) #:nodoc:
-        xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})/
+        /\<feed[^\>]+xmlns=[\"|\'](http:\/\/www\.w3\.org\/2005\/Atom|http:\/\/purl\.org\/atom\/ns\#)[\"|\'][^\>]*\>/ =~ xml
       end
-
+
       def url
         @url || links.last
       end
-
+
       def feed_url
-        @feed_url || links.first
+        @feed_url ||= links.first
       end
     end
   end
-
+
 end
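The able_to_parse? change above swaps a loose substring test for a regex that requires a <feed> root element declaring one of the Atom namespaces. A rough illustration of the new behavior; the sample XML strings are made up for this note:

# Illustrative only: the new detection keys off the <feed> root and its Atom
# xmlns instead of the word "Atom" appearing anywhere in the document.
require 'feedzirra'

atom_xml = %Q{<feed xmlns="http://www.w3.org/2005/Atom"><title>blog</title></feed>}
rss_xml  = %Q{<rss version="2.0"><channel><title>Atom news</title></channel></rss>}

Feedzirra::Parser::Atom.able_to_parse?(atom_xml) # => 0 (match position, truthy)
Feedzirra::Parser::Atom.able_to_parse?(rss_xml)  # => nil (no longer misdetected)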
data/lib/feedzirra/parser/atom_entry.rb
@@ -1,17 +1,7 @@
 module Feedzirra
-
+
   module Parser
-    # == Summary
     # Parser for dealing with Atom feed entries.
-    #
-    # == Attributes
-    # * title
-    # * url
-    # * author
-    # * content
-    # * summary
-    # * published
-    # * categories
     class AtomEntry
       include SAXMachine
       include FeedEntryUtilities
@@ -28,12 +18,12 @@ module Feedzirra
       element :modified, :as => :updated
       elements :category, :as => :categories, :value => :term
       elements :link, :as => :links, :value => :href
-
+
       def url
-        @url || links.first
+        @url ||= links.first
       end
     end

   end
-
+
 end
data/lib/feedzirra/parser/atom_feed_burner.rb
@@ -1,27 +1,21 @@
 module Feedzirra
-
+
   module Parser
-    # == Summary
     # Parser for dealing with Feedburner Atom feeds.
-    #
-    # == Attributes
-    # * title
-    # * feed_url
-    # * url
-    # * entries
     class AtomFeedBurner
       include SAXMachine
       include FeedUtilities
       element :title
+      element :subtitle, :as => :description
       element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
       element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
       elements :entry, :as => :entries, :class => AtomFeedBurnerEntry

       def self.able_to_parse?(xml) #:nodoc:
-        (xml =~ /Atom/ && xml =~ /feedburner/) || false
+        ((/Atom/ =~ xml) && (/feedburner/ =~ xml)) || false
       end
     end

   end
-
+
 end
data/lib/feedzirra/parser/atom_feed_burner_entry.rb
@@ -1,17 +1,7 @@
 module Feedzirra
-
+
   module Parser
-    # == Summary
     # Parser for dealing with Feedburner Atom feed entries.
-    #
-    # == Attributes
-    # * title
-    # * url
-    # * author
-    # * content
-    # * summary
-    # * published
-    # * categories
     class AtomFeedBurnerEntry
       include SAXMachine
       include FeedEntryUtilities
@@ -28,8 +18,13 @@ module Feedzirra
       element :updated
       element :modified, :as => :updated
       elements :category, :as => :categories, :value => :term
-    end
+      elements :link, :as => :links, :value => :href
+
+      def url
+        @url ||= links.first
+      end

+    end
   end
-
+
 end
data/lib/feedzirra/parser/itunes_rss.rb
@@ -1,5 +1,5 @@
 module Feedzirra
-
+
   module Parser
     # iTunes is RSS 2.0 + some apple extensions
     # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
@@ -40,11 +40,11 @@ module Feedzirra
       elements :item, :as => :entries, :class => ITunesRSSItem

       def self.able_to_parse?(xml)
-        xml =~ /xmlns:itunes=\"http:\/\/www.itunes.com\/dtds\/podcast-1.0.dtd\"/
+        /xmlns:itunes=\"http:\/\/www.itunes.com\/dtds\/podcast-1.0.dtd\"/i =~ xml
       end

     end
-
+
   end
-
+
 end
data/lib/feedzirra/parser/itunes_rss_item.rb
@@ -5,7 +5,7 @@ module Feedzirra
     # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
     class ITunesRSSItem
       include SAXMachine
-      include FeedUtilities
+      include FeedEntryUtilities
       element :author
       element :guid
       element :title
data/lib/feedzirra/parser/rss.rb
@@ -1,28 +1,22 @@
 module Feedzirra
-
+
   module Parser
-    # == Summary
     # Parser for dealing with RSS feeds.
-    #
-    # == Attributes
-    # * title
-    # * feed_url
-    # * url
-    # * entries
     class RSS
       include SAXMachine
       include FeedUtilities
       element :title
+      element :description
       element :link, :as => :url
       elements :item, :as => :entries, :class => RSSEntry

       attr_accessor :feed_url

       def self.able_to_parse?(xml) #:nodoc:
-        xml =~ /\<rss|\<rdf/
+        /\<rss|\<rdf/ =~ xml
       end
     end

   end
-
+
 end
data/lib/feedzirra/parser/rss_entry.rb
@@ -1,17 +1,7 @@
 module Feedzirra
-
+
   module Parser
-    # == Summary
     # Parser for dealing with RDF feed entries.
-    #
-    # == Attributes
-    # * title
-    # * url
-    # * author
-    # * content
-    # * summary
-    # * published
-    # * categories
     class RSSEntry
       include SAXMachine
       include FeedEntryUtilities
@@ -38,5 +28,5 @@ module Feedzirra
     end

   end
-
+
 end
data/lib/feedzirra/version.rb
@@ -0,0 +1,3 @@
+module Feedzirra
+  VERSION = "0.0.30"
+end
data/spec/benchmarks/feed_benchmarks.rb
@@ -0,0 +1,98 @@
+# this is some spike code to compare the speed of different methods for performing
+# multiple feed fetches
+require 'rubygems'
+require 'curb'
+require 'active_support'
+
+require 'net/http'
+require 'uri'
+
+require 'benchmark'
+include Benchmark
+
+GET_COUNT = 1
+urls = ["http://www.pauldix.net"] * GET_COUNT
+
+
+benchmark do |t|
+  t.report("taf2-curb") do
+    multi = Curl::Multi.new
+    urls.each do |url|
+      easy = Curl::Easy.new(url) do |curl|
+        curl.headers["User-Agent"] = "feedzirra"
+        # curl.headers["If-Modified-Since"] = Time.now.httpdate
+        # curl.headers["If-None-Match"] = "ziEyTl4q9GH04BR4jgkImd0GvSE"
+        curl.follow_location = true
+        curl.on_success do |c|
+          # puts c.header_str.inspect
+          # puts c.response_code
+          # puts c.body_str.slice(0, 500)
+        end
+        curl.on_failure do |c|
+          puts "**** #{c.response_code}"
+        end
+      end
+      multi.add(easy)
+    end
+
+    multi.perform
+  end
+
+  t.report("nethttp") do
+    urls.each do |url|
+      res = Net::HTTP.get(URI.parse(url))
+      # puts res.slice(0, 500)
+    end
+  end
+
+  require 'rfuzz/session'
+  include RFuzz
+  t.report("rfuzz") do
+    GET_COUNT.times do
+      http = HttpClient.new("www.pauldix.net", 80)
+      response = http.get("/")
+      if response.http_status != "200"
+        puts "***** #{response.http_status}"
+      else
+        # puts response.http_status
+        # puts response.http_body.slice(0, 500)
+      end
+    end
+  end
+
+  require 'eventmachine'
+  t.report("eventmachine") do
+    counter = GET_COUNT
+    EM.run do
+      GET_COUNT.times do
+        http = EM::Protocols::HttpClient2.connect("www.pauldix.net", 80)
+        request = http.get("/")
+        request.callback do
+          # puts request.status
+          # puts request.content.slice(0, 500)
+          counter -= 1
+          EM.stop if counter == 0
+        end
+      end
+    end
+  end
+
+
+  require 'curl-multi'
+  t.report("curl multi") do
+    multi = Curl::Multi.new
+    urls.each do |url|
+      on_failure = lambda do |ex|
+        puts "****** Failed to retrieve #{url}"
+      end
+
+      on_success = lambda do |body|
+        # puts "got #{url}"
+        # puts body.slice(0, 500)
+      end
+      multi.get(url, on_success, on_failure)
+    end
+
+    multi.select([], []) while multi.size > 0
+  end
+end
data/spec/benchmarks/feedzirra_benchmarks.rb
@@ -0,0 +1,40 @@
+require File.dirname(__FILE__) + '/../../lib/feedzirra.rb'
+require 'rfeedparser'
+require 'feed-normalizer'
+require 'open-uri'
+
+require 'benchmark'
+include Benchmark
+
+iterations = 10
+urls = File.readlines(File.dirname(__FILE__) + "/../sample_feeds/successful_feed_urls.txt").slice(0, 20)
+puts "benchmarks on #{urls.size} feeds"
+puts "************************************"
+benchmark do |t|
+  t.report("feedzirra") do
+    iterations.times do
+      Feedzirra::Feed.fetch_and_parse(urls, :on_success => lambda { |url, feed| $stdout.print '.'; $stdout.flush })
+    end
+  end
+
+  t.report("rfeedparser") do
+    iterations.times do
+      urls.each do |url|
+        feed = FeedParser.parse(url)
+        $stdout.print '.'
+        $stdout.flush
+      end
+    end
+  end
+
+  t.report("feed-normalizer") do
+    iterations.times do
+      urls.each do |url|
+        # have to use the :force option to make feed-normalizer parse an atom feed
+        feed = FeedNormalizer::FeedNormalizer.parse(open(url), :force_parser => FeedNormalizer::SimpleRssParser)
+        $stdout.print '.'
+        $stdout.flush
+      end
+    end
+  end
+end