feedzirra 0.0.24 → 0.0.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. data/.rspec +1 -0
  2. data/README.rdoc +207 -0
  3. data/Rakefile +19 -24
  4. data/lib/feedzirra.rb +7 -28
  5. data/lib/feedzirra/core_ext.rb +3 -0
  6. data/lib/{core_ext → feedzirra/core_ext}/date.rb +2 -4
  7. data/lib/{core_ext → feedzirra/core_ext}/string.rb +0 -0
  8. data/lib/feedzirra/feed.rb +99 -41
  9. data/lib/feedzirra/feed_entry_utilities.rb +12 -11
  10. data/lib/feedzirra/parser.rb +15 -0
  11. data/lib/feedzirra/parser/atom.rb +7 -13
  12. data/lib/feedzirra/parser/atom_entry.rb +4 -14
  13. data/lib/feedzirra/parser/atom_feed_burner.rb +4 -10
  14. data/lib/feedzirra/parser/atom_feed_burner_entry.rb +8 -13
  15. data/lib/feedzirra/parser/itunes_rss.rb +4 -4
  16. data/lib/feedzirra/parser/itunes_rss_item.rb +1 -1
  17. data/lib/feedzirra/parser/rss.rb +4 -10
  18. data/lib/feedzirra/parser/rss_entry.rb +2 -12
  19. data/lib/feedzirra/version.rb +3 -0
  20. data/spec/benchmarks/feed_benchmarks.rb +98 -0
  21. data/spec/benchmarks/feedzirra_benchmarks.rb +40 -0
  22. data/spec/benchmarks/fetching_benchmarks.rb +28 -0
  23. data/spec/benchmarks/parsing_benchmark.rb +30 -0
  24. data/spec/benchmarks/updating_benchmarks.rb +33 -0
  25. data/spec/feedzirra/feed_entry_utilities_spec.rb +1 -1
  26. data/spec/feedzirra/feed_spec.rb +38 -5
  27. data/spec/feedzirra/feed_utilities_spec.rb +7 -4
  28. data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +5 -0
  29. data/spec/feedzirra/parser/atom_feed_burner_spec.rb +5 -1
  30. data/spec/feedzirra/parser/atom_spec.rb +5 -1
  31. data/spec/feedzirra/parser/itunes_rss_item_spec.rb +1 -1
  32. data/spec/feedzirra/parser/rss_entry_spec.rb +2 -1
  33. data/spec/feedzirra/parser/rss_spec.rb +5 -1
  34. data/spec/sample_feeds/run_against_sample.rb +20 -0
  35. data/spec/spec_helper.rb +10 -2
  36. metadata +141 -59
  37. data/README.textile +0 -208
  38. data/spec/spec.opts +0 -2
@@ -1,34 +1,35 @@
1
1
  module Feedzirra
2
2
  module FeedEntryUtilities
3
+
3
4
  def published
4
- @published || @updated
5
+ @published ||= @updated
5
6
  end
6
7
 
7
8
  def parse_datetime(string)
8
9
  begin
9
10
  DateTime.parse(string).feed_utils_to_gm_time
10
11
  rescue
11
- puts "DATE CAN'T BE PARSED: #{string}"
12
+ puts "DATE CAN'T BE PARSED: [#{string}]"
12
13
  nil
13
14
  end
14
15
  end
15
16
 
16
17
  ##
17
18
  # Returns the id of the entry or its url if not id is present, as some formats don't support it
18
- def id
19
- @entry_id || @url
19
+ def id
20
+ @entry_id ||= @url
20
21
  end
21
-
22
+
22
23
  ##
23
- # Writter for published. By default, we keep the "oldest" publish time found.
24
- def published=(val)
24
+ # Writer for published. By default, we keep the "oldest" publish time found.
25
+ def published=(val)
25
26
  parsed = parse_datetime(val)
26
- @published = parsed if !@published || parsed < @published
27
+ @published = parsed if !@published || parsed < @published
27
28
  end
28
-
29
+
29
30
  ##
30
- # Writter for udapted. By default, we keep the most recenet update time found.
31
- def updated=(val)
31
+ # Writer for updated. By default, we keep the most recent update time found.
32
+ def updated=(val)
32
33
  parsed = parse_datetime(val)
33
34
  @updated = parsed if !@updated || parsed > @updated
34
35
  end
@@ -0,0 +1,15 @@
1
+ module Feedzirra
2
+ module Parser
3
+ autoload :RSS, 'feedzirra/parser/rss'
4
+ autoload :RSSEntry, 'feedzirra/parser/rss_entry'
5
+
6
+ autoload :ITunesRSS, 'feedzirra/parser/itunes_rss'
7
+ autoload :ITunesRSSItem, 'feedzirra/parser/itunes_rss_item'
8
+ autoload :ITunesRSSOwner, 'feedzirra/parser/itunes_rss_owner'
9
+
10
+ autoload :Atom, 'feedzirra/parser/atom'
11
+ autoload :AtomEntry, 'feedzirra/parser/atom_entry'
12
+ autoload :AtomFeedBurner, 'feedzirra/parser/atom_feed_burner'
13
+ autoload :AtomFeedBurnerEntry, 'feedzirra/parser/atom_feed_burner_entry'
14
+ end
15
+ end
@@ -1,35 +1,29 @@
1
1
  module Feedzirra
2
-
2
+
3
3
  module Parser
4
- # == Summary
5
4
  # Parser for dealing with Atom feeds.
6
- #
7
- # == Attributes
8
- # * title
9
- # * feed_url
10
- # * url
11
- # * entries
12
5
  class Atom
13
6
  include SAXMachine
14
7
  include FeedUtilities
15
8
  element :title
9
+ element :subtitle, :as => :description
16
10
  element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
17
11
  element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
18
12
  elements :link, :as => :links, :value => :href
19
13
  elements :entry, :as => :entries, :class => AtomEntry
20
14
 
21
15
  def self.able_to_parse?(xml) #:nodoc:
22
- xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})/
16
+ /\<feed[^\>]+xmlns=[\"|\'](http:\/\/www\.w3\.org\/2005\/Atom|http:\/\/purl\.org\/atom\/ns\#)[\"|\'][^\>]*\>/ =~ xml
23
17
  end
24
-
18
+
25
19
  def url
26
20
  @url || links.last
27
21
  end
28
-
22
+
29
23
  def feed_url
30
- @feed_url || links.first
24
+ @feed_url ||= links.first
31
25
  end
32
26
  end
33
27
  end
34
-
28
+
35
29
  end
@@ -1,17 +1,7 @@
1
1
  module Feedzirra
2
-
2
+
3
3
  module Parser
4
- # == Summary
5
4
  # Parser for dealing with Atom feed entries.
6
- #
7
- # == Attributes
8
- # * title
9
- # * url
10
- # * author
11
- # * content
12
- # * summary
13
- # * published
14
- # * categories
15
5
  class AtomEntry
16
6
  include SAXMachine
17
7
  include FeedEntryUtilities
@@ -28,12 +18,12 @@ module Feedzirra
28
18
  element :modified, :as => :updated
29
19
  elements :category, :as => :categories, :value => :term
30
20
  elements :link, :as => :links, :value => :href
31
-
21
+
32
22
  def url
33
- @url || links.first
23
+ @url ||= links.first
34
24
  end
35
25
  end
36
26
 
37
27
  end
38
-
28
+
39
29
  end
@@ -1,27 +1,21 @@
1
1
  module Feedzirra
2
-
2
+
3
3
  module Parser
4
- # == Summary
5
4
  # Parser for dealing with Feedburner Atom feeds.
6
- #
7
- # == Attributes
8
- # * title
9
- # * feed_url
10
- # * url
11
- # * entries
12
5
  class AtomFeedBurner
13
6
  include SAXMachine
14
7
  include FeedUtilities
15
8
  element :title
9
+ element :subtitle, :as => :description
16
10
  element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
17
11
  element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
18
12
  elements :entry, :as => :entries, :class => AtomFeedBurnerEntry
19
13
 
20
14
  def self.able_to_parse?(xml) #:nodoc:
21
- (xml =~ /Atom/ && xml =~ /feedburner/) || false
15
+ ((/Atom/ =~ xml) && (/feedburner/ =~ xml)) || false
22
16
  end
23
17
  end
24
18
 
25
19
  end
26
-
20
+
27
21
  end
@@ -1,17 +1,7 @@
1
1
  module Feedzirra
2
-
2
+
3
3
  module Parser
4
- # == Summary
5
4
  # Parser for dealing with Feedburner Atom feed entries.
6
- #
7
- # == Attributes
8
- # * title
9
- # * url
10
- # * author
11
- # * content
12
- # * summary
13
- # * published
14
- # * categories
15
5
  class AtomFeedBurnerEntry
16
6
  include SAXMachine
17
7
  include FeedEntryUtilities
@@ -28,8 +18,13 @@ module Feedzirra
28
18
  element :updated
29
19
  element :modified, :as => :updated
30
20
  elements :category, :as => :categories, :value => :term
31
- end
21
+ elements :link, :as => :links, :value => :href
22
+
23
+ def url
24
+ @url ||= links.first
25
+ end
32
26
 
27
+ end
33
28
  end
34
-
29
+
35
30
  end
@@ -1,5 +1,5 @@
1
1
  module Feedzirra
2
-
2
+
3
3
  module Parser
4
4
  # iTunes is RSS 2.0 + some apple extensions
5
5
  # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
@@ -40,11 +40,11 @@ module Feedzirra
40
40
  elements :item, :as => :entries, :class => ITunesRSSItem
41
41
 
42
42
  def self.able_to_parse?(xml)
43
- xml =~ /xmlns:itunes=\"http:\/\/www.itunes.com\/dtds\/podcast-1.0.dtd\"/
43
+ /xmlns:itunes=\"http:\/\/www.itunes.com\/dtds\/podcast-1.0.dtd\"/i =~ xml
44
44
  end
45
45
 
46
46
  end
47
-
47
+
48
48
  end
49
-
49
+
50
50
  end
@@ -5,7 +5,7 @@ module Feedzirra
5
5
  # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
6
6
  class ITunesRSSItem
7
7
  include SAXMachine
8
- include FeedUtilities
8
+ include FeedEntryUtilities
9
9
  element :author
10
10
  element :guid
11
11
  element :title
@@ -1,28 +1,22 @@
1
1
  module Feedzirra
2
-
2
+
3
3
  module Parser
4
- # == Summary
5
4
  # Parser for dealing with RSS feeds.
6
- #
7
- # == Attributes
8
- # * title
9
- # * feed_url
10
- # * url
11
- # * entries
12
5
  class RSS
13
6
  include SAXMachine
14
7
  include FeedUtilities
15
8
  element :title
9
+ element :description
16
10
  element :link, :as => :url
17
11
  elements :item, :as => :entries, :class => RSSEntry
18
12
 
19
13
  attr_accessor :feed_url
20
14
 
21
15
  def self.able_to_parse?(xml) #:nodoc:
22
- xml =~ /\<rss|\<rdf/
16
+ /\<rss|\<rdf/ =~ xml
23
17
  end
24
18
  end
25
19
 
26
20
  end
27
-
21
+
28
22
  end
@@ -1,17 +1,7 @@
1
1
  module Feedzirra
2
-
2
+
3
3
  module Parser
4
- # == Summary
5
4
  # Parser for dealing with RDF feed entries.
6
- #
7
- # == Attributes
8
- # * title
9
- # * url
10
- # * author
11
- # * content
12
- # * summary
13
- # * published
14
- # * categories
15
5
  class RSSEntry
16
6
  include SAXMachine
17
7
  include FeedEntryUtilities
@@ -38,5 +28,5 @@ module Feedzirra
38
28
  end
39
29
 
40
30
  end
41
-
31
+
42
32
  end
@@ -0,0 +1,3 @@
1
+ module Feedzirra
2
+ VERSION = "0.0.30"
3
+ end
@@ -0,0 +1,98 @@
1
+ # this is some spike code to compare the speed of different methods for performing
2
+ # multiple feed fetches
3
+ require 'rubygems'
4
+ require 'curb'
5
+ require 'active_support'
6
+
7
+ require 'net/http'
8
+ require 'uri'
9
+
10
+ require 'benchmark'
11
+ include Benchmark
12
+
13
+ GET_COUNT = 1
14
+ urls = ["http://www.pauldix.net"] * GET_COUNT
15
+
16
+
17
+ benchmark do |t|
18
+ t.report("taf2-curb") do
19
+ multi = Curl::Multi.new
20
+ urls.each do |url|
21
+ easy = Curl::Easy.new(url) do |curl|
22
+ curl.headers["User-Agent"] = "feedzirra"
23
+ # curl.headers["If-Modified-Since"] = Time.now.httpdate
24
+ # curl.headers["If-None-Match"] = "ziEyTl4q9GH04BR4jgkImd0GvSE"
25
+ curl.follow_location = true
26
+ curl.on_success do |c|
27
+ # puts c.header_str.inspect
28
+ # puts c.response_code
29
+ # puts c.body_str.slice(0, 500)
30
+ end
31
+ curl.on_failure do |c|
32
+ puts "**** #{c.response_code}"
33
+ end
34
+ end
35
+ multi.add(easy)
36
+ end
37
+
38
+ multi.perform
39
+ end
40
+
41
+ t.report("nethttp") do
42
+ urls.each do |url|
43
+ res = Net::HTTP.get(URI.parse(url))
44
+ # puts res.slice(0, 500)
45
+ end
46
+ end
47
+
48
+ require 'rfuzz/session'
49
+ include RFuzz
50
+ t.report("rfuzz") do
51
+ GET_COUNT.times do
52
+ http = HttpClient.new("www.pauldix.net", 80)
53
+ response = http.get("/")
54
+ if response.http_status != "200"
55
+ puts "***** #{response.http_status}"
56
+ else
57
+ # puts response.http_status
58
+ # puts response.http_body.slice(0, 500)
59
+ end
60
+ end
61
+ end
62
+
63
+ require 'eventmachine'
64
+ t.report("eventmachine") do
65
+ counter = GET_COUNT
66
+ EM.run do
67
+ GET_COUNT.times do
68
+ http = EM::Protocols::HttpClient2.connect("www.pauldix.net", 80)
69
+ request = http.get("/")
70
+ request.callback do
71
+ # puts request.status
72
+ # puts request.content.slice(0, 500)
73
+ counter -= 1
74
+ EM.stop if counter == 0
75
+ end
76
+ end
77
+ end
78
+ end
79
+
80
+
81
+ require 'curl-multi'
82
+ t.report("curl multi") do
83
+ multi = Curl::Multi.new
84
+ urls.each do |url|
85
+ on_failure = lambda do |ex|
86
+ puts "****** Failed to retrieve #{url}"
87
+ end
88
+
89
+ on_success = lambda do |body|
90
+ # puts "got #{url}"
91
+ # puts body.slice(0, 500)
92
+ end
93
+ multi.get(url, on_success, on_failure)
94
+ end
95
+
96
+ multi.select([], []) while multi.size > 0
97
+ end
98
+ end
@@ -0,0 +1,40 @@
1
+ require File.dirname(__FILE__) + '/../../lib/feedzirra.rb'
2
+ require 'rfeedparser'
3
+ require 'feed-normalizer'
4
+ require 'open-uri'
5
+
6
+ require 'benchmark'
7
+ include Benchmark
8
+
9
+ iterations = 10
10
+ urls = File.readlines(File.dirname(__FILE__) + "/../sample_feeds/successful_feed_urls.txt").slice(0, 20)
11
+ puts "benchmarks on #{urls.size} feeds"
12
+ puts "************************************"
13
+ benchmark do |t|
14
+ t.report("feedzirra") do
15
+ iterations.times do
16
+ Feedzirra::Feed.fetch_and_parse(urls, :on_success => lambda { |url, feed| $stdout.print '.'; $stdout.flush })
17
+ end
18
+ end
19
+
20
+ t.report("rfeedparser") do
21
+ iterations.times do
22
+ urls.each do |url|
23
+ feed = FeedParser.parse(url)
24
+ $stdout.print '.'
25
+ $stdout.flush
26
+ end
27
+ end
28
+ end
29
+
30
+ t.report("feed-normalizer") do
31
+ iterations.times do
32
+ urls.each do |url|
33
+ # have to use the :force option to make feed-normalizer parse an atom feed
34
+ feed = FeedNormalizer::FeedNormalizer.parse(open(url), :force_parser => FeedNormalizer::SimpleRssParser)
35
+ $stdout.print '.'
36
+ $stdout.flush
37
+ end
38
+ end
39
+ end
40
+ end