feedzirra 0.0.24 → 0.0.30
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +1 -0
- data/README.rdoc +207 -0
- data/Rakefile +19 -24
- data/lib/feedzirra.rb +7 -28
- data/lib/feedzirra/core_ext.rb +3 -0
- data/lib/{core_ext → feedzirra/core_ext}/date.rb +2 -4
- data/lib/{core_ext → feedzirra/core_ext}/string.rb +0 -0
- data/lib/feedzirra/feed.rb +99 -41
- data/lib/feedzirra/feed_entry_utilities.rb +12 -11
- data/lib/feedzirra/parser.rb +15 -0
- data/lib/feedzirra/parser/atom.rb +7 -13
- data/lib/feedzirra/parser/atom_entry.rb +4 -14
- data/lib/feedzirra/parser/atom_feed_burner.rb +4 -10
- data/lib/feedzirra/parser/atom_feed_burner_entry.rb +8 -13
- data/lib/feedzirra/parser/itunes_rss.rb +4 -4
- data/lib/feedzirra/parser/itunes_rss_item.rb +1 -1
- data/lib/feedzirra/parser/rss.rb +4 -10
- data/lib/feedzirra/parser/rss_entry.rb +2 -12
- data/lib/feedzirra/version.rb +3 -0
- data/spec/benchmarks/feed_benchmarks.rb +98 -0
- data/spec/benchmarks/feedzirra_benchmarks.rb +40 -0
- data/spec/benchmarks/fetching_benchmarks.rb +28 -0
- data/spec/benchmarks/parsing_benchmark.rb +30 -0
- data/spec/benchmarks/updating_benchmarks.rb +33 -0
- data/spec/feedzirra/feed_entry_utilities_spec.rb +1 -1
- data/spec/feedzirra/feed_spec.rb +38 -5
- data/spec/feedzirra/feed_utilities_spec.rb +7 -4
- data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +5 -0
- data/spec/feedzirra/parser/atom_feed_burner_spec.rb +5 -1
- data/spec/feedzirra/parser/atom_spec.rb +5 -1
- data/spec/feedzirra/parser/itunes_rss_item_spec.rb +1 -1
- data/spec/feedzirra/parser/rss_entry_spec.rb +2 -1
- data/spec/feedzirra/parser/rss_spec.rb +5 -1
- data/spec/sample_feeds/run_against_sample.rb +20 -0
- data/spec/spec_helper.rb +10 -2
- metadata +141 -59
- data/README.textile +0 -208
- data/spec/spec.opts +0 -2
@@ -1,34 +1,35 @@
|
|
1
1
|
module Feedzirra
|
2
2
|
module FeedEntryUtilities
|
3
|
+
|
3
4
|
def published
|
4
|
-
@published
|
5
|
+
@published ||= @updated
|
5
6
|
end
|
6
7
|
|
7
8
|
def parse_datetime(string)
|
8
9
|
begin
|
9
10
|
DateTime.parse(string).feed_utils_to_gm_time
|
10
11
|
rescue
|
11
|
-
puts "DATE CAN'T BE PARSED: #{string}"
|
12
|
+
puts "DATE CAN'T BE PARSED: [#{string}]"
|
12
13
|
nil
|
13
14
|
end
|
14
15
|
end
|
15
16
|
|
16
17
|
##
|
17
18
|
# Returns the id of the entry, or its url if no id is present, as some formats don't support ids
|
18
|
-
def id
|
19
|
-
@entry_id
|
19
|
+
def id
|
20
|
+
@entry_id ||= @url
|
20
21
|
end
|
21
|
-
|
22
|
+
|
22
23
|
##
|
23
|
-
#
|
24
|
-
def published=(val)
|
24
|
+
# Writer for published. By default, we keep the "oldest" publish time found.
|
25
|
+
def published=(val)
|
25
26
|
parsed = parse_datetime(val)
|
26
|
-
@published = parsed if !@published || parsed < @published
|
27
|
+
@published = parsed if !@published || parsed < @published
|
27
28
|
end
|
28
|
-
|
29
|
+
|
29
30
|
##
|
30
|
-
#
|
31
|
-
def updated=(val)
|
31
|
+
# Writer for updated. By default, we keep the most recent update time found.
|
32
|
+
def updated=(val)
|
32
33
|
parsed = parse_datetime(val)
|
33
34
|
@updated = parsed if !@updated || parsed > @updated
|
34
35
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Feedzirra
|
2
|
+
module Parser
|
3
|
+
autoload :RSS, 'feedzirra/parser/rss'
|
4
|
+
autoload :RSSEntry, 'feedzirra/parser/rss_entry'
|
5
|
+
|
6
|
+
autoload :ITunesRSS, 'feedzirra/parser/itunes_rss'
|
7
|
+
autoload :ITunesRSSItem, 'feedzirra/parser/itunes_rss_item'
|
8
|
+
autoload :ITunesRSSOwner, 'feedzirra/parser/itunes_rss_owner'
|
9
|
+
|
10
|
+
autoload :Atom, 'feedzirra/parser/atom'
|
11
|
+
autoload :AtomEntry, 'feedzirra/parser/atom_entry'
|
12
|
+
autoload :AtomFeedBurner, 'feedzirra/parser/atom_feed_burner'
|
13
|
+
autoload :AtomFeedBurnerEntry, 'feedzirra/parser/atom_feed_burner_entry'
|
14
|
+
end
|
15
|
+
end
|
@@ -1,35 +1,29 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
|
-
# == Summary
|
5
4
|
# Parser for dealing with Atom feeds.
|
6
|
-
#
|
7
|
-
# == Attributes
|
8
|
-
# * title
|
9
|
-
# * feed_url
|
10
|
-
# * url
|
11
|
-
# * entries
|
12
5
|
class Atom
|
13
6
|
include SAXMachine
|
14
7
|
include FeedUtilities
|
15
8
|
element :title
|
9
|
+
element :subtitle, :as => :description
|
16
10
|
element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
|
17
11
|
element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
|
18
12
|
elements :link, :as => :links, :value => :href
|
19
13
|
elements :entry, :as => :entries, :class => AtomEntry
|
20
14
|
|
21
15
|
def self.able_to_parse?(xml) #:nodoc:
|
22
|
-
|
16
|
+
/\<feed[^\>]+xmlns=[\"|\'](http:\/\/www\.w3\.org\/2005\/Atom|http:\/\/purl\.org\/atom\/ns\#)[\"|\'][^\>]*\>/ =~ xml
|
23
17
|
end
|
24
|
-
|
18
|
+
|
25
19
|
def url
|
26
20
|
@url || links.last
|
27
21
|
end
|
28
|
-
|
22
|
+
|
29
23
|
def feed_url
|
30
|
-
@feed_url
|
24
|
+
@feed_url ||= links.first
|
31
25
|
end
|
32
26
|
end
|
33
27
|
end
|
34
|
-
|
28
|
+
|
35
29
|
end
|
@@ -1,17 +1,7 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
|
-
# == Summary
|
5
4
|
# Parser for dealing with Atom feed entries.
|
6
|
-
#
|
7
|
-
# == Attributes
|
8
|
-
# * title
|
9
|
-
# * url
|
10
|
-
# * author
|
11
|
-
# * content
|
12
|
-
# * summary
|
13
|
-
# * published
|
14
|
-
# * categories
|
15
5
|
class AtomEntry
|
16
6
|
include SAXMachine
|
17
7
|
include FeedEntryUtilities
|
@@ -28,12 +18,12 @@ module Feedzirra
|
|
28
18
|
element :modified, :as => :updated
|
29
19
|
elements :category, :as => :categories, :value => :term
|
30
20
|
elements :link, :as => :links, :value => :href
|
31
|
-
|
21
|
+
|
32
22
|
def url
|
33
|
-
@url
|
23
|
+
@url ||= links.first
|
34
24
|
end
|
35
25
|
end
|
36
26
|
|
37
27
|
end
|
38
|
-
|
28
|
+
|
39
29
|
end
|
@@ -1,27 +1,21 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
|
-
# == Summary
|
5
4
|
# Parser for dealing with Feedburner Atom feeds.
|
6
|
-
#
|
7
|
-
# == Attributes
|
8
|
-
# * title
|
9
|
-
# * feed_url
|
10
|
-
# * url
|
11
|
-
# * entries
|
12
5
|
class AtomFeedBurner
|
13
6
|
include SAXMachine
|
14
7
|
include FeedUtilities
|
15
8
|
element :title
|
9
|
+
element :subtitle, :as => :description
|
16
10
|
element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
|
17
11
|
element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
|
18
12
|
elements :entry, :as => :entries, :class => AtomFeedBurnerEntry
|
19
13
|
|
20
14
|
def self.able_to_parse?(xml) #:nodoc:
|
21
|
-
(
|
15
|
+
((/Atom/ =~ xml) && (/feedburner/ =~ xml)) || false
|
22
16
|
end
|
23
17
|
end
|
24
18
|
|
25
19
|
end
|
26
|
-
|
20
|
+
|
27
21
|
end
|
@@ -1,17 +1,7 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
|
-
# == Summary
|
5
4
|
# Parser for dealing with Feedburner Atom feed entries.
|
6
|
-
#
|
7
|
-
# == Attributes
|
8
|
-
# * title
|
9
|
-
# * url
|
10
|
-
# * author
|
11
|
-
# * content
|
12
|
-
# * summary
|
13
|
-
# * published
|
14
|
-
# * categories
|
15
5
|
class AtomFeedBurnerEntry
|
16
6
|
include SAXMachine
|
17
7
|
include FeedEntryUtilities
|
@@ -28,8 +18,13 @@ module Feedzirra
|
|
28
18
|
element :updated
|
29
19
|
element :modified, :as => :updated
|
30
20
|
elements :category, :as => :categories, :value => :term
|
31
|
-
|
21
|
+
elements :link, :as => :links, :value => :href
|
22
|
+
|
23
|
+
def url
|
24
|
+
@url ||= links.first
|
25
|
+
end
|
32
26
|
|
27
|
+
end
|
33
28
|
end
|
34
|
-
|
29
|
+
|
35
30
|
end
|
@@ -1,5 +1,5 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
4
|
# iTunes is RSS 2.0 + some apple extensions
|
5
5
|
# Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
|
@@ -40,11 +40,11 @@ module Feedzirra
|
|
40
40
|
elements :item, :as => :entries, :class => ITunesRSSItem
|
41
41
|
|
42
42
|
def self.able_to_parse?(xml)
|
43
|
-
|
43
|
+
/xmlns:itunes=\"http:\/\/www.itunes.com\/dtds\/podcast-1.0.dtd\"/i =~ xml
|
44
44
|
end
|
45
45
|
|
46
46
|
end
|
47
|
-
|
47
|
+
|
48
48
|
end
|
49
|
-
|
49
|
+
|
50
50
|
end
|
data/lib/feedzirra/parser/rss.rb
CHANGED
@@ -1,28 +1,22 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
|
-
# == Summary
|
5
4
|
# Parser for dealing with RSS feeds.
|
6
|
-
#
|
7
|
-
# == Attributes
|
8
|
-
# * title
|
9
|
-
# * feed_url
|
10
|
-
# * url
|
11
|
-
# * entries
|
12
5
|
class RSS
|
13
6
|
include SAXMachine
|
14
7
|
include FeedUtilities
|
15
8
|
element :title
|
9
|
+
element :description
|
16
10
|
element :link, :as => :url
|
17
11
|
elements :item, :as => :entries, :class => RSSEntry
|
18
12
|
|
19
13
|
attr_accessor :feed_url
|
20
14
|
|
21
15
|
def self.able_to_parse?(xml) #:nodoc:
|
22
|
-
|
16
|
+
/\<rss|\<rdf/ =~ xml
|
23
17
|
end
|
24
18
|
end
|
25
19
|
|
26
20
|
end
|
27
|
-
|
21
|
+
|
28
22
|
end
|
@@ -1,17 +1,7 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
|
-
# == Summary
|
5
4
|
# Parser for dealing with RSS and RDF feed entries.
|
6
|
-
#
|
7
|
-
# == Attributes
|
8
|
-
# * title
|
9
|
-
# * url
|
10
|
-
# * author
|
11
|
-
# * content
|
12
|
-
# * summary
|
13
|
-
# * published
|
14
|
-
# * categories
|
15
5
|
class RSSEntry
|
16
6
|
include SAXMachine
|
17
7
|
include FeedEntryUtilities
|
@@ -38,5 +28,5 @@ module Feedzirra
|
|
38
28
|
end
|
39
29
|
|
40
30
|
end
|
41
|
-
|
31
|
+
|
42
32
|
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
# this is some spike code to compare the speed of different methods for performing
|
2
|
+
# multiple feed fetches
|
3
|
+
require 'rubygems'
|
4
|
+
require 'curb'
|
5
|
+
require 'active_support'
|
6
|
+
|
7
|
+
require 'net/http'
|
8
|
+
require 'uri'
|
9
|
+
|
10
|
+
require 'benchmark'
|
11
|
+
include Benchmark
|
12
|
+
|
13
|
+
GET_COUNT = 1
|
14
|
+
urls = ["http://www.pauldix.net"] * GET_COUNT
|
15
|
+
|
16
|
+
|
17
|
+
benchmark do |t|
|
18
|
+
t.report("taf2-curb") do
|
19
|
+
multi = Curl::Multi.new
|
20
|
+
urls.each do |url|
|
21
|
+
easy = Curl::Easy.new(url) do |curl|
|
22
|
+
curl.headers["User-Agent"] = "feedzirra"
|
23
|
+
# curl.headers["If-Modified-Since"] = Time.now.httpdate
|
24
|
+
# curl.headers["If-None-Match"] = "ziEyTl4q9GH04BR4jgkImd0GvSE"
|
25
|
+
curl.follow_location = true
|
26
|
+
curl.on_success do |c|
|
27
|
+
# puts c.header_str.inspect
|
28
|
+
# puts c.response_code
|
29
|
+
# puts c.body_str.slice(0, 500)
|
30
|
+
end
|
31
|
+
curl.on_failure do |c|
|
32
|
+
puts "**** #{c.response_code}"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
multi.add(easy)
|
36
|
+
end
|
37
|
+
|
38
|
+
multi.perform
|
39
|
+
end
|
40
|
+
|
41
|
+
t.report("nethttp") do
|
42
|
+
urls.each do |url|
|
43
|
+
res = Net::HTTP.get(URI.parse(url))
|
44
|
+
# puts res.slice(0, 500)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
require 'rfuzz/session'
|
49
|
+
include RFuzz
|
50
|
+
t.report("rfuzz") do
|
51
|
+
GET_COUNT.times do
|
52
|
+
http = HttpClient.new("www.pauldix.net", 80)
|
53
|
+
response = http.get("/")
|
54
|
+
if response.http_status != "200"
|
55
|
+
puts "***** #{response.http_status}"
|
56
|
+
else
|
57
|
+
# puts response.http_status
|
58
|
+
# puts response.http_body.slice(0, 500)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
require 'eventmachine'
|
64
|
+
t.report("eventmachine") do
|
65
|
+
counter = GET_COUNT
|
66
|
+
EM.run do
|
67
|
+
GET_COUNT.times do
|
68
|
+
http = EM::Protocols::HttpClient2.connect("www.pauldix.net", 80)
|
69
|
+
request = http.get("/")
|
70
|
+
request.callback do
|
71
|
+
# puts request.status
|
72
|
+
# puts request.content.slice(0, 500)
|
73
|
+
counter -= 1
|
74
|
+
EM.stop if counter == 0
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
|
81
|
+
require 'curl-multi'
|
82
|
+
t.report("curl multi") do
|
83
|
+
multi = Curl::Multi.new
|
84
|
+
urls.each do |url|
|
85
|
+
on_failure = lambda do |ex|
|
86
|
+
puts "****** Failed to retrieve #{url}"
|
87
|
+
end
|
88
|
+
|
89
|
+
on_success = lambda do |body|
|
90
|
+
# puts "got #{url}"
|
91
|
+
# puts body.slice(0, 500)
|
92
|
+
end
|
93
|
+
multi.get(url, on_success, on_failure)
|
94
|
+
end
|
95
|
+
|
96
|
+
multi.select([], []) while multi.size > 0
|
97
|
+
end
|
98
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../lib/feedzirra.rb'
|
2
|
+
require 'rfeedparser'
|
3
|
+
require 'feed-normalizer'
|
4
|
+
require 'open-uri'
|
5
|
+
|
6
|
+
require 'benchmark'
|
7
|
+
include Benchmark
|
8
|
+
|
9
|
+
iterations = 10
|
10
|
+
urls = File.readlines(File.dirname(__FILE__) + "/../sample_feeds/successful_feed_urls.txt").slice(0, 20)
|
11
|
+
puts "benchmarks on #{urls.size} feeds"
|
12
|
+
puts "************************************"
|
13
|
+
benchmark do |t|
|
14
|
+
t.report("feedzirra") do
|
15
|
+
iterations.times do
|
16
|
+
Feedzirra::Feed.fetch_and_parse(urls, :on_success => lambda { |url, feed| $stdout.print '.'; $stdout.flush })
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
t.report("rfeedparser") do
|
21
|
+
iterations.times do
|
22
|
+
urls.each do |url|
|
23
|
+
feed = FeedParser.parse(url)
|
24
|
+
$stdout.print '.'
|
25
|
+
$stdout.flush
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
t.report("feed-normalizer") do
|
31
|
+
iterations.times do
|
32
|
+
urls.each do |url|
|
33
|
+
# have to use the :force_parser option to make feed-normalizer parse an atom feed
|
34
|
+
feed = FeedNormalizer::FeedNormalizer.parse(open(url), :force_parser => FeedNormalizer::SimpleRssParser)
|
35
|
+
$stdout.print '.'
|
36
|
+
$stdout.flush
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|