feedzirra 0.0.24 → 0.0.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.rspec +1 -0
- data/README.rdoc +207 -0
- data/Rakefile +19 -24
- data/lib/feedzirra.rb +7 -28
- data/lib/feedzirra/core_ext.rb +3 -0
- data/lib/{core_ext → feedzirra/core_ext}/date.rb +2 -4
- data/lib/{core_ext → feedzirra/core_ext}/string.rb +0 -0
- data/lib/feedzirra/feed.rb +99 -41
- data/lib/feedzirra/feed_entry_utilities.rb +12 -11
- data/lib/feedzirra/parser.rb +15 -0
- data/lib/feedzirra/parser/atom.rb +7 -13
- data/lib/feedzirra/parser/atom_entry.rb +4 -14
- data/lib/feedzirra/parser/atom_feed_burner.rb +4 -10
- data/lib/feedzirra/parser/atom_feed_burner_entry.rb +8 -13
- data/lib/feedzirra/parser/itunes_rss.rb +4 -4
- data/lib/feedzirra/parser/itunes_rss_item.rb +1 -1
- data/lib/feedzirra/parser/rss.rb +4 -10
- data/lib/feedzirra/parser/rss_entry.rb +2 -12
- data/lib/feedzirra/version.rb +3 -0
- data/spec/benchmarks/feed_benchmarks.rb +98 -0
- data/spec/benchmarks/feedzirra_benchmarks.rb +40 -0
- data/spec/benchmarks/fetching_benchmarks.rb +28 -0
- data/spec/benchmarks/parsing_benchmark.rb +30 -0
- data/spec/benchmarks/updating_benchmarks.rb +33 -0
- data/spec/feedzirra/feed_entry_utilities_spec.rb +1 -1
- data/spec/feedzirra/feed_spec.rb +38 -5
- data/spec/feedzirra/feed_utilities_spec.rb +7 -4
- data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +5 -0
- data/spec/feedzirra/parser/atom_feed_burner_spec.rb +5 -1
- data/spec/feedzirra/parser/atom_spec.rb +5 -1
- data/spec/feedzirra/parser/itunes_rss_item_spec.rb +1 -1
- data/spec/feedzirra/parser/rss_entry_spec.rb +2 -1
- data/spec/feedzirra/parser/rss_spec.rb +5 -1
- data/spec/sample_feeds/run_against_sample.rb +20 -0
- data/spec/spec_helper.rb +10 -2
- metadata +141 -59
- data/README.textile +0 -208
- data/spec/spec.opts +0 -2
@@ -1,34 +1,35 @@
|
|
1
1
|
module Feedzirra
|
2
2
|
module FeedEntryUtilities
|
3
|
+
|
3
4
|
def published
|
4
|
-
@published
|
5
|
+
@published ||= @updated
|
5
6
|
end
|
6
7
|
|
7
8
|
def parse_datetime(string)
|
8
9
|
begin
|
9
10
|
DateTime.parse(string).feed_utils_to_gm_time
|
10
11
|
rescue
|
11
|
-
puts "DATE CAN'T BE PARSED: #{string}"
|
12
|
+
puts "DATE CAN'T BE PARSED: [#{string}]"
|
12
13
|
nil
|
13
14
|
end
|
14
15
|
end
|
15
16
|
|
16
17
|
##
|
17
18
|
# Returns the id of the entry or its url if no id is present, as some formats don't support it
|
18
|
-
def id
|
19
|
-
@entry_id
|
19
|
+
def id
|
20
|
+
@entry_id ||= @url
|
20
21
|
end
|
21
|
-
|
22
|
+
|
22
23
|
##
|
23
|
-
#
|
24
|
-
def published=(val)
|
24
|
+
# Writer for published. By default, we keep the "oldest" publish time found.
|
25
|
+
def published=(val)
|
25
26
|
parsed = parse_datetime(val)
|
26
|
-
@published = parsed if !@published || parsed < @published
|
27
|
+
@published = parsed if !@published || parsed < @published
|
27
28
|
end
|
28
|
-
|
29
|
+
|
29
30
|
##
|
30
|
-
#
|
31
|
-
def updated=(val)
|
31
|
+
# Writer for updated. By default, we keep the most recent update time found.
|
32
|
+
def updated=(val)
|
32
33
|
parsed = parse_datetime(val)
|
33
34
|
@updated = parsed if !@updated || parsed > @updated
|
34
35
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Feedzirra
|
2
|
+
module Parser
|
3
|
+
autoload :RSS, 'feedzirra/parser/rss'
|
4
|
+
autoload :RSSEntry, 'feedzirra/parser/rss_entry'
|
5
|
+
|
6
|
+
autoload :ITunesRSS, 'feedzirra/parser/itunes_rss'
|
7
|
+
autoload :ITunesRSSItem, 'feedzirra/parser/itunes_rss_item'
|
8
|
+
autoload :ITunesRSSOwner, 'feedzirra/parser/itunes_rss_owner'
|
9
|
+
|
10
|
+
autoload :Atom, 'feedzirra/parser/atom'
|
11
|
+
autoload :AtomEntry, 'feedzirra/parser/atom_entry'
|
12
|
+
autoload :AtomFeedBurner, 'feedzirra/parser/atom_feed_burner'
|
13
|
+
autoload :AtomFeedBurnerEntry, 'feedzirra/parser/atom_feed_burner_entry'
|
14
|
+
end
|
15
|
+
end
|
@@ -1,35 +1,29 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
|
-
# == Summary
|
5
4
|
# Parser for dealing with Atom feeds.
|
6
|
-
#
|
7
|
-
# == Attributes
|
8
|
-
# * title
|
9
|
-
# * feed_url
|
10
|
-
# * url
|
11
|
-
# * entries
|
12
5
|
class Atom
|
13
6
|
include SAXMachine
|
14
7
|
include FeedUtilities
|
15
8
|
element :title
|
9
|
+
element :subtitle, :as => :description
|
16
10
|
element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
|
17
11
|
element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
|
18
12
|
elements :link, :as => :links, :value => :href
|
19
13
|
elements :entry, :as => :entries, :class => AtomEntry
|
20
14
|
|
21
15
|
def self.able_to_parse?(xml) #:nodoc:
|
22
|
-
|
16
|
+
/\<feed[^\>]+xmlns=[\"|\'](http:\/\/www\.w3\.org\/2005\/Atom|http:\/\/purl\.org\/atom\/ns\#)[\"|\'][^\>]*\>/ =~ xml
|
23
17
|
end
|
24
|
-
|
18
|
+
|
25
19
|
def url
|
26
20
|
@url || links.last
|
27
21
|
end
|
28
|
-
|
22
|
+
|
29
23
|
def feed_url
|
30
|
-
@feed_url
|
24
|
+
@feed_url ||= links.first
|
31
25
|
end
|
32
26
|
end
|
33
27
|
end
|
34
|
-
|
28
|
+
|
35
29
|
end
|
@@ -1,17 +1,7 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
|
-
# == Summary
|
5
4
|
# Parser for dealing with Atom feed entries.
|
6
|
-
#
|
7
|
-
# == Attributes
|
8
|
-
# * title
|
9
|
-
# * url
|
10
|
-
# * author
|
11
|
-
# * content
|
12
|
-
# * summary
|
13
|
-
# * published
|
14
|
-
# * categories
|
15
5
|
class AtomEntry
|
16
6
|
include SAXMachine
|
17
7
|
include FeedEntryUtilities
|
@@ -28,12 +18,12 @@ module Feedzirra
|
|
28
18
|
element :modified, :as => :updated
|
29
19
|
elements :category, :as => :categories, :value => :term
|
30
20
|
elements :link, :as => :links, :value => :href
|
31
|
-
|
21
|
+
|
32
22
|
def url
|
33
|
-
@url
|
23
|
+
@url ||= links.first
|
34
24
|
end
|
35
25
|
end
|
36
26
|
|
37
27
|
end
|
38
|
-
|
28
|
+
|
39
29
|
end
|
@@ -1,27 +1,21 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
|
-
# == Summary
|
5
4
|
# Parser for dealing with Feedburner Atom feeds.
|
6
|
-
#
|
7
|
-
# == Attributes
|
8
|
-
# * title
|
9
|
-
# * feed_url
|
10
|
-
# * url
|
11
|
-
# * entries
|
12
5
|
class AtomFeedBurner
|
13
6
|
include SAXMachine
|
14
7
|
include FeedUtilities
|
15
8
|
element :title
|
9
|
+
element :subtitle, :as => :description
|
16
10
|
element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
|
17
11
|
element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
|
18
12
|
elements :entry, :as => :entries, :class => AtomFeedBurnerEntry
|
19
13
|
|
20
14
|
def self.able_to_parse?(xml) #:nodoc:
|
21
|
-
(
|
15
|
+
((/Atom/ =~ xml) && (/feedburner/ =~ xml)) || false
|
22
16
|
end
|
23
17
|
end
|
24
18
|
|
25
19
|
end
|
26
|
-
|
20
|
+
|
27
21
|
end
|
@@ -1,17 +1,7 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
|
-
# == Summary
|
5
4
|
# Parser for dealing with Feedburner Atom feed entries.
|
6
|
-
#
|
7
|
-
# == Attributes
|
8
|
-
# * title
|
9
|
-
# * url
|
10
|
-
# * author
|
11
|
-
# * content
|
12
|
-
# * summary
|
13
|
-
# * published
|
14
|
-
# * categories
|
15
5
|
class AtomFeedBurnerEntry
|
16
6
|
include SAXMachine
|
17
7
|
include FeedEntryUtilities
|
@@ -28,8 +18,13 @@ module Feedzirra
|
|
28
18
|
element :updated
|
29
19
|
element :modified, :as => :updated
|
30
20
|
elements :category, :as => :categories, :value => :term
|
31
|
-
|
21
|
+
elements :link, :as => :links, :value => :href
|
22
|
+
|
23
|
+
def url
|
24
|
+
@url ||= links.first
|
25
|
+
end
|
32
26
|
|
27
|
+
end
|
33
28
|
end
|
34
|
-
|
29
|
+
|
35
30
|
end
|
@@ -1,5 +1,5 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
4
|
# iTunes is RSS 2.0 + some apple extensions
|
5
5
|
# Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
|
@@ -40,11 +40,11 @@ module Feedzirra
|
|
40
40
|
elements :item, :as => :entries, :class => ITunesRSSItem
|
41
41
|
|
42
42
|
def self.able_to_parse?(xml)
|
43
|
-
|
43
|
+
/xmlns:itunes=\"http:\/\/www.itunes.com\/dtds\/podcast-1.0.dtd\"/i =~ xml
|
44
44
|
end
|
45
45
|
|
46
46
|
end
|
47
|
-
|
47
|
+
|
48
48
|
end
|
49
|
-
|
49
|
+
|
50
50
|
end
|
data/lib/feedzirra/parser/rss.rb
CHANGED
@@ -1,28 +1,22 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
|
-
# == Summary
|
5
4
|
# Parser for dealing with RSS feeds.
|
6
|
-
#
|
7
|
-
# == Attributes
|
8
|
-
# * title
|
9
|
-
# * feed_url
|
10
|
-
# * url
|
11
|
-
# * entries
|
12
5
|
class RSS
|
13
6
|
include SAXMachine
|
14
7
|
include FeedUtilities
|
15
8
|
element :title
|
9
|
+
element :description
|
16
10
|
element :link, :as => :url
|
17
11
|
elements :item, :as => :entries, :class => RSSEntry
|
18
12
|
|
19
13
|
attr_accessor :feed_url
|
20
14
|
|
21
15
|
def self.able_to_parse?(xml) #:nodoc:
|
22
|
-
|
16
|
+
/\<rss|\<rdf/ =~ xml
|
23
17
|
end
|
24
18
|
end
|
25
19
|
|
26
20
|
end
|
27
|
-
|
21
|
+
|
28
22
|
end
|
@@ -1,17 +1,7 @@
|
|
1
1
|
module Feedzirra
|
2
|
-
|
2
|
+
|
3
3
|
module Parser
|
4
|
-
# == Summary
|
5
4
|
# Parser for dealing with RSS and RDF feed entries.
|
6
|
-
#
|
7
|
-
# == Attributes
|
8
|
-
# * title
|
9
|
-
# * url
|
10
|
-
# * author
|
11
|
-
# * content
|
12
|
-
# * summary
|
13
|
-
# * published
|
14
|
-
# * categories
|
15
5
|
class RSSEntry
|
16
6
|
include SAXMachine
|
17
7
|
include FeedEntryUtilities
|
@@ -38,5 +28,5 @@ module Feedzirra
|
|
38
28
|
end
|
39
29
|
|
40
30
|
end
|
41
|
-
|
31
|
+
|
42
32
|
end
|
@@ -0,0 +1,98 @@
|
|
1
|
+
# this is some spike code to compare the speed of different methods for performing
|
2
|
+
# multiple feed fetches
|
3
|
+
require 'rubygems'
|
4
|
+
require 'curb'
|
5
|
+
require 'active_support'
|
6
|
+
|
7
|
+
require 'net/http'
|
8
|
+
require 'uri'
|
9
|
+
|
10
|
+
require 'benchmark'
|
11
|
+
include Benchmark
|
12
|
+
|
13
|
+
GET_COUNT = 1
|
14
|
+
urls = ["http://www.pauldix.net"] * GET_COUNT
|
15
|
+
|
16
|
+
|
17
|
+
benchmark do |t|
|
18
|
+
t.report("taf2-curb") do
|
19
|
+
multi = Curl::Multi.new
|
20
|
+
urls.each do |url|
|
21
|
+
easy = Curl::Easy.new(url) do |curl|
|
22
|
+
curl.headers["User-Agent"] = "feedzirra"
|
23
|
+
# curl.headers["If-Modified-Since"] = Time.now.httpdate
|
24
|
+
# curl.headers["If-None-Match"] = "ziEyTl4q9GH04BR4jgkImd0GvSE"
|
25
|
+
curl.follow_location = true
|
26
|
+
curl.on_success do |c|
|
27
|
+
# puts c.header_str.inspect
|
28
|
+
# puts c.response_code
|
29
|
+
# puts c.body_str.slice(0, 500)
|
30
|
+
end
|
31
|
+
curl.on_failure do |c|
|
32
|
+
puts "**** #{c.response_code}"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
multi.add(easy)
|
36
|
+
end
|
37
|
+
|
38
|
+
multi.perform
|
39
|
+
end
|
40
|
+
|
41
|
+
t.report("nethttp") do
|
42
|
+
urls.each do |url|
|
43
|
+
res = Net::HTTP.get(URI.parse(url))
|
44
|
+
# puts res.slice(0, 500)
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
require 'rfuzz/session'
|
49
|
+
include RFuzz
|
50
|
+
t.report("rfuzz") do
|
51
|
+
GET_COUNT.times do
|
52
|
+
http = HttpClient.new("www.pauldix.net", 80)
|
53
|
+
response = http.get("/")
|
54
|
+
if response.http_status != "200"
|
55
|
+
puts "***** #{response.http_status}"
|
56
|
+
else
|
57
|
+
# puts response.http_status
|
58
|
+
# puts response.http_body.slice(0, 500)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
require 'eventmachine'
|
64
|
+
t.report("eventmachine") do
|
65
|
+
counter = GET_COUNT
|
66
|
+
EM.run do
|
67
|
+
GET_COUNT.times do
|
68
|
+
http = EM::Protocols::HttpClient2.connect("www.pauldix.net", 80)
|
69
|
+
request = http.get("/")
|
70
|
+
request.callback do
|
71
|
+
# puts request.status
|
72
|
+
# puts request.content.slice(0, 500)
|
73
|
+
counter -= 1
|
74
|
+
EM.stop if counter == 0
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
|
81
|
+
require 'curl-multi'
|
82
|
+
t.report("curl multi") do
|
83
|
+
multi = Curl::Multi.new
|
84
|
+
urls.each do |url|
|
85
|
+
on_failure = lambda do |ex|
|
86
|
+
puts "****** Failed to retrieve #{url}"
|
87
|
+
end
|
88
|
+
|
89
|
+
on_success = lambda do |body|
|
90
|
+
# puts "got #{url}"
|
91
|
+
# puts body.slice(0, 500)
|
92
|
+
end
|
93
|
+
multi.get(url, on_success, on_failure)
|
94
|
+
end
|
95
|
+
|
96
|
+
multi.select([], []) while multi.size > 0
|
97
|
+
end
|
98
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../lib/feedzirra.rb'
|
2
|
+
require 'rfeedparser'
|
3
|
+
require 'feed-normalizer'
|
4
|
+
require 'open-uri'
|
5
|
+
|
6
|
+
require 'benchmark'
|
7
|
+
include Benchmark
|
8
|
+
|
9
|
+
iterations = 10
|
10
|
+
urls = File.readlines(File.dirname(__FILE__) + "/../sample_feeds/successful_feed_urls.txt").slice(0, 20)
|
11
|
+
puts "benchmarks on #{urls.size} feeds"
|
12
|
+
puts "************************************"
|
13
|
+
benchmark do |t|
|
14
|
+
t.report("feedzirra") do
|
15
|
+
iterations.times do
|
16
|
+
Feedzirra::Feed.fetch_and_parse(urls, :on_success => lambda { |url, feed| $stdout.print '.'; $stdout.flush })
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
t.report("rfeedparser") do
|
21
|
+
iterations.times do
|
22
|
+
urls.each do |url|
|
23
|
+
feed = FeedParser.parse(url)
|
24
|
+
$stdout.print '.'
|
25
|
+
$stdout.flush
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
t.report("feed-normalizer") do
|
31
|
+
iterations.times do
|
32
|
+
urls.each do |url|
|
33
|
+
# have to use the :force_parser option to make feed-normalizer parse an atom feed
|
34
|
+
feed = FeedNormalizer::FeedNormalizer.parse(open(url), :force_parser => FeedNormalizer::SimpleRssParser)
|
35
|
+
$stdout.print '.'
|
36
|
+
$stdout.flush
|
37
|
+
end
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|