eric-feedzirra 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +186 -0
- data/Rakefile +53 -0
- data/lib/core_ext/date.rb +21 -0
- data/lib/core_ext/string.rb +9 -0
- data/lib/feedzirra.rb +34 -0
- data/lib/feedzirra/atom.rb +22 -0
- data/lib/feedzirra/atom_entry.rb +29 -0
- data/lib/feedzirra/atom_feed_burner.rb +22 -0
- data/lib/feedzirra/atom_feed_burner_entry.rb +30 -0
- data/lib/feedzirra/feed.rb +321 -0
- data/lib/feedzirra/feed_entry_utilities.rb +45 -0
- data/lib/feedzirra/feed_utilities.rb +71 -0
- data/lib/feedzirra/itunes_rss.rb +46 -0
- data/lib/feedzirra/itunes_rss_item.rb +28 -0
- data/lib/feedzirra/itunes_rss_owner.rb +8 -0
- data/lib/feedzirra/rss.rb +23 -0
- data/lib/feedzirra/rss_entry.rb +35 -0
- data/spec/feedzirra/atom_entry_spec.rb +45 -0
- data/spec/feedzirra/atom_feed_burner_entry_spec.rb +42 -0
- data/spec/feedzirra/atom_feed_burner_spec.rb +39 -0
- data/spec/feedzirra/atom_spec.rb +35 -0
- data/spec/feedzirra/feed_entry_utilities_spec.rb +52 -0
- data/spec/feedzirra/feed_spec.rb +543 -0
- data/spec/feedzirra/feed_utilities_spec.rb +149 -0
- data/spec/feedzirra/itunes_rss_item_spec.rb +48 -0
- data/spec/feedzirra/itunes_rss_owner_spec.rb +18 -0
- data/spec/feedzirra/itunes_rss_spec.rb +50 -0
- data/spec/feedzirra/rss_entry_spec.rb +41 -0
- data/spec/feedzirra/rss_spec.rb +41 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +58 -0
- metadata +142 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
module Feedzirra
  ##
  # Helpers shared by feed entry classes: date parsing, an id fallback,
  # "keep the oldest/newest" date writers and in-place sanitizing.
  module FeedEntryUtilities
    ##
    # Returns the published time, falling back to the updated time when no
    # published time has been stored.
    def published
      @published || @updated
    end

    ##
    # Parses +string+ with DateTime.parse and converts the result with
    # +feed_utils_to_gm_time+ (defined outside this module).
    # Returns nil, after printing a diagnostic, when anything raises.
    def parse_datetime(string)
      begin
        DateTime.parse(string).feed_utils_to_gm_time
      rescue
        puts "DATE CAN'T BE PARSED: #{string}"
        nil
      end
    end

    ##
    # Returns the id of the entry, or its url if no id is present, as some formats don't support it
    def id
      @id || @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time found.
    # Values that cannot be parsed are ignored, so they can neither raise on
    # the comparison below nor replace a time that is already stored.
    def published=(val)
      parsed = parse_datetime(val)
      return if parsed.nil?
      @published = parsed if !@published || parsed < @published
    end

    ##
    # Writer for updated. By default, we keep the most recent update time found.
    # Values that cannot be parsed are ignored (see published=).
    def updated=(val)
      parsed = parse_datetime(val)
      return if parsed.nil?
      @updated = parsed if !@updated || parsed > @updated
    end

    ##
    # Calls sanitize! on title, author, summary and content, skipping the
    # ones that are nil. The including class must respond to all four.
    def sanitize!
      self.title.sanitize! if self.title
      self.author.sanitize! if self.author
      self.summary.sanitize! if self.summary
      self.content.sanitize! if self.content
    end

    alias_method :last_modified, :published
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Feedzirra
  ##
  # Helpers shared by feed classes: the list of new entries, the "updated"
  # flag, and merging a freshly parsed copy of a feed into an existing one.
  # The including class must provide +entries+ plus readers and writers for
  # title, feed_url and url.
  module FeedUtilities
    # Attributes copied over from the fresh feed by update_from_feed.
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified)

    attr_writer   :new_entries, :updated, :last_modified
    attr_accessor :etag

    ##
    # Returns the explicitly assigned value if there is one, otherwise the
    # greatest +published+ value among the entries (nil when no entry has one).
    def last_modified
      @last_modified ||= entries.map { |entry| entry.published }.compact.max
    end

    ##
    # Returns the updated flag: whatever was assigned through updated=, or
    # true once update_from_feed has changed an attribute.
    def updated?
      @updated
    end

    ##
    # Entries collected by update_from_feed (an empty array by default).
    def new_entries
      @new_entries ||= []
    end

    def has_new_entries?
      new_entries.size > 0
    end

    ##
    # Merges +feed+ (a fresh copy of this feed) into self: entries that were
    # not known yet are appended to new_entries and placed at the front of
    # entries, then every attribute in UPDATABLE_ATTRIBUTES is copied over.
    # Marks the feed as updated when at least one attribute changed.
    def update_from_feed(feed)
      found = find_new_entries_for(feed)
      self.new_entries += found
      # Only the entries found by this call are inserted; unshifting the whole
      # new_entries list would duplicate entries added by an earlier call.
      entries.unshift(*found)

      # map (not any?) so that every attribute is processed; any? with a block
      # stops at the first changed attribute and leaves the rest stale.
      changes = UPDATABLE_ATTRIBUTES.map { |name| update_attribute(feed, name) }
      updated! if changes.any?
    end

    ##
    # Copies attribute +name+ from +feed+ when it differs from our value.
    # Returns the new value when it was copied, nil otherwise.
    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)

      if old_value != new_value
        send("#{name}=", new_value)
      end
    end

    def sanitize_entries!
      entries.each {|entry| entry.sanitize!}
    end

    private

    def updated!
      @updated = true
    end

    def find_new_entries_for(feed)
      # this implementation is a hack, which is why it's so ugly.
      # it's to get around the fact that not all feeds have a published date.
      # however, they're always ordered with the newest one first.
      # So we go through the entries just parsed and insert each one as a new entry
      # until we get to one that has the same url as the newest for the feed
      latest_entry = entries.first
      # Nothing stored yet: every parsed entry is new.
      return feed.entries.dup if latest_entry.nil?

      found_new_entries = []
      feed.entries.each do |entry|
        break if entry.url == latest_entry.url
        found_new_entries << entry
      end
      found_new_entries
    end

    def existing_entry?(test_entry)
      entries.any? { |entry| entry.url == test_entry.url }
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Feedzirra
  # iTunes is RSS 2.0 + some apple extensions
  # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
  class ITunesRSS
    include SAXMachine
    include FeedUtilities

    # Namespace declaration that marks a document as an iTunes podcast feed.
    # The dots are escaped so that they only match a literal ".".
    ITUNES_NAMESPACE = %r{xmlns:itunes="http://www\.itunes\.com/dtds/podcast-1\.0\.dtd"}

    attr_accessor :feed_url

    # RSS 2.0 elements that need including
    element :copyright
    element :description
    element :language
    element :managingEditor
    element :title
    element :link, :as => :url

    # If author is not present use managingEditor on the channel
    element :"itunes:author", :as => :itunes_author
    element :"itunes:block", :as => :itunes_block
    element :"itunes:image", :value => :href, :as => :itunes_image
    element :"itunes:explicit", :as => :itunes_explicit
    element :"itunes:keywords", :as => :itunes_keywords
    # New URL for the podcast feed
    element :"itunes:new-feed-url", :as => :itunes_new_feed_url
    element :"itunes:subtitle", :as => :itunes_subtitle
    # If summary is not present, use the description tag
    element :"itunes:summary", :as => :itunes_summary

    # iTunes RSS feeds can have multiple main categories...
    # ...and multiple sub-categories per category
    # TODO subcategories not supported correctly - they are at the same level
    #   as the main categories
    elements :"itunes:category", :as => :itunes_categories, :value => :text

    elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner

    elements :item, :as => :entries, :class => ITunesRSSItem

    ##
    # Returns a truthy value (the match offset) when +xml+ declares the
    # iTunes podcast namespace, nil otherwise.
    def self.able_to_parse?(xml)
      xml =~ ITUNES_NAMESPACE
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Feedzirra
  # iTunes extensions to the standard RSS2.0 item
  # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
  #
  # NOTE(review): this class mixes in FeedUtilities (the feed-level helpers)
  # rather than FeedEntryUtilities, so items do not get the entry helpers
  # such as sanitize! or the date-parsing published= writer. Confirm whether
  # that is intended before relying on either.
  class ITunesRSSItem
    include SAXMachine
    include FeedUtilities

    # Plain RSS 2.0 item elements.
    element :author
    element :guid
    element :title
    element :link, :as => :url
    element :description, :as => :summary
    element :pubDate, :as => :published

    # itunes:* extension elements, each exposed as itunes_<name>.
    # If itunes:author is not present use the author tag on the item;
    # if itunes:summary is not present, use the description tag.
    %w(author block duration explicit keywords subtitle summary).each do |name|
      element :"itunes:#{name}", :as => :"itunes_#{name}"
    end

    # Attributes of the enclosure element, each exposed as enclosure_<attribute>.
    [:length, :type, :url].each do |attribute|
      element :enclosure, :value => attribute, :as => :"enclosure_#{attribute}"
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Feedzirra
  # == Summary
  # Parser for dealing with RSS feeds.
  #
  # == Attributes
  # * title
  # * feed_url
  # * url
  # * entries
  class RSS
    include SAXMachine
    include FeedUtilities
    element :title
    element :link, :as => :url
    elements :item, :as => :entries, :class => RSSEntry

    attr_accessor :feed_url

    # Returns a truthy value (the match offset) when +xml+ contains an
    # opening <rss or <rdf tag, nil otherwise. Both alternatives carry the
    # "<": without it, the bare text "rdf" anywhere in a document (for
    # example inside an entry of some other feed format) would match.
    def self.able_to_parse?(xml) #:nodoc:
      xml =~ /<rss|<rdf/
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Feedzirra
  # == Summary
  # Parser for dealing with RDF feed entries.
  #
  # == Attributes
  # * title
  # * url
  # * author
  # * content
  # * summary
  # * published
  # * categories
  class RSSEntry
    include SAXMachine
    include FeedEntryUtilities

    element :title
    element :link, :as => :url

    element :"dc:creator", :as => :author
    element :"content:encoded", :as => :content
    element :description, :as => :summary

    # Each of these tags is declared as a source for +published+.
    [:pubDate, :"dc:date", :"dc:Date", :"dcterms:created"].each do |tag|
      element tag, :as => :published
    end

    element :"dcterms:modified", :as => :updated
    element :issued, :as => :published
    elements :category, :as => :categories

    element :guid, :as => :id
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
describe Feedzirra::AtomEntry do
  before(:each) do
    # Ideally these unit tests would rely on AtomEntry alone, but going through
    # the feed parser is how entries are really built: entry xml is never
    # passed straight to AtomEntry.
    feed = Feedzirra::Atom.parse(sample_atom_feed)
    @entry = feed.entries.first
  end

  it("should parse the title") { @entry.title.should == "AWS Job: Architect & Designer Position in Turkey" }

  it("should parse the url") do
    expected = "http://aws.typepad.com/aws/2009/01/aws-job-architect-designer-position-in-turkey.html"
    @entry.url.should == expected
  end

  it("should parse the author") { @entry.author.should == "AWS Editor" }

  it("should parse the content") { @entry.content.should == sample_atom_entry_content }

  it("should provide a summary") do
    expected = "Late last year an entrepreneur from Turkey visited me at Amazon HQ in Seattle. We talked about his plans to use AWS as part of his new social video portal startup. I won't spill any beans before he's ready to..."
    @entry.summary.should == expected
  end

  it("should parse the published date") { @entry.published.to_s.should == "Fri Jan 16 18:21:00 UTC 2009" }

  it("should parse the categories") { @entry.categories.should == ['Turkey', 'Seattle'] }

  it("should parse the updated date") { @entry.updated.to_s.should == "Fri Jan 16 18:21:00 UTC 2009" }

  it("should parse the id") { @entry.id.should == "tag:typepad.com,2003:post-61484736" }
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
describe Feedzirra::AtomFeedBurnerEntry do
  before(:each) do
    # Ideally these unit tests would rely on the entry class alone, but going
    # through the feed parser is how entries are really built: entry xml is
    # never passed straight to the entry class.
    feed = Feedzirra::AtomFeedBurner.parse(sample_feedburner_atom_feed)
    @entry = feed.entries.first
  end

  it("should parse the title") { @entry.title.should == "Making a Ruby C library even faster" }

  it("should be able to fetch a url via the 'alternate' rel if no origLink exists") do
    xml = File.read("#{File.dirname(__FILE__)}/../sample_feeds/PaulDixExplainsNothingAlternate.xml")
    entry = Feedzirra::AtomFeedBurner.parse(xml).entries.first
    entry.url.should == 'http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~3/519925023/making-a-ruby-c-library-even-faster.html'
  end

  it("should parse the url") do
    @entry.url.should == "http://www.pauldix.net/2009/01/making-a-ruby-c-library-even-faster.html"
  end

  it("should parse the author") { @entry.author.should == "Paul Dix" }

  it("should parse the content") { @entry.content.should == sample_feedburner_atom_entry_content }

  it("should provide a summary") do
    expected = "Last week I released the first version of a SAX based XML parsing library called SAX-Machine. It uses Nokogiri, which uses libxml, so it's pretty fast. However, I felt that it could be even faster. The only question was how..."
    @entry.summary.should == expected
  end

  it("should parse the published date") { @entry.published.to_s.should == "Thu Jan 22 15:50:22 UTC 2009" }

  it("should parse the categories") { @entry.categories.should == ['Ruby', 'Another Category'] }
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
describe Feedzirra::AtomFeedBurner do
  # The be_able_to_parse matcher calls the class method able_to_parse?, so the
  # group is labelled after that method.
  describe ".able_to_parse?" do
    it "should return true for a feedburner atom feed" do
      Feedzirra::AtomFeedBurner.should be_able_to_parse(sample_feedburner_atom_feed)
    end

    it "should return false for an rdf feed" do
      Feedzirra::AtomFeedBurner.should_not be_able_to_parse(sample_rdf_feed)
    end

    it "should return false for a regular atom feed" do
      Feedzirra::AtomFeedBurner.should_not be_able_to_parse(sample_atom_feed)
    end
  end

  describe "parsing" do
    before(:each) do
      @feed = Feedzirra::AtomFeedBurner.parse(sample_feedburner_atom_feed)
    end

    it "should parse the title" do
      @feed.title.should == "Paul Dix Explains Nothing"
    end

    it "should parse the url" do
      @feed.url.should == "http://www.pauldix.net/"
    end

    it "should parse the feed_url" do
      @feed.feed_url.should == "http://feeds.feedburner.com/PaulDixExplainsNothing"
    end

    it "should parse entries" do
      @feed.entries.size.should == 5
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
describe Feedzirra::Atom do
  # The be_able_to_parse matcher calls the class method able_to_parse?, so the
  # group is labelled after that method.
  describe ".able_to_parse?" do
    it "should return true for an atom feed" do
      Feedzirra::Atom.should be_able_to_parse(sample_atom_feed)
    end

    it "should return false for an rdf feed" do
      Feedzirra::Atom.should_not be_able_to_parse(sample_rdf_feed)
    end
  end

  describe "parsing" do
    before(:each) do
      @feed = Feedzirra::Atom.parse(sample_atom_feed)
    end

    it "should parse the title" do
      @feed.title.should == "Amazon Web Services Blog"
    end

    it "should parse the url" do
      @feed.url.should == "http://aws.typepad.com/aws/"
    end

    it "should parse the feed_url" do
      @feed.feed_url.should == "http://aws.typepad.com/aws/atom.xml"
    end

    it "should parse entries" do
      @feed.entries.size.should == 10
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
# Every example here exercises FeedEntryUtilities (mixed into @klass, or
# reached through a parsed entry), so that is the module being described.
describe Feedzirra::FeedEntryUtilities do
  before(:each) do
    @klass = Class.new do
      include Feedzirra::FeedEntryUtilities
    end
  end

  describe "handling dates" do
    it "should parse an ISO 8601 formatted datetime into Time" do
      time = @klass.new.parse_datetime("2008-02-20T8:05:00-010:00")
      time.class.should == Time
      time.to_s.should == "Wed Feb 20 18:05:00 UTC 2008"
    end
  end

  describe "sanitizing" do
    before(:each) do
      @feed = Feedzirra::Feed.parse(sample_atom_feed)
      @entry = @feed.entries.first
    end

    it "should provide a sanitized title" do
      new_title = "<script>" + @entry.title
      @entry.title = new_title
      @entry.title.sanitize.should == Dryopteris.sanitize(new_title)
    end

    it "should sanitize content in place" do
      new_content = "<script>" + @entry.content
      # dup so that the in-place sanitize! cannot alter the string used for
      # the expected value
      @entry.content = new_content.dup
      @entry.content.sanitize!.should == Dryopteris.sanitize(new_content)
      @entry.content.should == Dryopteris.sanitize(new_content)
    end

    it "should sanitize things in place" do
      @entry.title += "<script>"
      @entry.author += "<script>"
      @entry.content += "<script>"

      cleaned_title = Dryopteris.sanitize(@entry.title)
      cleaned_author = Dryopteris.sanitize(@entry.author)
      cleaned_content = Dryopteris.sanitize(@entry.content)

      @entry.sanitize!
      @entry.title.should == cleaned_title
      @entry.author.should == cleaned_author
      @entry.content.should == cleaned_content
    end
  end
end
|