eric-feedzirra 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,45 @@
1
module Feedzirra
  # Helpers mixed into feed entry classes: timestamp parsing, merge-aware
  # writers for +published+ and +updated+, an +id+ fallback and in-place
  # sanitizing.
  #
  # #sanitize! expects the including class to provide +title+, +author+,
  # +summary+ and +content+ readers.
  module FeedEntryUtilities
    ##
    # Returns the published time, falling back to the updated time when no
    # published time has been recorded.
    def published
      @published || @updated
    end

    ##
    # Parses +string+ with DateTime.parse and converts the result with
    # DateTime#feed_utils_to_gm_time (an extension defined outside this file).
    #
    # Parsing is best effort: when the value cannot be parsed a warning is
    # written to standard error and nil is returned instead of raising.
    def parse_datetime(string)
      DateTime.parse(string).feed_utils_to_gm_time
    rescue StandardError
      warn "DATE CAN'T BE PARSED: #{string}"
      nil
    end

    ##
    # Returns the id of the entry, or its url if no id is present, as some
    # formats don't support ids.
    def id
      @id || @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time found.
    # A value that cannot be parsed is ignored and never replaces (or is
    # compared with) a previously stored time.
    def published=(val)
      parsed = parse_datetime(val)
      @published = parsed if parsed && (!@published || parsed < @published)
    end

    ##
    # Writer for updated. By default, we keep the most recent update time found.
    # A value that cannot be parsed is ignored and never replaces (or is
    # compared with) a previously stored time.
    def updated=(val)
      parsed = parse_datetime(val)
      @updated = parsed if parsed && (!@updated || parsed > @updated)
    end

    ##
    # Sanitizes title, author, summary and content in place, skipping any of
    # them that is nil.
    def sanitize!
      title.sanitize! if title
      author.sanitize! if author
      summary.sanitize! if summary
      content.sanitize! if content
    end

    # An entry's last-modified time is its published time (see #published).
    alias_method :last_modified, :published
  end
end
@@ -0,0 +1,71 @@
1
module Feedzirra
  # Feed-level helpers mixed into parser classes: last-modified calculation,
  # tracking of newly seen entries and merging in a freshly fetched copy of
  # the same feed.
  #
  # The including class must provide +entries+ plus readers and writers for
  # every name in UPDATABLE_ATTRIBUTES that is not defined here.
  module FeedUtilities
    # Attributes copied from the freshly fetched feed by #update_from_feed.
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified)

    attr_writer :new_entries, :updated, :last_modified
    attr_accessor :etag

    # Returns the explicitly assigned last-modified value or, failing that,
    # the most recent +published+ value among the entries (nil when no entry
    # has one). A non-nil result is memoized.
    def last_modified
      @last_modified ||= entries.map { |entry| entry.published }.compact.max
    end

    # Truthy once #update_from_feed has changed at least one attribute, or
    # after the +updated+ writer was given a truthy value.
    def updated?
      @updated
    end

    # Entries discovered by #update_from_feed so far (an empty array initially).
    def new_entries
      @new_entries ||= []
    end

    # True when at least one new entry has been recorded.
    def has_new_entries?
      !new_entries.empty?
    end

    # Merges a freshly fetched +feed+ into this one: entries not seen before
    # are recorded in #new_entries and prepended to #entries, then every
    # attribute in UPDATABLE_ATTRIBUTES is synchronised.
    #
    # Returns true when at least one attribute changed, nil otherwise.
    def update_from_feed(feed)
      fresh_entries = find_new_entries_for(feed)
      self.new_entries += fresh_entries
      # Prepend only what this call discovered; prepending the accumulated
      # new_entries would duplicate entries recorded by earlier updates.
      entries.unshift(*fresh_entries)

      # map (not any? with a block) so that every attribute is synchronised;
      # any? would stop at the first attribute that changed.
      changes = UPDATABLE_ATTRIBUTES.map { |name| update_attribute(feed, name) }
      updated! if changes.any?
    end

    # Copies attribute +name+ from +feed+ when its value differs from ours.
    #
    # Returns true when the attribute was changed (including a change to nil),
    # false otherwise.
    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)
      return false if old_value == new_value

      send("#{name}=", new_value)
      true
    end

    # Calls +sanitize!+ on every entry.
    def sanitize_entries!
      entries.each { |entry| entry.sanitize! }
    end

    private

    def updated!
      @updated = true
    end

    # Returns the entries of +feed+ that precede our newest known entry.
    def find_new_entries_for(feed)
      # this implementation is a hack, which is why it's so ugly.
      # it's to get around the fact that not all feeds have a published date.
      # however, they're always ordered with the newest one first.
      # So we go through the entries just parsed and collect each one as a new
      # entry until we get to one that has the same url as the newest for the feed
      latest_entry = entries.first
      # Nothing known yet: every parsed entry is new.
      return feed.entries.dup unless latest_entry

      feed.entries.take_while { |entry| entry.url != latest_entry.url }
    end

    def existing_entry?(test_entry)
      entries.any? { |entry| entry.url == test_entry.url }
    end
  end
end
@@ -0,0 +1,46 @@
1
+ module Feedzirra
2
+ # Parser for iTunes podcast feeds: RSS 2.0 plus Apple's itunes: extension elements.
3
+ # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
4
+ class ITunesRSS
5
+ include SAXMachine
6
+ include FeedUtilities
7
+
8
+ attr_accessor :feed_url
9
+
10
+ # Plain RSS 2.0 channel elements exposed by this parser
11
+ element :copyright
12
+ element :description
13
+ element :language
14
+ element :managingEditor
15
+ element :title
16
+ element :link, :as => :url
17
+
18
+ # When itunes:author is absent, callers should fall back to managingEditor (above); no fallback is done here
19
+ element :"itunes:author", :as => :itunes_author
20
+ element :"itunes:block", :as => :itunes_block
21
+ element :"itunes:image", :value => :href, :as => :itunes_image
22
+ element :"itunes:explicit", :as => :itunes_explicit
23
+ element :"itunes:keywords", :as => :itunes_keywords
24
+ # New URL for the podcast feed
25
+ element :"itunes:new-feed-url", :as => :itunes_new_feed_url
26
+ element :"itunes:subtitle", :as => :itunes_subtitle
27
+ # When itunes:summary is absent, callers should fall back to description (above); no fallback is done here
28
+ element :"itunes:summary", :as => :itunes_summary
29
+
30
+ # iTunes RSS feeds can have multiple main categories...
31
+ # ...and multiple sub-categories per category
32
+ # TODO: subcategories are not supported correctly - they end up at the same level
33
+ # as the main categories
34
+ elements :"itunes:category", :as => :itunes_categories, :value => :text
35
+
36
+ elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner
37
+
38
+ elements :item, :as => :entries, :class => ITunesRSSItem
39
+
40
+ def self.able_to_parse?(xml) # Match index (truthy) when xml declares the iTunes podcast namespace, nil otherwise. NOTE(review): the dots in the pattern are unescaped, so each matches any character.
41
+ xml =~ /xmlns:itunes=\"http:\/\/www.itunes.com\/dtds\/podcast-1.0.dtd\"/
42
+ end
43
+
44
+ end
45
+
46
+ end
@@ -0,0 +1,28 @@
1
+ module Feedzirra
2
+ # Parser for one <item> of an iTunes podcast feed: the standard RSS 2.0 item plus iTunes extensions
3
+ # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
4
+ class ITunesRSSItem
5
+ include SAXMachine
6
+ include FeedUtilities # NOTE(review): this is an entry class, yet it mixes in the feed-level FeedUtilities (RSSEntry mixes in FeedEntryUtilities); FeedUtilities defines neither sanitize! nor a parsing published= writer - confirm which module is intended
7
+ element :author
8
+ element :guid
9
+ element :title
10
+ element :link, :as => :url
11
+ element :description, :as => :summary
12
+ element :pubDate, :as => :published
13
+
14
+ # When itunes:author is absent, callers should fall back to author (above); no fallback is done here
15
+ element :"itunes:author", :as => :itunes_author
16
+ element :"itunes:block", :as => :itunes_block
17
+ element :"itunes:duration", :as => :itunes_duration
18
+ element :"itunes:explicit", :as => :itunes_explicit
19
+ element :"itunes:keywords", :as => :itunes_keywords
20
+ element :"itunes:subtitle", :as => :itunes_subtitle
21
+ # When itunes:summary is absent, callers should fall back to summary (the description tag); no fallback is done here
22
+ element :"itunes:summary", :as => :itunes_summary
23
+ element :enclosure, :value => :length, :as => :enclosure_length
24
+ element :enclosure, :value => :type, :as => :enclosure_type
25
+ element :enclosure, :value => :url, :as => :enclosure_url
26
+ end
27
+
28
+ end
@@ -0,0 +1,8 @@
1
+ module Feedzirra
2
+ class ITunesRSSOwner # Name and email of a podcast owner; ITunesRSS uses this class for its itunes:owner elements
3
+ include SAXMachine
4
+ include FeedUtilities # NOTE(review): an owner has no entries, so the feed-level helpers look unused here - confirm this mixin is needed
5
+ element :"itunes:name", :as => :name
6
+ element :"itunes:email", :as => :email
7
+ end
8
+ end
@@ -0,0 +1,23 @@
1
module Feedzirra
  # == Summary
  # Parser for dealing with RSS feeds.
  #
  # == Attributes
  # * title
  # * feed_url
  # * url
  # * entries
  class RSS
    include SAXMachine
    include FeedUtilities
    element :title
    element :link, :as => :url
    elements :item, :as => :entries, :class => RSSEntry

    attr_accessor :feed_url

    # Returns the match index (truthy) when +xml+ contains an opening +<rss+
    # or +<rdf+ tag, nil otherwise.
    #
    # Both alternatives are anchored to "<": in the previous pattern the
    # alternation made a bare "rdf" anywhere in the document count as a match,
    # so any feed that merely mentioned "rdf" was claimed by this parser.
    def self.able_to_parse?(xml) #:nodoc:
      xml =~ /<rss|<rdf/
    end
  end
end
@@ -0,0 +1,35 @@
1
+ module Feedzirra
2
+ # == Summary
3
+ # Parser for dealing with RSS/RDF feed entries (the item elements of Feedzirra::RSS).
4
+ #
5
+ # == Attributes
6
+ # * title
7
+ # * url
8
+ # * author
9
+ # * content
10
+ # * summary
11
+ # * published (also: updated, from dcterms:modified)
12
+ # * categories (also: id, from guid)
13
+ class RSSEntry
14
+ include SAXMachine
15
+ include FeedEntryUtilities
16
+ element :title
17
+ element :link, :as => :url
18
+
19
+ element :"dc:creator", :as => :author
20
+ element :"content:encoded", :as => :content
21
+ element :description, :as => :summary
22
+
23
+ element :pubDate, :as => :published # several date elements feed the same attribute; presumably routed through FeedEntryUtilities#published=, which keeps the oldest - confirm against SAXMachine
24
+ element :"dc:date", :as => :published
25
+ element :"dc:Date", :as => :published
26
+ element :"dcterms:created", :as => :published
27
+
28
+
29
+ element :"dcterms:modified", :as => :updated
30
+ element :issued, :as => :published
31
+ elements :category, :as => :categories
32
+
33
+ element :guid, :as => :id
34
+ end
35
+ end
@@ -0,0 +1,45 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe Feedzirra::AtomEntry do
4
+ before(:each) do
5
+ # Ideally these unit tests would rely only on AtomEntry, but parsing a whole feed
6
+ # is how it actually works in practice: you would never pass entry xml straight to AtomEntry
7
+ @entry = Feedzirra::Atom.parse(sample_atom_feed).entries.first
8
+ end
9
+
10
+ it "should parse the title" do
11
+ @entry.title.should == "AWS Job: Architect & Designer Position in Turkey"
12
+ end
13
+
14
+ it "should parse the url" do
15
+ @entry.url.should == "http://aws.typepad.com/aws/2009/01/aws-job-architect-designer-position-in-turkey.html"
16
+ end
17
+
18
+ it "should parse the author" do
19
+ @entry.author.should == "AWS Editor"
20
+ end
21
+
22
+ it "should parse the content" do
23
+ @entry.content.should == sample_atom_entry_content
24
+ end
25
+
26
+ it "should provide a summary" do
27
+ @entry.summary.should == "Late last year an entrepreneur from Turkey visited me at Amazon HQ in Seattle. We talked about his plans to use AWS as part of his new social video portal startup. I won't spill any beans before he's ready to..."
28
+ end
29
+
30
+ it "should parse the published date" do
31
+ @entry.published.to_s.should == "Fri Jan 16 18:21:00 UTC 2009"
32
+ end
33
+
34
+ it "should parse the categories" do
35
+ @entry.categories.should == ['Turkey', 'Seattle']
36
+ end
37
+
38
+ it "should parse the updated date" do
39
+ @entry.updated.to_s.should == "Fri Jan 16 18:21:00 UTC 2009"
40
+ end
41
+
42
+ it "should parse the id" do
43
+ @entry.id.should == "tag:typepad.com,2003:post-61484736"
44
+ end
45
+ end
@@ -0,0 +1,42 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe Feedzirra::AtomFeedBurnerEntry do
4
+ before(:each) do
5
+ # Ideally these unit tests would rely only on AtomFeedBurnerEntry, but parsing a whole feed
6
+ # is how it actually works in practice: you would never pass entry xml straight to AtomFeedBurnerEntry
7
+ @entry = Feedzirra::AtomFeedBurner.parse(sample_feedburner_atom_feed).entries.first
8
+ end
9
+
10
+ it "should parse the title" do
11
+ @entry.title.should == "Making a Ruby C library even faster"
12
+ end
13
+
14
+ it "should be able to fetch a url via the 'alternate' rel if no origLink exists" do
15
+ entry = Feedzirra::AtomFeedBurner.parse(File.read("#{File.dirname(__FILE__)}/../sample_feeds/PaulDixExplainsNothingAlternate.xml")).entries.first
16
+ entry.url.should == 'http://feeds.feedburner.com/~r/PaulDixExplainsNothing/~3/519925023/making-a-ruby-c-library-even-faster.html'
17
+ end
18
+
19
+ it "should parse the url" do
20
+ @entry.url.should == "http://www.pauldix.net/2009/01/making-a-ruby-c-library-even-faster.html"
21
+ end
22
+
23
+ it "should parse the author" do
24
+ @entry.author.should == "Paul Dix"
25
+ end
26
+
27
+ it "should parse the content" do
28
+ @entry.content.should == sample_feedburner_atom_entry_content
29
+ end
30
+
31
+ it "should provide a summary" do
32
+ @entry.summary.should == "Last week I released the first version of a SAX based XML parsing library called SAX-Machine. It uses Nokogiri, which uses libxml, so it's pretty fast. However, I felt that it could be even faster. The only question was how..."
33
+ end
34
+
35
+ it "should parse the published date" do
36
+ @entry.published.to_s.should == "Thu Jan 22 15:50:22 UTC 2009"
37
+ end
38
+
39
+ it "should parse the categories" do
40
+ @entry.categories.should == ['Ruby', 'Another Category']
41
+ end
42
+ end
@@ -0,0 +1,39 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe Feedzirra::AtomFeedBurner do
4
+ describe "#will_parse?" do # NOTE: despite the label, these examples exercise the class-level able_to_parse? predicate through the be_able_to_parse matcher
5
+ it "should return true for a feedburner atom feed" do
6
+ Feedzirra::AtomFeedBurner.should be_able_to_parse(sample_feedburner_atom_feed)
7
+ end
8
+
9
+ it "should return false for an rdf feed" do
10
+ Feedzirra::AtomFeedBurner.should_not be_able_to_parse(sample_rdf_feed)
11
+ end
12
+
13
+ it "should return false for a regular atom feed" do
14
+ Feedzirra::AtomFeedBurner.should_not be_able_to_parse(sample_atom_feed)
15
+ end
16
+ end
17
+
18
+ describe "parsing" do
19
+ before(:each) do
20
+ @feed = Feedzirra::AtomFeedBurner.parse(sample_feedburner_atom_feed)
21
+ end
22
+
23
+ it "should parse the title" do
24
+ @feed.title.should == "Paul Dix Explains Nothing"
25
+ end
26
+
27
+ it "should parse the url" do
28
+ @feed.url.should == "http://www.pauldix.net/"
29
+ end
30
+
31
+ it "should parse the feed_url" do
32
+ @feed.feed_url.should == "http://feeds.feedburner.com/PaulDixExplainsNothing"
33
+ end
34
+
35
+ it "should parse entries" do
36
+ @feed.entries.size.should == 5
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,35 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe Feedzirra::Atom do
4
+ describe "#will_parse?" do # NOTE: despite the label, these examples exercise the class-level able_to_parse? predicate through the be_able_to_parse matcher
5
+ it "should return true for an atom feed" do
6
+ Feedzirra::Atom.should be_able_to_parse(sample_atom_feed)
7
+ end
8
+
9
+ it "should return false for an rdf feed" do
10
+ Feedzirra::Atom.should_not be_able_to_parse(sample_rdf_feed)
11
+ end
12
+ end
13
+
14
+ describe "parsing" do
15
+ before(:each) do
16
+ @feed = Feedzirra::Atom.parse(sample_atom_feed)
17
+ end
18
+
19
+ it "should parse the title" do
20
+ @feed.title.should == "Amazon Web Services Blog"
21
+ end
22
+
23
+ it "should parse the url" do
24
+ @feed.url.should == "http://aws.typepad.com/aws/"
25
+ end
26
+
27
+ it "should parse the feed_url" do
28
+ @feed.feed_url.should == "http://aws.typepad.com/aws/atom.xml"
29
+ end
30
+
31
+ it "should parse entries" do
32
+ @feed.entries.size.should == 10
33
+ end
34
+ end
35
+ end
@@ -0,0 +1,52 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe Feedzirra::FeedUtilities do # NOTE(review): the behaviour exercised below (parse_datetime, sanitize!) comes from FeedEntryUtilities, which is what @klass mixes in - the describe target looks mislabelled
4
+ before(:each) do
5
+ @klass = Class.new do # anonymous host class for the mixin under test
6
+ include Feedzirra::FeedEntryUtilities
7
+ end
8
+ end
9
+
10
+ describe "handling dates" do
11
+ it "should parse an ISO 8601 formatted datetime into Time" do
12
+ time = @klass.new.parse_datetime("2008-02-20T8:05:00-010:00")
13
+ time.class.should == Time
14
+ time.to_s.should == "Wed Feb 20 18:05:00 UTC 2008"
15
+ end
16
+ end
17
+
18
+ describe "sanitizing" do
19
+ before(:each) do
20
+ @feed = Feedzirra::Feed.parse(sample_atom_feed)
21
+ @entry = @feed.entries.first
22
+ end
23
+
24
+ it "should provide a sanitized title" do
25
+ new_title = "<script>" + @entry.title
26
+ @entry.title = new_title
27
+ @entry.title.sanitize.should == Dryopteris.sanitize(new_title)
28
+ end
29
+
30
+ it "should sanitize content in place" do
31
+ new_content = "<script>" + @entry.content
32
+ @entry.content = new_content.dup
33
+ @entry.content.sanitize!.should == Dryopteris.sanitize(new_content)
34
+ @entry.content.should == Dryopteris.sanitize(new_content)
35
+ end
36
+
37
+ it "should sanitize things in place" do
38
+ @entry.title += "<script>"
39
+ @entry.author += "<script>"
40
+ @entry.content += "<script>"
41
+
42
+ cleaned_title = Dryopteris.sanitize(@entry.title)
43
+ cleaned_author = Dryopteris.sanitize(@entry.author)
44
+ cleaned_content = Dryopteris.sanitize(@entry.content)
45
+
46
+ @entry.sanitize!
47
+ @entry.title.should == cleaned_title
48
+ @entry.author.should == cleaned_author
49
+ @entry.content.should == cleaned_content
50
+ end
51
+ end
52
+ end