mwilliams-feedzirra 0.0.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,45 @@
1
+ module Feedzirra
2
+ module FeedEntryUtilities
3
+ def published
4
+ @published || @updated
5
+ end
6
+
7
+ def parse_datetime(string)
8
+ begin
9
+ DateTime.parse(string).feed_utils_to_gm_time
10
+ rescue
11
+ puts "DATE CAN'T BE PARSED: #{string}"
12
+ nil
13
+ end
14
+ end
15
+
16
+ ##
17
+ # Returns the id of the entry or its url if not id is present, as some formats don't support it
18
+ def id
19
+ @id || @url
20
+ end
21
+
22
+ ##
23
+ # Writter for published. By default, we keep the "oldest" publish time found.
24
+ def published=(val)
25
+ parsed = parse_datetime(val)
26
+ @published = parsed if !@published || parsed < @published
27
+ end
28
+
29
+ ##
30
+ # Writter for udapted. By default, we keep the most recenet update time found.
31
+ def updated=(val)
32
+ parsed = parse_datetime(val)
33
+ @updated = parsed if !@updated || parsed > @updated
34
+ end
35
+
36
+ def sanitize!
37
+ self.title.sanitize! if self.title
38
+ self.author.sanitize! if self.author
39
+ self.summary.sanitize! if self.summary
40
+ self.content.sanitize! if self.content
41
+ end
42
+
43
+ alias_method :last_modified, :published
44
+ end
45
+ end
@@ -0,0 +1,71 @@
1
+ module Feedzirra
2
+ module FeedUtilities
3
+ UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified)
4
+
5
+ attr_writer :new_entries, :updated, :last_modified
6
+ attr_accessor :etag
7
+
8
+ def last_modified
9
+ @last_modified ||= begin
10
+ entry = entries.reject {|e| e.published.nil? }.sort_by { |entry| entry.published if entry.published }.last
11
+ entry ? entry.published : nil
12
+ end
13
+ end
14
+
15
+ def updated?
16
+ @updated
17
+ end
18
+
19
+ def new_entries
20
+ @new_entries ||= []
21
+ end
22
+
23
+ def has_new_entries?
24
+ new_entries.size > 0
25
+ end
26
+
27
+ def update_from_feed(feed)
28
+ self.new_entries += find_new_entries_for(feed)
29
+ self.entries.unshift(*self.new_entries)
30
+
31
+ updated! if UPDATABLE_ATTRIBUTES.any? { |name| update_attribute(feed, name) }
32
+ end
33
+
34
+ def update_attribute(feed, name)
35
+ old_value, new_value = send(name), feed.send(name)
36
+
37
+ if old_value != new_value
38
+ send("#{name}=", new_value)
39
+ end
40
+ end
41
+
42
+ def sanitize_entries!
43
+ entries.each {|entry| entry.sanitize!}
44
+ end
45
+
46
+ private
47
+
48
+ def updated!
49
+ @updated = true
50
+ end
51
+
52
+ def find_new_entries_for(feed)
53
+ # this implementation is a hack, which is why it's so ugly.
54
+ # it's to get around the fact that not all feeds have a published date.
55
+ # however, they're always ordered with the newest one first.
56
+ # So we go through the entries just parsed and insert each one as a new entry
57
+ # until we get to one that has the same url as the the newest for the feed
58
+ latest_entry = self.entries.first
59
+ found_new_entries = []
60
+ feed.entries.each do |entry|
61
+ break if entry.url == latest_entry.url
62
+ found_new_entries << entry
63
+ end
64
+ found_new_entries
65
+ end
66
+
67
+ def existing_entry?(test_entry)
68
+ entries.any? { |entry| entry.url == test_entry.url }
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,26 @@
1
+ module Feedzirra
2
+
3
+ module Parser
4
+ # == Summary
5
+ # Parser for dealing with Atom feeds.
6
+ #
7
+ # == Attributes
8
+ # * title
9
+ # * feed_url
10
+ # * url
11
+ # * entries
12
+ class Atom
13
+ include SAXMachine
14
+ include FeedUtilities
15
+ element :title
16
+ element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
17
+ element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
18
+ elements :entry, :as => :entries, :class => AtomEntry
19
+
20
+ def self.able_to_parse?(xml) #:nodoc:
21
+ xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})/
22
+ end
23
+ end
24
+ end
25
+
26
+ end
@@ -0,0 +1,34 @@
1
+ module Feedzirra
2
+
3
+ module Parser
4
+ # == Summary
5
+ # Parser for dealing with Atom feed entries.
6
+ #
7
+ # == Attributes
8
+ # * title
9
+ # * url
10
+ # * author
11
+ # * content
12
+ # * summary
13
+ # * published
14
+ # * categories
15
+ class AtomEntry
16
+ include SAXMachine
17
+ include FeedEntryUtilities
18
+ element :title
19
+ element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
20
+ element :name, :as => :author
21
+ element :content
22
+ element :summary
23
+ element :published
24
+ element :id
25
+ element :created, :as => :published
26
+ element :issued, :as => :published
27
+ element :updated
28
+ element :modified, :as => :updated
29
+ elements :category, :as => :categories, :value => :term
30
+ end
31
+
32
+ end
33
+
34
+ end
@@ -0,0 +1,27 @@
1
+ module Feedzirra
2
+
3
+ module Parser
4
+ # == Summary
5
+ # Parser for dealing with Feedburner Atom feeds.
6
+ #
7
+ # == Attributes
8
+ # * title
9
+ # * feed_url
10
+ # * url
11
+ # * entries
12
+ class AtomFeedBurner
13
+ include SAXMachine
14
+ include FeedUtilities
15
+ element :title
16
+ element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
17
+ element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
18
+ elements :entry, :as => :entries, :class => AtomFeedBurnerEntry
19
+
20
+ def self.able_to_parse?(xml) #:nodoc:
21
+ (xml =~ /Atom/ && xml =~ /feedburner/) || false
22
+ end
23
+ end
24
+
25
+ end
26
+
27
+ end
@@ -0,0 +1,35 @@
1
+ module Feedzirra
2
+
3
+ module Parser
4
+ # == Summary
5
+ # Parser for dealing with Feedburner Atom feed entries.
6
+ #
7
+ # == Attributes
8
+ # * title
9
+ # * url
10
+ # * author
11
+ # * content
12
+ # * summary
13
+ # * published
14
+ # * categories
15
+ class AtomFeedBurnerEntry
16
+ include SAXMachine
17
+ include FeedEntryUtilities
18
+ element :title
19
+ element :name, :as => :author
20
+ element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
21
+ element :"feedburner:origLink", :as => :url
22
+ element :summary
23
+ element :content
24
+ element :published
25
+ element :id
26
+ element :issued, :as => :published
27
+ element :created, :as => :published
28
+ element :updated
29
+ element :modified, :as => :updated
30
+ elements :category, :as => :categories, :value => :term
31
+ end
32
+
33
+ end
34
+
35
+ end
@@ -0,0 +1,50 @@
1
+ module Feedzirra
2
+
3
+ module Parser
4
+ # iTunes is RSS 2.0 + some apple extensions
5
+ # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
6
+ class ITunesRSS
7
+ include SAXMachine
8
+ include FeedUtilities
9
+
10
+ attr_accessor :feed_url
11
+
12
+ # RSS 2.0 elements that need including
13
+ element :copyright
14
+ element :description
15
+ element :language
16
+ element :managingEditor
17
+ element :title
18
+ element :link, :as => :url
19
+
20
+ # If author is not present use managingEditor on the channel
21
+ element :"itunes:author", :as => :itunes_author
22
+ element :"itunes:block", :as => :itunes_block
23
+ element :"itunes:image", :value => :href, :as => :itunes_image
24
+ element :"itunes:explicit", :as => :itunes_explicit
25
+ element :"itunes:keywords", :as => :itunes_keywords
26
+ # New URL for the podcast feed
27
+ element :"itunes:new-feed-url", :as => :itunes_new_feed_url
28
+ element :"itunes:subtitle", :as => :itunes_subtitle
29
+ # If summary is not present, use the description tag
30
+ element :"itunes:summary", :as => :itunes_summary
31
+
32
+ # iTunes RSS feeds can have multiple main categories...
33
+ # ...and multiple sub-categories per category
34
+ # TODO subcategories not supported correctly - they are at the same level
35
+ # as the main categories
36
+ elements :"itunes:category", :as => :itunes_categories, :value => :text
37
+
38
+ elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner
39
+
40
+ elements :item, :as => :entries, :class => ITunesRSSItem
41
+
42
+ def self.able_to_parse?(xml)
43
+ xml =~ /xmlns:itunes=\"http:\/\/www.itunes.com\/dtds\/podcast-1.0.dtd\"/
44
+ end
45
+
46
+ end
47
+
48
+ end
49
+
50
+ end
@@ -0,0 +1,31 @@
1
+ module Feedzirra
2
+
3
+ module Parser
4
+ # iTunes extensions to the standard RSS2.0 item
5
+ # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
6
+ class ITunesRSSItem
7
+ include SAXMachine
8
+ include FeedUtilities
9
+ element :author
10
+ element :guid
11
+ element :title
12
+ element :link, :as => :url
13
+ element :description, :as => :summary
14
+ element :pubDate, :as => :published
15
+
16
+ # If author is not present use author tag on the item
17
+ element :"itunes:author", :as => :itunes_author
18
+ element :"itunes:block", :as => :itunes_block
19
+ element :"itunes:duration", :as => :itunes_duration
20
+ element :"itunes:explicit", :as => :itunes_explicit
21
+ element :"itunes:keywords", :as => :itunes_keywords
22
+ element :"itunes:subtitle", :as => :itunes_subtitle
23
+ # If summary is not present, use the description tag
24
+ element :"itunes:summary", :as => :itunes_summary
25
+ element :enclosure, :value => :length, :as => :enclosure_length
26
+ element :enclosure, :value => :type, :as => :enclosure_type
27
+ element :enclosure, :value => :url, :as => :enclosure_url
28
+ end
29
+ end
30
+
31
+ end
@@ -0,0 +1,12 @@
1
+ module Feedzirra
2
+
3
+ module Parser
4
+ class ITunesRSSOwner
5
+ include SAXMachine
6
+ include FeedUtilities
7
+ element :"itunes:name", :as => :name
8
+ element :"itunes:email", :as => :email
9
+ end
10
+ end
11
+
12
+ end
@@ -0,0 +1,28 @@
1
+ module Feedzirra
2
+
3
+ module Parser
4
+ # == Summary
5
+ # Parser for dealing with RSS feeds.
6
+ #
7
+ # == Attributes
8
+ # * title
9
+ # * feed_url
10
+ # * url
11
+ # * entries
12
+ class RSS
13
+ include SAXMachine
14
+ include FeedUtilities
15
+ element :title
16
+ element :link, :as => :url
17
+ elements :item, :as => :entries, :class => RSSEntry
18
+
19
+ attr_accessor :feed_url
20
+
21
+ def self.able_to_parse?(xml) #:nodoc:
22
+ xml =~ /\<rss|rdf/
23
+ end
24
+ end
25
+
26
+ end
27
+
28
+ end
@@ -0,0 +1,40 @@
1
+ module Feedzirra
2
+
3
+ module Parser
4
+ # == Summary
5
+ # Parser for dealing with RDF feed entries.
6
+ #
7
+ # == Attributes
8
+ # * title
9
+ # * url
10
+ # * author
11
+ # * content
12
+ # * summary
13
+ # * published
14
+ # * categories
15
+ class RSSEntry
16
+ include SAXMachine
17
+ include FeedEntryUtilities
18
+ element :title
19
+ element :link, :as => :url
20
+
21
+ element :"dc:creator", :as => :author
22
+ element :"content:encoded", :as => :content
23
+ element :description, :as => :summary
24
+
25
+ element :pubDate, :as => :published
26
+ element :"dc:date", :as => :published
27
+ element :"dc:Date", :as => :published
28
+ element :"dcterms:created", :as => :published
29
+
30
+
31
+ element :"dcterms:modified", :as => :updated
32
+ element :issued, :as => :published
33
+ elements :category, :as => :categories
34
+
35
+ element :guid, :as => :id
36
+ end
37
+
38
+ end
39
+
40
+ end
@@ -0,0 +1,52 @@
1
+ require File.dirname(__FILE__) + '/../spec_helper'
2
+
3
+ describe Feedzirra::FeedUtilities do
4
+ before(:each) do
5
+ @klass = Class.new do
6
+ include Feedzirra::FeedEntryUtilities
7
+ end
8
+ end
9
+
10
+ describe "handling dates" do
11
+ it "should parse an ISO 8601 formatted datetime into Time" do
12
+ time = @klass.new.parse_datetime("2008-02-20T8:05:00-010:00")
13
+ time.class.should == Time
14
+ time.to_s.should == "Wed Feb 20 18:05:00 UTC 2008"
15
+ end
16
+ end
17
+
18
+ describe "sanitizing" do
19
+ before(:each) do
20
+ @feed = Feedzirra::Feed.parse(sample_atom_feed)
21
+ @entry = @feed.entries.first
22
+ end
23
+
24
+ it "should provide a sanitized title" do
25
+ new_title = "<script>" + @entry.title
26
+ @entry.title = new_title
27
+ @entry.title.sanitize.should == Dryopteris.sanitize(new_title)
28
+ end
29
+
30
+ it "should sanitize content in place" do
31
+ new_content = "<script>" + @entry.content
32
+ @entry.content = new_content.dup
33
+ @entry.content.sanitize!.should == Dryopteris.sanitize(new_content)
34
+ @entry.content.should == Dryopteris.sanitize(new_content)
35
+ end
36
+
37
+ it "should sanitize things in place" do
38
+ @entry.title += "<script>"
39
+ @entry.author += "<script>"
40
+ @entry.content += "<script>"
41
+
42
+ cleaned_title = Dryopteris.sanitize(@entry.title)
43
+ cleaned_author = Dryopteris.sanitize(@entry.author)
44
+ cleaned_content = Dryopteris.sanitize(@entry.content)
45
+
46
+ @entry.sanitize!
47
+ @entry.title.should == cleaned_title
48
+ @entry.author.should == cleaned_author
49
+ @entry.content.should == cleaned_content
50
+ end
51
+ end
52
+ end