agiley-feedzirra 0.0.24
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +208 -0
- data/Rakefile +56 -0
- data/lib/core_ext/date.rb +21 -0
- data/lib/core_ext/string.rb +9 -0
- data/lib/feedzirra.rb +41 -0
- data/lib/feedzirra/feed.rb +325 -0
- data/lib/feedzirra/feed_entry_utilities.rb +45 -0
- data/lib/feedzirra/feed_utilities.rb +72 -0
- data/lib/feedzirra/parser/atom.rb +35 -0
- data/lib/feedzirra/parser/atom_entry.rb +39 -0
- data/lib/feedzirra/parser/atom_feed_burner.rb +27 -0
- data/lib/feedzirra/parser/atom_feed_burner_entry.rb +35 -0
- data/lib/feedzirra/parser/itunes_rss.rb +50 -0
- data/lib/feedzirra/parser/itunes_rss_item.rb +31 -0
- data/lib/feedzirra/parser/itunes_rss_owner.rb +12 -0
- data/lib/feedzirra/parser/rss.rb +28 -0
- data/lib/feedzirra/parser/rss_entry.rb +42 -0
- data/spec/feedzirra/feed_entry_utilities_spec.rb +52 -0
- data/spec/feedzirra/feed_spec.rb +556 -0
- data/spec/feedzirra/feed_utilities_spec.rb +149 -0
- data/spec/feedzirra/parser/atom_entry_spec.rb +49 -0
- data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +42 -0
- data/spec/feedzirra/parser/atom_feed_burner_spec.rb +39 -0
- data/spec/feedzirra/parser/atom_spec.rb +43 -0
- data/spec/feedzirra/parser/itunes_rss_item_spec.rb +48 -0
- data/spec/feedzirra/parser/itunes_rss_owner_spec.rb +18 -0
- data/spec/feedzirra/parser/itunes_rss_spec.rb +50 -0
- data/spec/feedzirra/parser/rss_entry_spec.rb +41 -0
- data/spec/feedzirra/parser/rss_spec.rb +41 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +58 -0
- metadata +220 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
module Feedzirra
  # Behaviour shared by parsed feed entries: date parsing, a fallback
  # identifier and in-place sanitizing of the textual attributes.
  #
  # Including classes may set <tt>@entry_id</tt> and <tt>@url</tt> and are
  # expected to expose readers for whichever of +title+, +author+, +summary+
  # and +content+ they support.
  module FeedEntryUtilities
    ##
    # Publication time of the entry; falls back to the update time when no
    # publication time has been recorded.
    def published
      @published || @updated
    end

    ##
    # Parses +string+ with DateTime.parse and converts the result through
    # +in_time_zone+ (not defined in this file -- presumably ActiveSupport).
    # Any StandardError raised on the way is swallowed: a notice is printed
    # to standard output and nil is returned.
    def parse_datetime(string)
      DateTime.parse(string).in_time_zone
    rescue
      puts "DATE CAN'T BE PARSED: #{string}"
      nil
    end

    ##
    # Returns the id of the entry, or its url if no id is present, as some
    # formats don't support ids.
    def id
      @entry_id || @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time
    # found. Values that cannot be parsed are ignored instead of being
    # compared (nil does not respond to <), so an unparseable date never
    # raises and never discards a time that was already stored.
    def published=(val)
      parsed = parse_datetime(val)
      return unless parsed

      @published = parsed if !@published || parsed < @published
    end

    ##
    # Writer for updated. By default, we keep the most recent update time
    # found. Values that cannot be parsed are ignored, for the same reason
    # as in #published=.
    def updated=(val)
      parsed = parse_datetime(val)
      return unless parsed

      @updated = parsed if !@updated || parsed > @updated
    end

    ##
    # Sanitizes title, author, summary and content in place by calling
    # +sanitize!+ on each value. Attributes the including class does not
    # define, and attributes whose value is nil/false, are skipped.
    def sanitize!
      %w(title author summary content).each do |attribute|
        next unless respond_to?(attribute)

        value = send(attribute)
        value.sanitize! if value
      end
    end

    alias_method :last_modified, :published
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
module Feedzirra
  # Feed-level helpers: tracking of newly seen entries, a computed
  # last-modified time and merging of a freshly parsed copy of the feed into
  # an existing feed object.
  #
  # Including classes must provide +entries+ plus readers and writers for
  # the attributes named in UPDATABLE_ATTRIBUTES that are not defined here.
  module FeedUtilities
    # Attributes copied over from a freshly parsed feed by #update_from_feed.
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified etag)

    attr_writer   :new_entries, :updated, :last_modified
    attr_accessor :etag

    # Returns the explicitly assigned last-modified value or, failing that,
    # the greatest +published+ value among the entries (nil when no entry
    # has one). The computed value is memoized.
    def last_modified
      @last_modified ||= entries.map { |entry| entry.published }.compact.max
    end

    # Truthy when the most recent #update_from_feed changed at least one of
    # the UPDATABLE_ATTRIBUTES (or when +updated=+ was set to a truthy value).
    def updated?
      @updated
    end

    # Entries discovered by #update_from_feed so far (empty array by default).
    def new_entries
      @new_entries ||= []
    end

    # True when at least one new entry has been recorded.
    def has_new_entries?
      !new_entries.empty?
    end

    # Merges +feed+, a freshly parsed copy of this feed, into the receiver.
    #
    # Entries of +feed+ that precede the receiver's newest entry are appended
    # to +new_entries+ and prepended to +entries+. Only the entries found by
    # THIS call are prepended: prepending the whole accumulated +new_entries+
    # list would insert earlier discoveries a second time on repeated updates.
    #
    # Every attribute in UPDATABLE_ATTRIBUTES is then synchronised, and
    # +updated?+ reflects whether any of them changed.
    def update_from_feed(feed)
      fresh_entries = find_new_entries_for(feed)
      self.new_entries += fresh_entries
      entries.unshift(*fresh_entries)

      # map (not any?) so that every attribute is synchronised even after the
      # first change has been detected.
      changes = UPDATABLE_ATTRIBUTES.map { |name| update_attribute(feed, name) }
      @updated = changes.any?
    end

    # Copies attribute +name+ from +feed+ when it differs from the receiver's
    # value. Returns true when the attribute was changed, false otherwise, so
    # that a change to nil or false is still reported as a change.
    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)
      return false if old_value == new_value

      send("#{name}=", new_value)
      true
    end

    # Calls +sanitize!+ on every entry.
    def sanitize_entries!
      entries.each { |entry| entry.sanitize! }
    end

    private

    # Returns the entries of +feed+ that come before the entry whose url
    # equals the url of the receiver's first entry.
    #
    # This is a deliberate workaround for feeds without published dates: it
    # relies on feeds listing their newest entry first. When the receiver has
    # no entries at all, every entry of +feed+ counts as new.
    def find_new_entries_for(feed)
      return feed.entries if entries.length == 0

      latest_url = entries.first.url
      feed.entries.take_while { |entry| entry.url != latest_url }
    end

    # True when the receiver already holds an entry with the url of
    # +test_entry+.
    def existing_entry?(test_entry)
      entries.any? { |entry| entry.url == test_entry.url }
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # == Summary
    # SAXMachine-based parser for Atom feeds.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * entries
    class Atom
      include SAXMachine
      include FeedUtilities

      element  :title
      element  :link,  :as => :url,      :value => :href, :with => {:type => "text/html"}
      element  :link,  :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :link,  :as => :links,    :value => :href
      elements :entry, :as => :entries,  :class => AtomEntry

      # Truthy (the match offset) when +xml+ mentions "Atom" or the
      # purl.org Atom namespace, nil otherwise.
      def self.able_to_parse?(xml) #:nodoc:
        xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})/
      end

      # Site url: the value captured for the text/html link declaration,
      # otherwise the last collected link href.
      def url
        return @url if @url

        links.last
      end

      # Feed url: the value captured for the application/atom+xml link
      # declaration, otherwise the first collected link href.
      def feed_url
        return @feed_url if @feed_url

        links.first
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # == Summary
    # SAXMachine-based parser for the entries of an Atom feed.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * categories
    class AtomEntry
      include SAXMachine
      include FeedEntryUtilities

      element  :title
      element  :link,     :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element  :name,     :as => :author
      element  :content
      element  :summary
      element  :published
      element  :id,       :as => :entry_id
      # <created> and <issued> feed the same attribute as <published>.
      element  :created,  :as => :published
      element  :issued,   :as => :published
      element  :updated
      # <modified> feeds the same attribute as <updated>.
      element  :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
      elements :link,     :as => :links,      :value => :href

      # Entry url: the value captured for the text/html alternate link
      # declaration, otherwise the first collected link href.
      def url
        return @url if @url

        links.first
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # == Summary
    # SAXMachine-based parser for Atom feeds served through Feedburner.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * entries
    class AtomFeedBurner
      include SAXMachine
      include FeedUtilities

      element  :title
      element  :link,  :as => :url,      :value => :href, :with => {:type => "text/html"}
      element  :link,  :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :entry, :as => :entries,  :class => AtomFeedBurnerEntry

      # Truthy (the offset of "feedburner") when +xml+ mentions both "Atom"
      # and "feedburner"; false otherwise.
      def self.able_to_parse?(xml) #:nodoc:
        return false unless xml =~ /Atom/

        (xml =~ /feedburner/) || false
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # == Summary
    # SAXMachine-based parser for the entries of a Feedburner Atom feed.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * categories
    class AtomFeedBurnerEntry
      include SAXMachine
      include FeedEntryUtilities

      element  :title
      element  :name,     :as => :author
      # Both the alternate text/html link and feedburner:origLink are
      # mapped onto url.
      element  :link,     :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element  :"feedburner:origLink", :as => :url
      element  :summary
      element  :content
      element  :published
      element  :id,       :as => :entry_id
      # <issued> and <created> feed the same attribute as <published>.
      element  :issued,   :as => :published
      element  :created,  :as => :published
      element  :updated
      # <modified> feeds the same attribute as <updated>.
      element  :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # iTunes is RSS 2.0 + some apple extensions
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    #
    # SAXMachine-based parser for podcast feeds that declare the iTunes
    # namespace.
    class ITunesRSS
      include SAXMachine
      include FeedUtilities

      attr_accessor :feed_url

      # RSS 2.0 elements that need including
      element :copyright
      element :description
      element :language
      element :managingEditor
      element :title
      element :link, :as => :url

      # If author is not present use managingEditor on the channel
      element :"itunes:author", :as => :itunes_author
      element :"itunes:block", :as => :itunes_block
      element :"itunes:image", :value => :href, :as => :itunes_image
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      # New URL for the podcast feed
      element :"itunes:new-feed-url", :as => :itunes_new_feed_url
      element :"itunes:subtitle", :as => :itunes_subtitle
      # If summary is not present, use the description tag
      element :"itunes:summary", :as => :itunes_summary

      # iTunes RSS feeds can have multiple main categories...
      # ...and multiple sub-categories per category
      # TODO subcategories not supported correctly - they are at the same level
      #   as the main categories
      elements :"itunes:category", :as => :itunes_categories, :value => :text

      elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner

      elements :item, :as => :entries, :class => ITunesRSSItem

      # Truthy (the match offset) when +xml+ declares the iTunes podcast
      # namespace, nil otherwise.
      #
      # The dots of the namespace URI are escaped so they only match literal
      # dots, and the attribute value may be wrapped in either double or
      # single quotes, both of which are legal in XML.
      def self.able_to_parse?(xml)
        xml =~ %r{xmlns:itunes=["']http://www\.itunes\.com/dtds/podcast-1\.0\.dtd["']}
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # iTunes extensions to the standard RSS2.0 item
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    class ITunesRSSItem
      include SAXMachine
      # An item is an entry, not a feed, so it mixes in the entry helpers
      # (date parsing for published, id, sanitize!) rather than the
      # feed-level FeedUtilities, whose last_modified needs an +entries+
      # collection that an item does not have.
      # NOTE(review): the pubDate string now goes through
      # FeedEntryUtilities#published= instead of being stored raw, assuming
      # SAXMachine keeps a writer that is already defined -- confirm against
      # the SAXMachine version in use and against callers that expect the
      # raw string.
      include FeedEntryUtilities

      element :author
      element :guid
      element :title
      element :link, :as => :url
      element :description, :as => :summary
      element :pubDate, :as => :published

      # If author is not present use author tag on the item
      element :"itunes:author", :as => :itunes_author
      element :"itunes:block", :as => :itunes_block
      element :"itunes:duration", :as => :itunes_duration
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      element :"itunes:subtitle", :as => :itunes_subtitle
      # If summary is not present, use the description tag
      element :"itunes:summary", :as => :itunes_summary
      # The three enclosure declarations each read a different attribute of
      # the same <enclosure> tag.
      element :enclosure, :value => :length, :as => :enclosure_length
      element :enclosure, :value => :type, :as => :enclosure_type
      element :enclosure, :value => :url, :as => :enclosure_url
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # == Summary
    # SAXMachine-based parser for RSS feeds.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * entries
    class RSS
      include SAXMachine
      include FeedUtilities

      # feed_url is not read from the document; it is a plain accessor.
      attr_accessor :feed_url

      element  :title
      element  :link, :as => :url
      elements :item, :as => :entries, :class => RSSEntry

      # Truthy (the match offset) when +xml+ contains an opening "<rss" or
      # "<rdf", nil otherwise.
      def self.able_to_parse?(xml) #:nodoc:
        xml =~ /\<rss|\<rdf/
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # == Summary
    # SAXMachine-based parser for the items of an RSS/RDF feed.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * categories
    class RSSEntry
      include SAXMachine
      include FeedEntryUtilities

      element  :title
      element  :link, :as => :url

      # Author may come from either dc:creator or a plain <author> tag.
      element  :"dc:creator",       :as => :author
      element  :author,             :as => :author
      element  :"content:encoded",  :as => :content
      element  :description,        :as => :summary

      # Every date tag below feeds the published attribute.
      element  :pubDate,            :as => :published
      element  :pubdate,            :as => :published
      element  :"dc:date",          :as => :published
      element  :"dc:Date",          :as => :published
      element  :"dcterms:created",  :as => :published

      element  :"dcterms:modified", :as => :updated
      element  :issued,             :as => :published
      elements :category,           :as => :categories

      element  :guid,               :as => :entry_id
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
# Exercises the entry-level mixin: the anonymous class below includes
# Feedzirra::FeedEntryUtilities, so that is the module being described.
describe Feedzirra::FeedEntryUtilities do
  before(:each) do
    @klass = Class.new do
      include Feedzirra::FeedEntryUtilities
    end
  end

  describe "handling dates" do
    it "should parse an ISO 8601 formatted datetime into Time" do
      time = @klass.new.parse_datetime("2008-02-20T8:05:00-010:00")
      time.class.should == Time
      time.to_s.should == "Wed Feb 20 18:05:00 UTC 2008"
    end
  end

  describe "sanitizing" do
    before(:each) do
      @feed = Feedzirra::Feed.parse(sample_atom_feed)
      @entry = @feed.entries.first
    end

    it "should provide a sanitized title" do
      new_title = "<script>this is not safe</script>" + @entry.title
      @entry.title = new_title
      @entry.title.sanitize.should == Loofah.scrub_fragment(new_title, :prune).to_s
    end

    it "should sanitize content in place" do
      new_content = "<script>" + @entry.content
      # dup so the expectation is computed from an untouched copy
      @entry.content = new_content.dup
      @entry.content.sanitize!.should == Loofah.scrub_fragment(new_content, :prune).to_s
      @entry.content.should == Loofah.scrub_fragment(new_content, :prune).to_s
    end

    it "should sanitize things in place" do
      @entry.title   += "<script>"
      @entry.author  += "<script>"
      @entry.content += "<script>"

      # expected values are computed before sanitize! mutates the entry
      cleaned_title   = Loofah.scrub_fragment(@entry.title,   :prune).to_s
      cleaned_author  = Loofah.scrub_fragment(@entry.author,  :prune).to_s
      cleaned_content = Loofah.scrub_fragment(@entry.content, :prune).to_s

      @entry.sanitize!
      @entry.title.should   == cleaned_title
      @entry.author.should  == cleaned_author
      @entry.content.should == cleaned_content
    end
  end
end
|