mwilliams-feedzirra 0.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +196 -0
- data/Rakefile +56 -0
- data/lib/core_ext/date.rb +21 -0
- data/lib/core_ext/string.rb +9 -0
- data/lib/feedzirra.rb +48 -0
- data/lib/feedzirra/feed.rb +390 -0
- data/lib/feedzirra/feed_entry_utilities.rb +45 -0
- data/lib/feedzirra/feed_utilities.rb +71 -0
- data/lib/feedzirra/parser/atom.rb +26 -0
- data/lib/feedzirra/parser/atom_entry.rb +34 -0
- data/lib/feedzirra/parser/atom_feed_burner.rb +27 -0
- data/lib/feedzirra/parser/atom_feed_burner_entry.rb +35 -0
- data/lib/feedzirra/parser/itunes_rss.rb +50 -0
- data/lib/feedzirra/parser/itunes_rss_item.rb +31 -0
- data/lib/feedzirra/parser/itunes_rss_owner.rb +12 -0
- data/lib/feedzirra/parser/rss.rb +28 -0
- data/lib/feedzirra/parser/rss_entry.rb +40 -0
- data/spec/feedzirra/feed_entry_utilities_spec.rb +52 -0
- data/spec/feedzirra/feed_spec.rb +578 -0
- data/spec/feedzirra/feed_utilities_spec.rb +149 -0
- data/spec/feedzirra/parser/atom_entry_spec.rb +45 -0
- data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +42 -0
- data/spec/feedzirra/parser/atom_feed_burner_spec.rb +39 -0
- data/spec/feedzirra/parser/atom_spec.rb +35 -0
- data/spec/feedzirra/parser/itunes_rss_item_spec.rb +48 -0
- data/spec/feedzirra/parser/itunes_rss_owner_spec.rb +18 -0
- data/spec/feedzirra/parser/itunes_rss_spec.rb +50 -0
- data/spec/feedzirra/parser/rss_entry_spec.rb +41 -0
- data/spec/feedzirra/parser/rss_spec.rb +41 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +58 -0
- metadata +132 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
module Feedzirra
|
2
|
+
module FeedEntryUtilities
|
3
|
+
def published
|
4
|
+
@published || @updated
|
5
|
+
end
|
6
|
+
|
7
|
+
def parse_datetime(string)
|
8
|
+
begin
|
9
|
+
DateTime.parse(string).feed_utils_to_gm_time
|
10
|
+
rescue
|
11
|
+
puts "DATE CAN'T BE PARSED: #{string}"
|
12
|
+
nil
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
##
|
17
|
+
# Returns the id of the entry or its url if not id is present, as some formats don't support it
|
18
|
+
def id
|
19
|
+
@id || @url
|
20
|
+
end
|
21
|
+
|
22
|
+
##
|
23
|
+
# Writter for published. By default, we keep the "oldest" publish time found.
|
24
|
+
def published=(val)
|
25
|
+
parsed = parse_datetime(val)
|
26
|
+
@published = parsed if !@published || parsed < @published
|
27
|
+
end
|
28
|
+
|
29
|
+
##
|
30
|
+
# Writter for udapted. By default, we keep the most recenet update time found.
|
31
|
+
def updated=(val)
|
32
|
+
parsed = parse_datetime(val)
|
33
|
+
@updated = parsed if !@updated || parsed > @updated
|
34
|
+
end
|
35
|
+
|
36
|
+
def sanitize!
|
37
|
+
self.title.sanitize! if self.title
|
38
|
+
self.author.sanitize! if self.author
|
39
|
+
self.summary.sanitize! if self.summary
|
40
|
+
self.content.sanitize! if self.content
|
41
|
+
end
|
42
|
+
|
43
|
+
alias_method :last_modified, :published
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Feedzirra
|
2
|
+
module FeedUtilities
|
3
|
+
UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified)
|
4
|
+
|
5
|
+
attr_writer :new_entries, :updated, :last_modified
|
6
|
+
attr_accessor :etag
|
7
|
+
|
8
|
+
def last_modified
|
9
|
+
@last_modified ||= begin
|
10
|
+
entry = entries.reject {|e| e.published.nil? }.sort_by { |entry| entry.published if entry.published }.last
|
11
|
+
entry ? entry.published : nil
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
def updated?
|
16
|
+
@updated
|
17
|
+
end
|
18
|
+
|
19
|
+
def new_entries
|
20
|
+
@new_entries ||= []
|
21
|
+
end
|
22
|
+
|
23
|
+
def has_new_entries?
|
24
|
+
new_entries.size > 0
|
25
|
+
end
|
26
|
+
|
27
|
+
def update_from_feed(feed)
|
28
|
+
self.new_entries += find_new_entries_for(feed)
|
29
|
+
self.entries.unshift(*self.new_entries)
|
30
|
+
|
31
|
+
updated! if UPDATABLE_ATTRIBUTES.any? { |name| update_attribute(feed, name) }
|
32
|
+
end
|
33
|
+
|
34
|
+
def update_attribute(feed, name)
|
35
|
+
old_value, new_value = send(name), feed.send(name)
|
36
|
+
|
37
|
+
if old_value != new_value
|
38
|
+
send("#{name}=", new_value)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def sanitize_entries!
|
43
|
+
entries.each {|entry| entry.sanitize!}
|
44
|
+
end
|
45
|
+
|
46
|
+
private
|
47
|
+
|
48
|
+
def updated!
|
49
|
+
@updated = true
|
50
|
+
end
|
51
|
+
|
52
|
+
def find_new_entries_for(feed)
|
53
|
+
# this implementation is a hack, which is why it's so ugly.
|
54
|
+
# it's to get around the fact that not all feeds have a published date.
|
55
|
+
# however, they're always ordered with the newest one first.
|
56
|
+
# So we go through the entries just parsed and insert each one as a new entry
|
57
|
+
# until we get to one that has the same url as the the newest for the feed
|
58
|
+
latest_entry = self.entries.first
|
59
|
+
found_new_entries = []
|
60
|
+
feed.entries.each do |entry|
|
61
|
+
break if entry.url == latest_entry.url
|
62
|
+
found_new_entries << entry
|
63
|
+
end
|
64
|
+
found_new_entries
|
65
|
+
end
|
66
|
+
|
67
|
+
def existing_entry?(test_entry)
|
68
|
+
entries.any? { |entry| entry.url == test_entry.url }
|
69
|
+
end
|
70
|
+
end
|
71
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Feedzirra
|
2
|
+
|
3
|
+
module Parser
|
4
|
+
# == Summary
|
5
|
+
# Parser for dealing with Atom feeds.
|
6
|
+
#
|
7
|
+
# == Attributes
|
8
|
+
# * title
|
9
|
+
# * feed_url
|
10
|
+
# * url
|
11
|
+
# * entries
|
12
|
+
class Atom
|
13
|
+
include SAXMachine
|
14
|
+
include FeedUtilities
|
15
|
+
element :title
|
16
|
+
element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
|
17
|
+
element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
|
18
|
+
elements :entry, :as => :entries, :class => AtomEntry
|
19
|
+
|
20
|
+
def self.able_to_parse?(xml) #:nodoc:
|
21
|
+
xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})/
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
module Feedzirra
|
2
|
+
|
3
|
+
module Parser
|
4
|
+
# == Summary
|
5
|
+
# Parser for dealing with Atom feed entries.
|
6
|
+
#
|
7
|
+
# == Attributes
|
8
|
+
# * title
|
9
|
+
# * url
|
10
|
+
# * author
|
11
|
+
# * content
|
12
|
+
# * summary
|
13
|
+
# * published
|
14
|
+
# * categories
|
15
|
+
class AtomEntry
|
16
|
+
include SAXMachine
|
17
|
+
include FeedEntryUtilities
|
18
|
+
element :title
|
19
|
+
element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
|
20
|
+
element :name, :as => :author
|
21
|
+
element :content
|
22
|
+
element :summary
|
23
|
+
element :published
|
24
|
+
element :id
|
25
|
+
element :created, :as => :published
|
26
|
+
element :issued, :as => :published
|
27
|
+
element :updated
|
28
|
+
element :modified, :as => :updated
|
29
|
+
elements :category, :as => :categories, :value => :term
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Feedzirra
|
2
|
+
|
3
|
+
module Parser
|
4
|
+
# == Summary
|
5
|
+
# Parser for dealing with Feedburner Atom feeds.
|
6
|
+
#
|
7
|
+
# == Attributes
|
8
|
+
# * title
|
9
|
+
# * feed_url
|
10
|
+
# * url
|
11
|
+
# * entries
|
12
|
+
class AtomFeedBurner
|
13
|
+
include SAXMachine
|
14
|
+
include FeedUtilities
|
15
|
+
element :title
|
16
|
+
element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
|
17
|
+
element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
|
18
|
+
elements :entry, :as => :entries, :class => AtomFeedBurnerEntry
|
19
|
+
|
20
|
+
def self.able_to_parse?(xml) #:nodoc:
|
21
|
+
(xml =~ /Atom/ && xml =~ /feedburner/) || false
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Feedzirra
|
2
|
+
|
3
|
+
module Parser
|
4
|
+
# == Summary
|
5
|
+
# Parser for dealing with Feedburner Atom feed entries.
|
6
|
+
#
|
7
|
+
# == Attributes
|
8
|
+
# * title
|
9
|
+
# * url
|
10
|
+
# * author
|
11
|
+
# * content
|
12
|
+
# * summary
|
13
|
+
# * published
|
14
|
+
# * categories
|
15
|
+
class AtomFeedBurnerEntry
|
16
|
+
include SAXMachine
|
17
|
+
include FeedEntryUtilities
|
18
|
+
element :title
|
19
|
+
element :name, :as => :author
|
20
|
+
element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
|
21
|
+
element :"feedburner:origLink", :as => :url
|
22
|
+
element :summary
|
23
|
+
element :content
|
24
|
+
element :published
|
25
|
+
element :id
|
26
|
+
element :issued, :as => :published
|
27
|
+
element :created, :as => :published
|
28
|
+
element :updated
|
29
|
+
element :modified, :as => :updated
|
30
|
+
elements :category, :as => :categories, :value => :term
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Feedzirra
|
2
|
+
|
3
|
+
module Parser
|
4
|
+
# iTunes is RSS 2.0 + some apple extensions
|
5
|
+
# Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
|
6
|
+
class ITunesRSS
|
7
|
+
include SAXMachine
|
8
|
+
include FeedUtilities
|
9
|
+
|
10
|
+
attr_accessor :feed_url
|
11
|
+
|
12
|
+
# RSS 2.0 elements that need including
|
13
|
+
element :copyright
|
14
|
+
element :description
|
15
|
+
element :language
|
16
|
+
element :managingEditor
|
17
|
+
element :title
|
18
|
+
element :link, :as => :url
|
19
|
+
|
20
|
+
# If author is not present use managingEditor on the channel
|
21
|
+
element :"itunes:author", :as => :itunes_author
|
22
|
+
element :"itunes:block", :as => :itunes_block
|
23
|
+
element :"itunes:image", :value => :href, :as => :itunes_image
|
24
|
+
element :"itunes:explicit", :as => :itunes_explicit
|
25
|
+
element :"itunes:keywords", :as => :itunes_keywords
|
26
|
+
# New URL for the podcast feed
|
27
|
+
element :"itunes:new-feed-url", :as => :itunes_new_feed_url
|
28
|
+
element :"itunes:subtitle", :as => :itunes_subtitle
|
29
|
+
# If summary is not present, use the description tag
|
30
|
+
element :"itunes:summary", :as => :itunes_summary
|
31
|
+
|
32
|
+
# iTunes RSS feeds can have multiple main categories...
|
33
|
+
# ...and multiple sub-categories per category
|
34
|
+
# TODO subcategories not supported correctly - they are at the same level
|
35
|
+
# as the main categories
|
36
|
+
elements :"itunes:category", :as => :itunes_categories, :value => :text
|
37
|
+
|
38
|
+
elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner
|
39
|
+
|
40
|
+
elements :item, :as => :entries, :class => ITunesRSSItem
|
41
|
+
|
42
|
+
def self.able_to_parse?(xml)
|
43
|
+
xml =~ /xmlns:itunes=\"http:\/\/www.itunes.com\/dtds\/podcast-1.0.dtd\"/
|
44
|
+
end
|
45
|
+
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Feedzirra
|
2
|
+
|
3
|
+
module Parser
|
4
|
+
# iTunes extensions to the standard RSS2.0 item
|
5
|
+
# Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
|
6
|
+
class ITunesRSSItem
|
7
|
+
include SAXMachine
|
8
|
+
include FeedUtilities
|
9
|
+
element :author
|
10
|
+
element :guid
|
11
|
+
element :title
|
12
|
+
element :link, :as => :url
|
13
|
+
element :description, :as => :summary
|
14
|
+
element :pubDate, :as => :published
|
15
|
+
|
16
|
+
# If author is not present use author tag on the item
|
17
|
+
element :"itunes:author", :as => :itunes_author
|
18
|
+
element :"itunes:block", :as => :itunes_block
|
19
|
+
element :"itunes:duration", :as => :itunes_duration
|
20
|
+
element :"itunes:explicit", :as => :itunes_explicit
|
21
|
+
element :"itunes:keywords", :as => :itunes_keywords
|
22
|
+
element :"itunes:subtitle", :as => :itunes_subtitle
|
23
|
+
# If summary is not present, use the description tag
|
24
|
+
element :"itunes:summary", :as => :itunes_summary
|
25
|
+
element :enclosure, :value => :length, :as => :enclosure_length
|
26
|
+
element :enclosure, :value => :type, :as => :enclosure_type
|
27
|
+
element :enclosure, :value => :url, :as => :enclosure_url
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Feedzirra
|
2
|
+
|
3
|
+
module Parser
|
4
|
+
# == Summary
|
5
|
+
# Parser for dealing with RSS feeds.
|
6
|
+
#
|
7
|
+
# == Attributes
|
8
|
+
# * title
|
9
|
+
# * feed_url
|
10
|
+
# * url
|
11
|
+
# * entries
|
12
|
+
class RSS
|
13
|
+
include SAXMachine
|
14
|
+
include FeedUtilities
|
15
|
+
element :title
|
16
|
+
element :link, :as => :url
|
17
|
+
elements :item, :as => :entries, :class => RSSEntry
|
18
|
+
|
19
|
+
attr_accessor :feed_url
|
20
|
+
|
21
|
+
def self.able_to_parse?(xml) #:nodoc:
|
22
|
+
xml =~ /\<rss|rdf/
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Feedzirra
|
2
|
+
|
3
|
+
module Parser
|
4
|
+
# == Summary
|
5
|
+
# Parser for dealing with RDF feed entries.
|
6
|
+
#
|
7
|
+
# == Attributes
|
8
|
+
# * title
|
9
|
+
# * url
|
10
|
+
# * author
|
11
|
+
# * content
|
12
|
+
# * summary
|
13
|
+
# * published
|
14
|
+
# * categories
|
15
|
+
class RSSEntry
|
16
|
+
include SAXMachine
|
17
|
+
include FeedEntryUtilities
|
18
|
+
element :title
|
19
|
+
element :link, :as => :url
|
20
|
+
|
21
|
+
element :"dc:creator", :as => :author
|
22
|
+
element :"content:encoded", :as => :content
|
23
|
+
element :description, :as => :summary
|
24
|
+
|
25
|
+
element :pubDate, :as => :published
|
26
|
+
element :"dc:date", :as => :published
|
27
|
+
element :"dc:Date", :as => :published
|
28
|
+
element :"dcterms:created", :as => :published
|
29
|
+
|
30
|
+
|
31
|
+
element :"dcterms:modified", :as => :updated
|
32
|
+
element :issued, :as => :published
|
33
|
+
elements :category, :as => :categories
|
34
|
+
|
35
|
+
element :guid, :as => :id
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'
|
2
|
+
|
3
|
+
describe Feedzirra::FeedUtilities do
|
4
|
+
before(:each) do
|
5
|
+
@klass = Class.new do
|
6
|
+
include Feedzirra::FeedEntryUtilities
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
10
|
+
describe "handling dates" do
|
11
|
+
it "should parse an ISO 8601 formatted datetime into Time" do
|
12
|
+
time = @klass.new.parse_datetime("2008-02-20T8:05:00-010:00")
|
13
|
+
time.class.should == Time
|
14
|
+
time.to_s.should == "Wed Feb 20 18:05:00 UTC 2008"
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
describe "sanitizing" do
|
19
|
+
before(:each) do
|
20
|
+
@feed = Feedzirra::Feed.parse(sample_atom_feed)
|
21
|
+
@entry = @feed.entries.first
|
22
|
+
end
|
23
|
+
|
24
|
+
it "should provide a sanitized title" do
|
25
|
+
new_title = "<script>" + @entry.title
|
26
|
+
@entry.title = new_title
|
27
|
+
@entry.title.sanitize.should == Dryopteris.sanitize(new_title)
|
28
|
+
end
|
29
|
+
|
30
|
+
it "should sanitize content in place" do
|
31
|
+
new_content = "<script>" + @entry.content
|
32
|
+
@entry.content = new_content.dup
|
33
|
+
@entry.content.sanitize!.should == Dryopteris.sanitize(new_content)
|
34
|
+
@entry.content.should == Dryopteris.sanitize(new_content)
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should sanitize things in place" do
|
38
|
+
@entry.title += "<script>"
|
39
|
+
@entry.author += "<script>"
|
40
|
+
@entry.content += "<script>"
|
41
|
+
|
42
|
+
cleaned_title = Dryopteris.sanitize(@entry.title)
|
43
|
+
cleaned_author = Dryopteris.sanitize(@entry.author)
|
44
|
+
cleaned_content = Dryopteris.sanitize(@entry.content)
|
45
|
+
|
46
|
+
@entry.sanitize!
|
47
|
+
@entry.title.should == cleaned_title
|
48
|
+
@entry.author.should == cleaned_author
|
49
|
+
@entry.content.should == cleaned_content
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|