feedzirra 0.0.18.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +196 -0
- data/Rakefile +56 -0
- data/lib/core_ext/date.rb +21 -0
- data/lib/core_ext/string.rb +9 -0
- data/lib/feedzirra.rb +34 -0
- data/lib/feedzirra/feed.rb +324 -0
- data/lib/feedzirra/feed_entry_utilities.rb +45 -0
- data/lib/feedzirra/feed_utilities.rb +71 -0
- data/lib/feedzirra/parser/atom.rb +47 -0
- data/lib/feedzirra/parser/atom_entry.rb +51 -0
- data/lib/feedzirra/parser/atom_feed_burner.rb +27 -0
- data/lib/feedzirra/parser/atom_feed_burner_entry.rb +35 -0
- data/lib/feedzirra/parser/itunes_rss.rb +50 -0
- data/lib/feedzirra/parser/itunes_rss_item.rb +31 -0
- data/lib/feedzirra/parser/itunes_rss_owner.rb +12 -0
- data/lib/feedzirra/parser/rss.rb +40 -0
- data/lib/feedzirra/parser/rss_entry.rb +55 -0
- data/spec/feedzirra/feed_entry_utilities_spec.rb +52 -0
- data/spec/feedzirra/feed_spec.rb +546 -0
- data/spec/feedzirra/feed_utilities_spec.rb +149 -0
- data/spec/feedzirra/parser/atom_entry_spec.rb +49 -0
- data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +42 -0
- data/spec/feedzirra/parser/atom_feed_burner_spec.rb +39 -0
- data/spec/feedzirra/parser/atom_spec.rb +63 -0
- data/spec/feedzirra/parser/itunes_rss_item_spec.rb +48 -0
- data/spec/feedzirra/parser/itunes_rss_owner_spec.rb +18 -0
- data/spec/feedzirra/parser/itunes_rss_spec.rb +50 -0
- data/spec/feedzirra/parser/rss_entry_spec.rb +91 -0
- data/spec/feedzirra/parser/rss_spec.rb +88 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +66 -0
- metadata +144 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
module Feedzirra
  # Helpers mixed into the feed *entry* parser classes: date parsing,
  # published/id fallbacks and in-place sanitizing of the text fields.
  #
  # #sanitize! calls the +title+, +author+, +summary+ and +content+ readers,
  # so the including class has to provide them.
  module FeedEntryUtilities
    ##
    # Returns the published time, falling back to the updated time when no
    # published time has been stored.
    def published
      @published || @updated
    end

    ##
    # Parses +string+ with DateTime.parse and converts the result through
    # DateTime#feed_utils_to_gm_time (not defined in this module; presumably
    # provided by the gem's core_ext/date.rb -- confirm).
    #
    # Returns nil when parsing raises; a notice is printed to stdout in that
    # case.
    def parse_datetime(string)
      DateTime.parse(string).feed_utils_to_gm_time
    rescue
      puts "DATE CAN'T BE PARSED: #{string}"
      nil
    end

    ##
    # Returns the id of the entry, or its url if no id is present, as some
    # formats don't support ids.
    def id
      @id || @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time found.
    #
    # A value that cannot be parsed is ignored, so one malformed date does not
    # raise when an earlier, valid date has already been stored.
    def published=(val)
      parsed = parse_datetime(val)
      @published = parsed if parsed && (!@published || parsed < @published)
    end

    ##
    # Writer for updated. By default, we keep the most recent update time found.
    #
    # A value that cannot be parsed is ignored (see #published=).
    def updated=(val)
      parsed = parse_datetime(val)
      @updated = parsed if parsed && (!@updated || parsed > @updated)
    end

    ##
    # Calls +sanitize!+ in place on each of title, author, summary and content
    # that is present. String#sanitize! is not defined in this module.
    def sanitize!
      title.sanitize!   if title
      author.sanitize!  if author
      summary.sanitize! if summary
      content.sanitize! if content
    end

    alias_method :last_modified, :published
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Feedzirra
  # Helpers mixed into the feed parser classes: change tracking, merging of
  # freshly fetched entries and bulk sanitizing.
  #
  # The including class has to provide +entries+ plus a reader and a writer
  # for +title+, +feed_url+ and +url+ (see UPDATABLE_ATTRIBUTES).
  module FeedUtilities
    # Attributes copied over from the fetched feed by #update_from_feed.
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified)

    attr_writer   :new_entries, :updated, :last_modified
    attr_accessor :etag

    ##
    # Returns the explicitly assigned last-modified value or, failing that,
    # the greatest +published+ value among the entries (nil when no entry has
    # one). The computed value is memoized.
    def last_modified
      @last_modified ||= entries.map { |entry| entry.published }.compact.max
    end

    ##
    # Truthy once #update_from_feed has changed at least one attribute.
    def updated?
      @updated
    end

    ##
    # Entries collected by #update_from_feed; an empty array by default.
    def new_entries
      @new_entries ||= []
    end

    def has_new_entries?
      !new_entries.empty?
    end

    ##
    # Merges a freshly fetched +feed+ into this one.
    #
    # Entries not seen before are appended to #new_entries and put at the
    # front of #entries. Only the entries found by *this* call are unshifted,
    # so repeated updates do not duplicate entries collected earlier.
    #
    # Every attribute in UPDATABLE_ATTRIBUTES is then synchronised; +map+ is
    # used on purpose because +any?+ would stop at the first change and leave
    # the remaining attributes stale.
    #
    # Returns true when at least one attribute changed, nil otherwise.
    def update_from_feed(feed)
      fresh_entries = find_new_entries_for(feed)
      self.new_entries += fresh_entries
      entries.unshift(*fresh_entries)

      changes = UPDATABLE_ATTRIBUTES.map { |name| update_attribute(feed, name) }
      updated! if changes.any?
    end

    ##
    # Copies attribute +name+ from +feed+ when it differs from our value.
    # Returns true when the value was changed (even to nil), false otherwise.
    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)
      return false if old_value == new_value

      send("#{name}=", new_value)
      true
    end

    ##
    # Calls +sanitize!+ on every entry.
    def sanitize_entries!
      entries.each { |entry| entry.sanitize! }
    end

    private

    def updated!
      @updated = true
    end

    # This implementation is a hack, which is why it's so ugly.
    # It's to get around the fact that not all feeds have a published date.
    # However, they're always ordered with the newest one first.
    # So we go through the entries just parsed and collect each one as a new
    # entry until we get to one that has the same url as the newest entry we
    # already hold. When we hold no entries at all, every parsed entry is new.
    def find_new_entries_for(feed)
      latest_entry = entries.first
      return feed.entries.dup unless latest_entry

      found_new_entries = []
      feed.entries.each do |entry|
        break if entry.url == latest_entry.url
        found_new_entries << entry
      end
      found_new_entries
    end

    def existing_entry?(test_entry)
      entries.any? { |entry| entry.url == test_entry.url }
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # SAXMachine-based parser for Atom feeds.
    #
    # == Attributes
    # * prev_page
    # * next_page
    # * last_page
    # * title
    # * subtitle
    # * updated
    # * feed_url
    # * url
    # * related
    # * links
    # * entries
    class Atom
      include SAXMachine
      include FeedUtilities

      # Paging links, told apart by their rel attribute.
      element :"atom:link", :as => :prev_page, :value => :href, :with => { :rel => 'prev' }
      element :"atom:link", :as => :next_page, :value => :href, :with => { :rel => 'next' }
      element :"atom:link", :as => :last_page, :value => :href, :with => { :rel => 'last' }

      element :title
      element :subtitle
      element :updated

      # Typed links give the site url and the feed url; every href is also
      # collected into +links+ as a fallback for #url and #feed_url.
      element  :link, :as => :url,      :value => :href, :with => { :type => 'text/html' }
      element  :link, :as => :feed_url, :value => :href, :with => { :type => 'application/atom+xml' }
      elements :link, :as => :related,  :value => :href, :with => { :rel => 'related' }
      elements :link, :as => :links,    :value => :href

      elements :entry, :as => :entries, :class => AtomEntry

      class << self
        # Truthy when +xml+ mentions "Atom" or the purl.org Atom namespace.
        def able_to_parse?(xml) #:nodoc:
          xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})/
        end
      end

      # The text/html link when one was parsed, otherwise the last link seen.
      def url
        return @url if @url
        links.last
      end

      # The application/atom+xml link when one was parsed, otherwise the
      # first link seen.
      def feed_url
        return @feed_url if @feed_url
        links.first
      end
    end
  end

end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # SAXMachine-based parser for the entries of an Atom feed.
    #
    # == Attributes
    # * title
    # * url
    # * related
    # * author
    # * content
    # * summary
    # * published
    # * id
    # * updated
    # * categories
    # * media_content
    # * media_description
    # * media_thumbnail
    # * enclosure
    # * links
    class AtomEntry
      include SAXMachine
      include FeedEntryUtilities

      element  :title
      element  :link, :as => :url,     :value => :href, :with => { :rel => 'alternate' }
      elements :link, :as => :related, :value => :href, :with => { :rel => 'related' }
      element  :name, :as => :author
      element  :content
      element  :summary

      # created and issued are stored under +published+ as well; modified is
      # stored under +updated+.
      element  :published
      element  :id
      element  :created,  :as => :published
      element  :issued,   :as => :published
      element  :updated
      element  :modified, :as => :updated
      elements :category, :as => :categories, :value => :term

      element :"media:content",     :as => :media_content,   :value => :url
      element :"media:description", :as => :media_description
      element :"media:thumbnail",   :as => :media_thumbnail, :value => :url
      element :enclosure, :value => :url

      elements :link, :as => :links, :value => :href

      # The rel="alternate" link when one was parsed, otherwise the first
      # link seen.
      def url
        return @url if @url
        links.first
      end
    end

  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # SAXMachine-based parser for Atom feeds served through Feedburner.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * entries
    class AtomFeedBurner
      include SAXMachine
      include FeedUtilities

      element  :title
      element  :link,  :as => :url,      :value => :href, :with => { :type => 'text/html' }
      element  :link,  :as => :feed_url, :value => :href, :with => { :type => 'application/atom+xml' }
      elements :entry, :as => :entries,  :class => AtomFeedBurnerEntry

      class << self
        # Truthy only when +xml+ mentions both "Atom" and "feedburner";
        # false (never nil) otherwise.
        def able_to_parse?(xml) #:nodoc:
          return false unless xml =~ /Atom/
          (xml =~ /feedburner/) || false
        end
      end
    end

  end

end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # SAXMachine-based parser for the entries of a Feedburner Atom feed.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * id
    # * updated
    # * categories
    class AtomFeedBurnerEntry
      include SAXMachine
      include FeedEntryUtilities

      element :title
      element :name, :as => :author

      # Both the rel="alternate" link and feedburner:origLink are stored
      # under +url+.
      element :link, :as => :url, :value => :href, :with => { :rel => 'alternate' }
      element :"feedburner:origLink", :as => :url

      element :summary
      element :content

      # issued and created are stored under +published+ as well; modified is
      # stored under +updated+.
      element  :published
      element  :id
      element  :issued,   :as => :published
      element  :created,  :as => :published
      element  :updated
      element  :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
    end

  end

end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # iTunes is RSS 2.0 + some apple extensions
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    class ITunesRSS
      include SAXMachine
      include FeedUtilities

      attr_accessor :feed_url

      # RSS 2.0 elements that need including
      element :copyright
      element :description
      element :language
      element :managingEditor
      element :title
      element :link, :as => :url

      # If author is not present use managingEditor on the channel
      element :"itunes:author", :as => :itunes_author
      element :"itunes:block", :as => :itunes_block
      element :"itunes:image", :value => :href, :as => :itunes_image
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      # New URL for the podcast feed
      element :"itunes:new-feed-url", :as => :itunes_new_feed_url
      element :"itunes:subtitle", :as => :itunes_subtitle
      # If summary is not present, use the description tag
      element :"itunes:summary", :as => :itunes_summary

      # iTunes RSS feeds can have multiple main categories...
      # ...and multiple sub-categories per category
      # TODO subcategories not supported correctly - they are at the same level
      #   as the main categories
      elements :"itunes:category", :as => :itunes_categories, :value => :text

      elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner

      elements :item, :as => :entries, :class => ITunesRSSItem

      # Truthy (the match offset) when +xml+ declares the iTunes podcast
      # namespace, nil otherwise.
      #
      # The dots of the namespace URI are escaped so they only match a literal
      # dot, and the attribute value may be quoted with either " or ', both of
      # which are legal in XML.
      def self.able_to_parse?(xml)
        xml =~ /xmlns:itunes=["']http:\/\/www\.itunes\.com\/dtds\/podcast-1\.0\.dtd["']/
      end

    end

  end

end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # Item-level parser for iTunes podcast feeds: the standard RSS 2.0 item
    # fields plus the itunes: extensions.
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    #
    # NOTE(review): this class mixes in FeedUtilities (the feed-level
    # helpers) although it represents a single entry; FeedEntryUtilities
    # looks like the intended mixin. Left unchanged here because swapping it
    # would change what +published+ returns -- confirm with callers.
    class ITunesRSSItem
      include SAXMachine
      include FeedUtilities

      # Standard RSS 2.0 item fields
      element :author
      element :guid
      element :title
      element :link,        :as => :url
      element :description, :as => :summary
      element :pubDate,     :as => :published

      # iTunes extensions.
      # If itunes:author is not present use the author tag on the item.
      element :"itunes:author",   :as => :itunes_author
      element :"itunes:block",    :as => :itunes_block
      element :"itunes:duration", :as => :itunes_duration
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      element :"itunes:subtitle", :as => :itunes_subtitle
      # If itunes:summary is not present, use the description tag.
      element :"itunes:summary",  :as => :itunes_summary

      # The three attributes of the enclosure tag, one accessor each.
      element :enclosure, :value => :length, :as => :enclosure_length
      element :enclosure, :value => :type,   :as => :enclosure_type
      element :enclosure, :value => :url,    :as => :enclosure_url
    end
  end

end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # SAXMachine-based parser for RSS feeds.
    #
    # == Attributes
    # * prev_page
    # * next_page
    # * last_page
    # * title
    # * feed_url
    # * url
    # * related
    # * description
    # * language
    # * entries
    class RSS
      include SAXMachine
      include FeedUtilities

      # Paging links, told apart by their rel attribute.
      element :"atom:link", :as => :prev_page, :value => :href, :with => { :rel => 'prev' }
      element :"atom:link", :as => :next_page, :value => :href, :with => { :rel => 'next' }
      element :"atom:link", :as => :last_page, :value => :href, :with => { :rel => 'last' }

      element  :title
      element  :link, :as => :url
      elements :link, :as => :related, :value => :href, :with => { :rel => 'related' }
      element  :description
      element  :language
      elements :item, :as => :entries, :class => RSSEntry

      # Not parsed from the document; a plain accessor.
      attr_accessor :feed_url

      class << self
        # Truthy when +xml+ contains an opening <rss or <rdf tag.
        def able_to_parse?(xml) #:nodoc:
          xml =~ /\<rss|\<rdf/
        end
      end
    end

  end

end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # SAXMachine-based parser for the items of a feed handled by the RSS
    # parser.
    #
    # == Attributes
    # * title
    # * url
    # * related
    # * author
    # * content
    # * summary
    # * published
    # * updated
    # * categories
    # * media_content
    # * media_description
    # * media_thumbnail
    # * enclosure
    # * id
    class RSSEntry
      include SAXMachine
      include FeedEntryUtilities

      element  :title
      element  :link, :as => :url
      elements :link, :as => :related, :value => :href, :with => { :rel => 'related' }

      # NOTE(review): author is declared twice (plain and with an explicit
      # :as); the second declaration looks redundant. Both are kept so the
      # behaviour stays exactly as before -- confirm before removing.
      element :author
      element :"dc:creator", :as => :author
      element :author, :as => :author
      element :"content:encoded", :as => :content
      element :description, :as => :summary

      # Every known spelling of the publication date is stored under
      # +published+.
      element :pubDate,           :as => :published
      element :pubdate,           :as => :published
      element :"dc:date",         :as => :published
      element :"dc:Date",         :as => :published
      element :"dcterms:created", :as => :published


      element  :"dcterms:modified", :as => :updated
      element  :issued,   :as => :published
      elements :category, :as => :categories

      element :"media:content",     :as => :media_content,   :value => :url
      element :"media:description", :as => :media_description
      element :"media:thumbnail",   :as => :media_thumbnail, :value => :url
      element :enclosure, :value => :url

      element :guid, :as => :id
    end

  end

end
|