jashmenn-feedzirra 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +1 -0
- data/README.rdoc +177 -0
- data/Rakefile +51 -0
- data/lib/feedzirra.rb +20 -0
- data/lib/feedzirra/core_ext.rb +3 -0
- data/lib/feedzirra/core_ext/date.rb +19 -0
- data/lib/feedzirra/core_ext/string.rb +9 -0
- data/lib/feedzirra/feed.rb +384 -0
- data/lib/feedzirra/feed_entry_utilities.rb +65 -0
- data/lib/feedzirra/feed_utilities.rb +61 -0
- data/lib/feedzirra/parser.rb +20 -0
- data/lib/feedzirra/parser/atom.rb +29 -0
- data/lib/feedzirra/parser/atom_entry.rb +30 -0
- data/lib/feedzirra/parser/atom_feed_burner.rb +21 -0
- data/lib/feedzirra/parser/atom_feed_burner_entry.rb +31 -0
- data/lib/feedzirra/parser/google_docs_atom.rb +28 -0
- data/lib/feedzirra/parser/google_docs_atom_entry.rb +29 -0
- data/lib/feedzirra/parser/itunes_rss.rb +50 -0
- data/lib/feedzirra/parser/itunes_rss_item.rb +32 -0
- data/lib/feedzirra/parser/itunes_rss_owner.rb +12 -0
- data/lib/feedzirra/parser/rss.rb +22 -0
- data/lib/feedzirra/parser/rss_entry.rb +34 -0
- data/lib/feedzirra/parser/rss_feed_burner.rb +22 -0
- data/lib/feedzirra/parser/rss_feed_burner_entry.rb +40 -0
- data/lib/feedzirra/version.rb +3 -0
- data/spec/benchmarks/feed_benchmarks.rb +98 -0
- data/spec/benchmarks/feedzirra_benchmarks.rb +40 -0
- data/spec/benchmarks/fetching_benchmarks.rb +28 -0
- data/spec/benchmarks/parsing_benchmark.rb +30 -0
- data/spec/benchmarks/updating_benchmarks.rb +33 -0
- data/spec/feedzirra/feed_entry_utilities_spec.rb +52 -0
- data/spec/feedzirra/feed_spec.rb +597 -0
- data/spec/feedzirra/feed_utilities_spec.rb +152 -0
- data/spec/feedzirra/parser/atom_entry_spec.rb +86 -0
- data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +47 -0
- data/spec/feedzirra/parser/atom_feed_burner_spec.rb +47 -0
- data/spec/feedzirra/parser/atom_spec.rb +51 -0
- data/spec/feedzirra/parser/google_docs_atom_entry_spec.rb +22 -0
- data/spec/feedzirra/parser/google_docs_atom_spec.rb +31 -0
- data/spec/feedzirra/parser/itunes_rss_item_spec.rb +48 -0
- data/spec/feedzirra/parser/itunes_rss_owner_spec.rb +18 -0
- data/spec/feedzirra/parser/itunes_rss_spec.rb +54 -0
- data/spec/feedzirra/parser/rss_entry_spec.rb +85 -0
- data/spec/feedzirra/parser/rss_feed_burner_entry_spec.rb +85 -0
- data/spec/feedzirra/parser/rss_feed_burner_spec.rb +52 -0
- data/spec/feedzirra/parser/rss_spec.rb +49 -0
- data/spec/sample_feeds/run_against_sample.rb +20 -0
- data/spec/spec_helper.rb +78 -0
- metadata +228 -0
@@ -0,0 +1,65 @@
|
|
1
|
+
module Feedzirra
  ##
  # Helpers mixed into parsed feed entries: date parsing, an id fallback,
  # in-place sanitizing, and Enumerable / hash-style access to the entry's
  # instance variables.
  module FeedEntryUtilities

    include Enumerable

    ##
    # Returns the published time. When none has been set, the updated time is
    # used instead and cached as the published time.
    def published
      @published ||= @updated
    end

    ##
    # Parses +string+ with DateTime.parse and converts the result with
    # feed_utils_to_gm_time. When parsing or conversion raises a
    # StandardError, a warning is emitted and nil is returned.
    def parse_datetime(string)
      DateTime.parse(string).feed_utils_to_gm_time
    rescue
      warn "Failed to parse date #{string.inspect}"
      nil
    end

    ##
    # Returns the id of the entry or its url if no id is present, as some formats don't support it
    def id
      @entry_id ||= @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time found.
    # Values that cannot be parsed are ignored, so they can neither raise on
    # the comparison nor replace a time that was already stored.
    def published=(val)
      parsed = parse_datetime(val)
      return unless parsed

      @published = parsed if !@published || parsed < @published
    end

    ##
    # Writer for updated. By default, we keep the most recent update time found.
    # Values that cannot be parsed are ignored, so they can neither raise on
    # the comparison nor replace a time that was already stored.
    def updated=(val)
      parsed = parse_datetime(val)
      return unless parsed

      @updated = parsed if !@updated || parsed > @updated
    end

    ##
    # Calls sanitize! on title, author, summary and content, skipping any of
    # them that is nil or false.
    def sanitize!
      self.title.sanitize! if self.title
      self.author.sanitize! if self.author
      self.summary.sanitize! if self.summary
      self.content.sanitize! if self.content
    end

    alias_method :last_modified, :published

    ##
    # Yields the name (without the leading "@") and value of each instance
    # variable. The list of variables is captured on the first call with a
    # block and reused by later calls. Returns an enumerator when no block is
    # given.
    def each
      return enum_for(:each) unless block_given?

      @rss_fields ||= self.instance_variables

      @rss_fields.each do |field|
        yield(field.to_s.sub('@', ''), self.instance_variable_get(field))
      end
    end

    ##
    # Reads the instance variable named after +field+ (String or Symbol).
    def [](field)
      self.instance_variable_get("@#{field.to_s}")
    end

    ##
    # Sets the instance variable named after +field+ (String or Symbol).
    def []=(field, value)
      self.instance_variable_set("@#{field.to_s}", value)
    end

  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module Feedzirra
  ##
  # Helpers mixed into parsed feeds for merging a freshly fetched copy of a
  # feed into an existing one. The including class is expected to provide
  # +entries+ and accessors for the names in UPDATABLE_ATTRIBUTES.
  module FeedUtilities
    # Attributes copied from the fetched feed by update_from_feed.
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified etag)

    attr_writer :new_entries, :updated, :last_modified
    attr_accessor :etag

    ##
    # Returns the explicitly assigned last-modified value or, when there is
    # none, the most recent +published+ value among the entries (nil when no
    # entry has one). A non-nil result is cached.
    def last_modified
      @last_modified ||= entries.map { |entry| entry.published }.compact.max
    end

    ##
    # Returns the flag recorded by the last update_from_feed call (or set
    # through the +updated=+ writer).
    def updated?
      @updated
    end

    ##
    # Entries collected by update_from_feed; an empty array until then.
    def new_entries
      @new_entries ||= []
    end

    ##
    # True when at least one new entry has been collected.
    def has_new_entries?
      !new_entries.empty?
    end

    ##
    # Merges +feed+ (a freshly parsed copy) into this feed: entries whose url
    # is not yet known are appended to new_entries and put at the front of
    # entries, then every attribute in UPDATABLE_ATTRIBUTES is synchronized.
    # updated? reflects whether any attribute changed.
    def update_from_feed(feed)
      fresh = find_new_entries_for(feed)
      self.new_entries += fresh
      # Only prepend what this call discovered; prepending the accumulated
      # new_entries list would duplicate entries added by earlier updates.
      self.entries.unshift(*fresh)

      @updated = false
      UPDATABLE_ATTRIBUTES.each do |name|
        updated = update_attribute(feed, name)
        @updated ||= updated
      end
    end

    ##
    # Copies attribute +name+ from +feed+ when it differs from the current
    # value. Returns the result of the writer call when a change was made and
    # nil otherwise.
    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)

      if old_value != new_value
        send("#{name}=", new_value)
      end
    end

    ##
    # Calls sanitize! on every entry.
    def sanitize_entries!
      entries.each {|entry| entry.sanitize!}
    end

    private

    # Entries of +feed+ whose url does not match any entry already present.
    def find_new_entries_for(feed)
      # this algorithm does not optimize based on publication date, but always finds new entries
      feed.entries.reject { |entry| existing_entry?(entry) }
    end

    # True when an entry with the same url as +test_entry+ is already present.
    def existing_entry?(test_entry)
      entries.any? { |entry| entry.url == test_entry.url }
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Feedzirra
  # Namespace for the feed parsers. Each parser constant is registered for
  # autoloading, so its file is only required on first reference.
  module Parser
    [
      [:RSS,                 'feedzirra/parser/rss'],
      [:RSSEntry,            'feedzirra/parser/rss_entry'],
      [:RSSFeedBurner,       'feedzirra/parser/rss_feed_burner'],
      [:RSSFeedBurnerEntry,  'feedzirra/parser/rss_feed_burner_entry'],

      [:ITunesRSS,           'feedzirra/parser/itunes_rss'],
      [:ITunesRSSItem,       'feedzirra/parser/itunes_rss_item'],
      [:ITunesRSSOwner,      'feedzirra/parser/itunes_rss_owner'],

      [:GoogleDocsAtom,      'feedzirra/parser/google_docs_atom'],
      [:GoogleDocsAtomEntry, 'feedzirra/parser/google_docs_atom_entry'],

      [:Atom,                'feedzirra/parser/atom'],
      [:AtomEntry,           'feedzirra/parser/atom_entry'],
      [:AtomFeedBurner,      'feedzirra/parser/atom_feed_burner'],
      [:AtomFeedBurnerEntry, 'feedzirra/parser/atom_feed_burner_entry']
    ].each do |const_name, path|
      autoload const_name, path
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # Parser for dealing with Atom feeds.
    class Atom
      include SAXMachine
      include FeedUtilities
      element :title
      element :subtitle, :as => :description
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :link, :as => :links, :value => :href
      elements :entry, :as => :entries, :class => AtomEntry

      # Matches a <feed> start tag whose xmlns attribute (single- or
      # double-quoted) is one of the two Atom namespace URIs. Returns the
      # match offset, or nil when +xml+ does not match.
      def self.able_to_parse?(xml) #:nodoc:
        # The quote classes are ["'] on purpose: a "|" inside a character
        # class is a literal pipe, not an alternation.
        %r{<feed[^>]+xmlns=["'](http://www\.w3\.org/2005/Atom|http://purl\.org/atom/ns\#)["'][^>]*>} =~ xml
      end

      # The text/html link when one was parsed, otherwise the last link href.
      def url
        @url || links.last
      end

      # The application/atom+xml link when one was parsed, otherwise the
      # first link href (which is then cached).
      def feed_url
        @feed_url ||= links.first
      end
    end
  end

end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # Entry-level parser for Atom feeds.
    class AtomEntry
      include SAXMachine
      include FeedEntryUtilities

      element :title
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element :name, :as => :author
      element :content
      element :summary
      element :published
      element :id, :as => :entry_id
      element :created, :as => :published
      element :issued, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
      elements :link, :as => :links, :value => :href

      # The parsed alternate text/html link if there is one; otherwise the
      # first link href, which is remembered for later calls.
      def url
        return @url if @url

        @url = links.first
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # Feed-level parser for Atom feeds served through Feedburner.
    class AtomFeedBurner
      include SAXMachine
      include FeedUtilities

      element :title
      element :subtitle, :as => :description
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :entry, :as => :entries, :class => AtomFeedBurnerEntry

      # True only when +xml+ mentions both "Atom" and "feedburner" and
      # contains no "<rss" or "<rdf"; false in every other case.
      def self.able_to_parse?(xml) #:nodoc:
        return false unless /Atom/ =~ xml
        return false unless /feedburner/ =~ xml

        (/\<rss|\<rdf/ =~ xml).nil?
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # Entry-level parser for Atom feeds served through Feedburner.
    class AtomFeedBurnerEntry
      include SAXMachine
      include FeedEntryUtilities

      element :title
      element :name, :as => :author
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element :"feedburner:origLink", :as => :url
      element :summary
      element :content
      element :published
      element :id, :as => :entry_id
      element :issued, :as => :published
      element :created, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
      elements :link, :as => :links, :value => :href

      # The url captured while parsing if there is one; otherwise the first
      # link href, which is remembered for later calls.
      def url
        return @url if @url

        @url = links.first
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require File.expand_path('./atom', File.dirname(__FILE__))
|
2
|
+
|
3
|
+
module Feedzirra
  module Parser
    # Parser for dealing with Google Docs Atom feeds.
    class GoogleDocsAtom
      include SAXMachine
      include FeedUtilities
      element :title
      element :subtitle, :as => :description
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :link, :as => :links, :value => :href
      elements :entry, :as => :entries, :class => GoogleDocsAtomEntry

      # The text/html link when one was parsed, otherwise the first link
      # href (which is then cached).
      def url
        @url ||= links.first
      end

      # Matches an <id> element, on a single line, whose content starts with
      # http(s)://docs.google.com/. Returns the match offset, or nil when
      # +xml+ does not match.
      def self.able_to_parse?(xml) #:nodoc:
        # Dots in the host name are escaped so that only the literal host
        # "docs.google.com" is accepted.
        %r{<id>https?://docs\.google\.com/.*</id>} =~ xml
      end

      # The application/atom+xml link when one was parsed, otherwise the
      # first link href (which is then cached).
      def feed_url
        @feed_url ||= links.first
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # Entry-level parser for Google Docs Atom feeds; adds the docs:* file
    # metadata elements to the usual Atom entry fields.
    class GoogleDocsAtomEntry
      include SAXMachine
      include FeedEntryUtilities

      element :title
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element :name, :as => :author
      element :content
      element :summary
      element :published
      element :id, :as => :entry_id
      element :created, :as => :published
      element :issued, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
      elements :link, :as => :links, :value => :href
      element :"docs:md5Checksum", :as => :checksum
      element :"docs:filename", :as => :original_filename
      element :"docs:suggestedFilename", :as => :suggested_filename

      # The parsed alternate text/html link if there is one; otherwise the
      # first link href, which is remembered for later calls.
      def url
        return @url if @url

        @url = links.first
      end
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # iTunes is RSS 2.0 + some apple extensions
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    class ITunesRSS
      include SAXMachine
      include FeedUtilities

      attr_accessor :feed_url

      # RSS 2.0 elements that need including
      element :copyright
      element :description
      element :language
      element :managingEditor
      element :title
      element :link, :as => :url

      # If author is not present use managingEditor on the channel
      element :"itunes:author", :as => :itunes_author
      element :"itunes:block", :as => :itunes_block
      element :"itunes:image", :value => :href, :as => :itunes_image
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      # New URL for the podcast feed
      element :"itunes:new-feed-url", :as => :itunes_new_feed_url
      element :"itunes:subtitle", :as => :itunes_subtitle
      # If summary is not present, use the description tag
      element :"itunes:summary", :as => :itunes_summary

      # iTunes RSS feeds can have multiple main categories...
      # ...and multiple sub-categories per category
      # TODO subcategories not supported correctly - they are at the same level
      #   as the main categories
      elements :"itunes:category", :as => :itunes_categories, :value => :text

      elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner

      elements :item, :as => :entries, :class => ITunesRSSItem

      # Matches (case-insensitively) a declaration of the iTunes podcast
      # namespace. Returns the match offset, or nil when +xml+ does not match.
      def self.able_to_parse?(xml)
        # Dots are escaped so they only match literal dots, and the attribute
        # value may be single- or double-quoted, as XML permits.
        %r{xmlns:itunes=["']http://www\.itunes\.com/dtds/podcast-1\.0\.dtd["']}i =~ xml
      end

    end

  end

end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # An RSS 2.0 item carrying Apple's iTunes podcast extensions.
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    class ITunesRSSItem
      include SAXMachine
      include FeedEntryUtilities

      # Plain RSS 2.0 item fields.
      element :author
      element :guid
      element :title
      element :link, :as => :url
      element :description, :as => :summary
      element :pubDate, :as => :published

      # iTunes extension fields.
      # When itunes:author is absent, fall back to the item's author tag.
      element :"itunes:author", :as => :itunes_author
      element :"itunes:block", :as => :itunes_block
      element :"itunes:duration", :as => :itunes_duration
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      element :"itunes:subtitle", :as => :itunes_subtitle
      # When itunes:summary is absent, fall back to the description tag.
      element :"itunes:summary", :as => :itunes_summary

      # Attributes of the enclosure element.
      element :enclosure, :value => :length, :as => :enclosure_length
      element :enclosure, :value => :type, :as => :enclosure_type
      element :enclosure, :value => :url, :as => :enclosure_url
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # Feed-level parser for plain RSS feeds.
    class RSS
      include SAXMachine
      include FeedUtilities

      element :title
      element :description
      element :link, :as => :url
      elements :item, :as => :entries, :class => RSSEntry

      attr_accessor :feed_url

      # nil when +xml+ contains neither "<rss" nor "<rdf"; otherwise true
      # unless the document mentions "feedburner", in which case false.
      def self.able_to_parse?(xml) #:nodoc:
        return nil unless /\<rss|\<rdf/ =~ xml

        (/feedburner/ =~ xml).nil?
      end
    end
  end
end
|