codders-feedzirra 0.2.0.rc2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. data/.gitignore +12 -0
  2. data/.rspec +1 -0
  3. data/.travis.yml +9 -0
  4. data/Gemfile +10 -0
  5. data/Guardfile +6 -0
  6. data/HISTORY.md +25 -0
  7. data/README.md +179 -0
  8. data/Rakefile +6 -0
  9. data/feedzirra.gemspec +28 -0
  10. data/lib/feedzirra.rb +17 -0
  11. data/lib/feedzirra/core_ext.rb +3 -0
  12. data/lib/feedzirra/core_ext/date.rb +19 -0
  13. data/lib/feedzirra/core_ext/string.rb +9 -0
  14. data/lib/feedzirra/core_ext/time.rb +29 -0
  15. data/lib/feedzirra/feed.rb +382 -0
  16. data/lib/feedzirra/feed_entry_utilities.rb +65 -0
  17. data/lib/feedzirra/feed_utilities.rb +72 -0
  18. data/lib/feedzirra/parser.rb +20 -0
  19. data/lib/feedzirra/parser/atom.rb +29 -0
  20. data/lib/feedzirra/parser/atom_entry.rb +30 -0
  21. data/lib/feedzirra/parser/atom_feed_burner.rb +21 -0
  22. data/lib/feedzirra/parser/atom_feed_burner_entry.rb +31 -0
  23. data/lib/feedzirra/parser/google_docs_atom.rb +28 -0
  24. data/lib/feedzirra/parser/google_docs_atom_entry.rb +29 -0
  25. data/lib/feedzirra/parser/itunes_rss.rb +50 -0
  26. data/lib/feedzirra/parser/itunes_rss_item.rb +32 -0
  27. data/lib/feedzirra/parser/itunes_rss_owner.rb +12 -0
  28. data/lib/feedzirra/parser/rss.rb +22 -0
  29. data/lib/feedzirra/parser/rss_entry.rb +34 -0
  30. data/lib/feedzirra/parser/rss_feed_burner.rb +22 -0
  31. data/lib/feedzirra/parser/rss_feed_burner_entry.rb +40 -0
  32. data/lib/feedzirra/version.rb +3 -0
  33. data/spec/benchmarks/feed_benchmarks.rb +98 -0
  34. data/spec/benchmarks/feedzirra_benchmarks.rb +40 -0
  35. data/spec/benchmarks/fetching_benchmarks.rb +28 -0
  36. data/spec/benchmarks/parsing_benchmark.rb +30 -0
  37. data/spec/benchmarks/updating_benchmarks.rb +33 -0
  38. data/spec/feedzirra/feed_entry_utilities_spec.rb +52 -0
  39. data/spec/feedzirra/feed_spec.rb +599 -0
  40. data/spec/feedzirra/feed_utilities_spec.rb +150 -0
  41. data/spec/feedzirra/parser/atom_entry_spec.rb +86 -0
  42. data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +47 -0
  43. data/spec/feedzirra/parser/atom_feed_burner_spec.rb +47 -0
  44. data/spec/feedzirra/parser/atom_spec.rb +51 -0
  45. data/spec/feedzirra/parser/google_docs_atom_entry_spec.rb +22 -0
  46. data/spec/feedzirra/parser/google_docs_atom_spec.rb +31 -0
  47. data/spec/feedzirra/parser/itunes_rss_item_spec.rb +48 -0
  48. data/spec/feedzirra/parser/itunes_rss_owner_spec.rb +18 -0
  49. data/spec/feedzirra/parser/itunes_rss_spec.rb +54 -0
  50. data/spec/feedzirra/parser/rss_entry_spec.rb +85 -0
  51. data/spec/feedzirra/parser/rss_feed_burner_entry_spec.rb +85 -0
  52. data/spec/feedzirra/parser/rss_feed_burner_spec.rb +52 -0
  53. data/spec/feedzirra/parser/rss_spec.rb +49 -0
  54. data/spec/sample_feeds/AmazonWebServicesBlog.xml +796 -0
  55. data/spec/sample_feeds/AmazonWebServicesBlogFirstEntryContent.xml +63 -0
  56. data/spec/sample_feeds/FeedBurnerUrlNoAlternate.xml +27 -0
  57. data/spec/sample_feeds/GoogleDocsList.xml +187 -0
  58. data/spec/sample_feeds/HREFConsideredHarmful.xml +313 -0
  59. data/spec/sample_feeds/HREFConsideredHarmfulFirstEntry.xml +22 -0
  60. data/spec/sample_feeds/PaulDixExplainsNothing.xml +174 -0
  61. data/spec/sample_feeds/PaulDixExplainsNothingAlternate.xml +174 -0
  62. data/spec/sample_feeds/PaulDixExplainsNothingFirstEntryContent.xml +19 -0
  63. data/spec/sample_feeds/PaulDixExplainsNothingWFW.xml +174 -0
  64. data/spec/sample_feeds/TechCrunch.xml +1514 -0
  65. data/spec/sample_feeds/TechCrunchFirstEntry.xml +9 -0
  66. data/spec/sample_feeds/TechCrunchFirstEntryDescription.xml +3 -0
  67. data/spec/sample_feeds/TenderLovemaking.xml +515 -0
  68. data/spec/sample_feeds/TenderLovemakingFirstEntry.xml +66 -0
  69. data/spec/sample_feeds/TrotterCashionHome.xml +610 -0
  70. data/spec/sample_feeds/atom_with_link_tag_for_url_unmarked.xml +30 -0
  71. data/spec/sample_feeds/itunes.xml +60 -0
  72. data/spec/sample_feeds/run_against_sample.rb +20 -0
  73. data/spec/sample_feeds/top5kfeeds.dat +2170 -0
  74. data/spec/sample_feeds/trouble_feeds.txt +16 -0
  75. data/spec/spec_helper.rb +75 -0
  76. metadata +203 -0
@@ -0,0 +1,65 @@
1
module Feedzirra
  # Helpers mixed into every parsed feed entry: date parsing and bookkeeping,
  # an id fallback, sanitizing, and hash-style access to the entry's fields.
  module FeedEntryUtilities

    include Enumerable

    # Publication time of the entry. Falls back to (and memoizes) the update
    # time when no publication time has been recorded.
    def published
      @published ||= @updated
    end

    # Parses +string+ into a time value.
    #
    # Relies on DateTime#feed_utils_to_gm_time, which is not defined in this
    # file (presumably supplied by feedzirra's core extensions -- confirm).
    #
    # Returns nil, after emitting a warning, when parsing raises.
    def parse_datetime(string)
      DateTime.parse(string).feed_utils_to_gm_time
    rescue
      warn "Failed to parse date #{string.inspect}"
      nil
    end

    ##
    # Returns the id of the entry or its url if no id is present, as some formats don't support it
    def id
      @entry_id ||= @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time found.
    # Values that cannot be parsed are ignored, so a malformed date never
    # discards (or blows up against) a publish time that was already recorded.
    def published=(val)
      parsed = parse_datetime(val)
      return unless parsed

      @published = parsed if !@published || parsed < @published
    end

    ##
    # Writer for updated. By default, we keep the most recent update time found.
    # Values that cannot be parsed are ignored, so a malformed date never
    # discards (or blows up against) an update time that was already recorded.
    def updated=(val)
      parsed = parse_datetime(val)
      return unless parsed

      @updated = parsed if !@updated || parsed > @updated
    end

    # Calls sanitize! on the title, author, summary and content values.
    #
    # Fields the including class does not define are skipped (not every entry
    # class declares all four), as are fields whose value is nil or false.
    # The values are expected to respond to sanitize! (not defined in this
    # file; presumably a String core extension -- confirm).
    #
    # Returns self.
    def sanitize!
      %w(title author summary content).each do |name|
        next unless respond_to?(name)

        value = send(name)
        value.sanitize! if value
      end
      self
    end

    alias_method :last_modified, :published

    # Yields (name, value) for each instance variable of the entry, with the
    # leading "@" stripped from the name. The list of variables is captured on
    # the first call and reused afterwards.
    #
    # Returns an Enumerator when called without a block.
    def each
      return enum_for(:each) unless block_given?

      @rss_fields ||= instance_variables

      @rss_fields.each do |field|
        yield(field.to_s.sub('@', ''), instance_variable_get(field))
      end
    end

    # Reads the instance variable named after +field+ (String or Symbol).
    def [](field)
      instance_variable_get("@#{field}")
    end

    # Writes +value+ to the instance variable named after +field+.
    def []=(field, value)
      instance_variable_set("@#{field}", value)
    end

  end
end
@@ -0,0 +1,72 @@
1
module Feedzirra
  # Helpers mixed into parsed feed objects: last-modified tracking, update
  # detection and merging of newly fetched entries.
  #
  # The including class must provide +entries+ plus readers and writers for
  # the attributes named in UPDATABLE_ATTRIBUTES that are not defined here.
  module FeedUtilities
    # Attributes copied over from a freshly fetched feed by #update_from_feed.
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified etag)

    attr_writer   :new_entries, :updated, :last_modified
    attr_accessor :etag

    # Most recent +published+ value among the entries (entries without one are
    # ignored), or nil when there is none. A non-nil result is memoized.
    def last_modified
      @last_modified ||= entries.map { |entry| entry.published }.compact.max
    end

    # Whether the last #update_from_feed changed any updatable attribute.
    def updated?
      @updated
    end

    # Entries collected by #update_from_feed so far (empty array by default).
    def new_entries
      @new_entries ||= []
    end

    def has_new_entries?
      !new_entries.empty?
    end

    # Merges a freshly fetched +feed+ into this one.
    #
    # Entries of +feed+ that are newer than our latest entry are appended to
    # #new_entries and prepended to #entries. Only the entries found by this
    # call are prepended, so repeated updates do not insert earlier finds
    # again. Every attribute in UPDATABLE_ATTRIBUTES is then copied over, and
    # #updated? reports whether any of them changed.
    def update_from_feed(feed)
      found = find_new_entries_for(feed)
      self.new_entries += found
      entries.unshift(*found)

      @updated = false
      UPDATABLE_ATTRIBUTES.each do |name|
        @updated = true if update_attribute(feed, name)
      end
    end

    # Copies attribute +name+ from +feed+ when it differs from our value.
    #
    # Returns true when the value changed (including a change to nil or
    # false), false otherwise.
    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)
      return false if old_value == new_value

      send("#{name}=", new_value)
      true
    end

    # Calls sanitize! on every entry.
    def sanitize_entries!
      entries.each { |entry| entry.sanitize! }
    end

    private

    # this implementation is a hack, which is why it's so ugly.
    # it's to get around the fact that not all feeds have a published date.
    # however, they're always ordered with the newest one first.
    # So we go through the entries just parsed and collect each one as a new entry
    # until we get to one that has the same url as the newest for the feed
    def find_new_entries_for(feed)
      return feed.entries if entries.empty?

      latest_url = entries.first.url
      feed.entries.take_while { |entry| entry.url != latest_url }
    end

    # True when an entry with the same url as +test_entry+ is already present.
    def existing_entry?(test_entry)
      entries.any? { |entry| entry.url == test_entry.url }
    end
  end
end
@@ -0,0 +1,20 @@
1
module Feedzirra
  # Namespace for the format-specific parsers. Each parser constant is
  # registered for autoloading from the listed path.
  module Parser
    [
      # Plain and FeedBurner-flavoured RSS
      [:RSS,                 'feedzirra/parser/rss'],
      [:RSSEntry,            'feedzirra/parser/rss_entry'],
      [:RSSFeedBurner,       'feedzirra/parser/rss_feed_burner'],
      [:RSSFeedBurnerEntry,  'feedzirra/parser/rss_feed_burner_entry'],

      # RSS with iTunes extensions
      [:ITunesRSS,           'feedzirra/parser/itunes_rss'],
      [:ITunesRSSItem,       'feedzirra/parser/itunes_rss_item'],
      [:ITunesRSSOwner,      'feedzirra/parser/itunes_rss_owner'],

      # Google Docs list feeds
      [:GoogleDocsAtom,      'feedzirra/parser/google_docs_atom'],
      [:GoogleDocsAtomEntry, 'feedzirra/parser/google_docs_atom_entry'],

      # Plain and FeedBurner-flavoured Atom
      [:Atom,                'feedzirra/parser/atom'],
      [:AtomEntry,           'feedzirra/parser/atom_entry'],
      [:AtomFeedBurner,      'feedzirra/parser/atom_feed_burner'],
      [:AtomFeedBurnerEntry, 'feedzirra/parser/atom_feed_burner_entry']
    ].each do |const_name, path|
      autoload const_name, path
    end
  end
end
@@ -0,0 +1,29 @@
1
module Feedzirra

  module Parser
    # Parser for dealing with Atom feeds.
    #
    # Feed-level fields are declared with the SAXMachine DSL; the order of the
    # declarations is kept as-is.
    class Atom
      include SAXMachine
      include FeedUtilities
      element :title
      element :subtitle, :as => :description
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :link, :as => :links, :value => :href
      elements :entry, :as => :entries, :class => AtomEntry

      # Truthy when +xml+ contains a <feed> tag whose xmlns attribute is one of
      # the two Atom namespaces, quoted with either ' or ". Whitespace around
      # the "=" is tolerated. Returns the match offset, or nil when there is
      # no match.
      def self.able_to_parse?(xml) #:nodoc:
        %r{<feed[^>]+xmlns\s*=\s*["'](http://www\.w3\.org/2005/Atom|http://purl\.org/atom/ns\#)["'][^>]*>} =~ xml
      end

      # The text/html link if one was parsed, otherwise the last link seen.
      def url
        @url || links.last
      end

      # The application/atom+xml link if one was parsed, otherwise the first
      # link seen (memoized).
      def feed_url
        @feed_url ||= links.first
      end
    end
  end

end
@@ -0,0 +1,30 @@
1
module Feedzirra

  module Parser
    # Parser for dealing with Atom feed entries.
    class AtomEntry
      include SAXMachine
      include FeedEntryUtilities

      # Declaration order is preserved on purpose: several tags feed the same
      # attribute (published, updated).
      element :title
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element :name, :as => :author
      element :content
      element :summary
      element :published
      element :id, :as => :entry_id
      element :created, :as => :published
      element :issued, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
      elements :link, :as => :links, :value => :href

      # The alternate text/html link when one was parsed; otherwise the first
      # link of the entry, which is then remembered.
      def url
        @url = links.first unless @url
        @url
      end
    end

  end

end
@@ -0,0 +1,21 @@
1
module Feedzirra

  module Parser
    # Parser for dealing with Feedburner Atom feeds.
    class AtomFeedBurner
      include SAXMachine
      include FeedUtilities
      element :title
      element :subtitle, :as => :description
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :entry, :as => :entries, :class => AtomFeedBurnerEntry

      # true when +xml+ mentions both "Atom" and "feedburner" and contains no
      # <rss or <rdf tag opener; false otherwise.
      def self.able_to_parse?(xml) #:nodoc:
        return false unless /Atom/ =~ xml
        return false unless /feedburner/ =~ xml

        !(/\<rss|\<rdf/ =~ xml)
      end
    end

  end

end
@@ -0,0 +1,31 @@
1
module Feedzirra

  module Parser
    # Parser for dealing with Feedburner Atom feed entries.
    class AtomFeedBurnerEntry
      include SAXMachine
      include FeedEntryUtilities

      # Declaration order is preserved on purpose: several tags feed the same
      # attribute (url, published, updated).
      element :title
      element :name, :as => :author
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element :"feedburner:origLink", :as => :url
      element :summary
      element :content
      element :published
      element :id, :as => :entry_id
      element :issued, :as => :published
      element :created, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
      elements :link, :as => :links, :value => :href

      # The parsed url when there is one; otherwise the first link of the
      # entry, which is then remembered.
      def url
        @url = links.first unless @url
        @url
      end

    end
  end

end
@@ -0,0 +1,28 @@
1
+ require File.expand_path('./atom', File.dirname(__FILE__))
2
+
3
module Feedzirra
  module Parser
    # Parser for Atom feeds whose <id> points at docs.google.com.
    class GoogleDocsAtom
      include SAXMachine
      include FeedUtilities
      element :title
      element :subtitle, :as => :description
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :link, :as => :links, :value => :href
      elements :entry, :as => :entries, :class => GoogleDocsAtomEntry

      # The text/html link if one was parsed, otherwise the first link seen
      # (memoized).
      def url
        @url ||= links.first
      end

      # Truthy when +xml+ contains an <id> element holding an http(s) URL on
      # docs.google.com. The dots of the host name are matched literally.
      # Returns the match offset, or nil when there is no match.
      def self.able_to_parse?(xml) #:nodoc:
        %r{<id>https?://docs\.google\.com/.*</id>} =~ xml
      end

      # The application/atom+xml link if one was parsed, otherwise the first
      # link seen (memoized).
      def feed_url
        @feed_url ||= links.first
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
module Feedzirra
  module Parser
    # Entry of a Google Docs list feed: the usual Atom entry fields plus the
    # docs:* checksum and file name tags.
    class GoogleDocsAtomEntry
      include SAXMachine
      include FeedEntryUtilities

      # Declaration order is preserved on purpose: several tags feed the same
      # attribute (published, updated).
      element :title
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element :name, :as => :author
      element :content
      element :summary
      element :published
      element :id, :as => :entry_id
      element :created, :as => :published
      element :issued, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
      elements :link, :as => :links, :value => :href

      # Google Docs specific tags
      element :"docs:md5Checksum", :as => :checksum
      element :"docs:filename", :as => :original_filename
      element :"docs:suggestedFilename", :as => :suggested_filename

      # The alternate text/html link when one was parsed; otherwise the first
      # link of the entry, which is then remembered.
      def url
        @url = links.first unless @url
        @url
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
module Feedzirra

  module Parser
    # iTunes is RSS 2.0 + some apple extensions
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    class ITunesRSS
      include SAXMachine
      include FeedUtilities

      attr_accessor :feed_url

      # RSS 2.0 elements that need including
      element :copyright
      element :description
      element :language
      element :managingEditor
      element :title
      element :link, :as => :url

      # If author is not present use managingEditor on the channel
      element :"itunes:author", :as => :itunes_author
      element :"itunes:block", :as => :itunes_block
      element :"itunes:image", :value => :href, :as => :itunes_image
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      # New URL for the podcast feed
      element :"itunes:new-feed-url", :as => :itunes_new_feed_url
      element :"itunes:subtitle", :as => :itunes_subtitle
      # If summary is not present, use the description tag
      element :"itunes:summary", :as => :itunes_summary

      # iTunes RSS feeds can have multiple main categories...
      # ...and multiple sub-categories per category
      # TODO subcategories not supported correctly - they are at the same level
      # as the main categories
      elements :"itunes:category", :as => :itunes_categories, :value => :text

      elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner

      elements :item, :as => :entries, :class => ITunesRSSItem

      # Truthy when +xml+ declares the iTunes podcast namespace
      # (case-insensitive). The attribute value may be quoted with either ' or "
      # and the dots of the URL are matched literally. Returns the match
      # offset, or nil when there is no match.
      def self.able_to_parse?(xml)
        %r{xmlns:itunes\s*=\s*["']http://www\.itunes\.com/dtds/podcast-1\.0\.dtd["']}i =~ xml
      end

    end

  end

end
@@ -0,0 +1,32 @@
1
module Feedzirra

  module Parser
    # iTunes extensions to the standard RSS2.0 item
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    class ITunesRSSItem
      include SAXMachine
      include FeedEntryUtilities

      # Standard RSS 2.0 item fields
      element :author
      element :guid
      element :title
      element :link, :as => :url
      element :description, :as => :summary
      element :pubDate, :as => :published

      # itunes:* tags, each exposed as itunes_<tag>.
      # If itunes:author is not present use the author tag on the item;
      # if itunes:summary is not present, use the description tag.
      %w(author block duration explicit keywords subtitle summary).each do |tag|
        element :"itunes:#{tag}", :as => :"itunes_#{tag}"
      end

      # Attributes of the <enclosure> tag, each exposed as enclosure_<attribute>.
      [:length, :type, :url].each do |attribute|
        element :enclosure, :value => attribute, :as => :"enclosure_#{attribute}"
      end
    end
  end

end
@@ -0,0 +1,12 @@
1
module Feedzirra

  module Parser
    # Contents of an <itunes:owner> block: the itunes:name and itunes:email
    # tags, exposed as +name+ and +email+.
    class ITunesRSSOwner
      include SAXMachine
      include FeedUtilities

      [:name, :email].each do |field|
        element :"itunes:#{field}", :as => field
      end
    end
  end

end
@@ -0,0 +1,22 @@
1
module Feedzirra

  module Parser
    # Parser for dealing with RSS feeds.
    class RSS
      include SAXMachine
      include FeedUtilities
      element :title
      element :description
      element :link, :as => :url
      elements :item, :as => :entries, :class => RSSEntry

      attr_accessor :feed_url

      # nil when +xml+ has no <rss or <rdf tag opener; otherwise true unless
      # the document mentions "feedburner", in which case false.
      def self.able_to_parse?(xml) #:nodoc:
        return nil unless /\<rss|\<rdf/ =~ xml

        !(/feedburner/ =~ xml)
      end
    end

  end

end