feedjira 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (89) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +1 -0
  4. data/.travis.yml +8 -0
  5. data/CHANGELOG.md +162 -0
  6. data/Gemfile +17 -0
  7. data/Guardfile +5 -0
  8. data/README.md +242 -0
  9. data/Rakefile +6 -0
  10. data/benchmarks/README.md +90 -0
  11. data/benchmarks/basic.rb +31 -0
  12. data/benchmarks/feed_list.txt +10 -0
  13. data/benchmarks/feed_xml/apple.xml +149 -0
  14. data/benchmarks/feed_xml/cnn.xml +278 -0
  15. data/benchmarks/feed_xml/daring_fireball.xml +1697 -0
  16. data/benchmarks/feed_xml/engadget.xml +604 -0
  17. data/benchmarks/feed_xml/feedjira_commits.xml +370 -0
  18. data/benchmarks/feed_xml/gizmodo.xml +2 -0
  19. data/benchmarks/feed_xml/loop.xml +441 -0
  20. data/benchmarks/feed_xml/rails.xml +1938 -0
  21. data/benchmarks/feed_xml/white_house.xml +951 -0
  22. data/benchmarks/feed_xml/xkcd.xml +2 -0
  23. data/benchmarks/fetching_systems.rb +23 -0
  24. data/benchmarks/other_libraries.rb +73 -0
  25. data/feedjira.gemspec +27 -0
  26. data/lib/feedjira.rb +16 -0
  27. data/lib/feedjira/core_ext.rb +3 -0
  28. data/lib/feedjira/core_ext/date.rb +19 -0
  29. data/lib/feedjira/core_ext/string.rb +9 -0
  30. data/lib/feedjira/core_ext/time.rb +31 -0
  31. data/lib/feedjira/feed.rb +459 -0
  32. data/lib/feedjira/feed_entry_utilities.rb +66 -0
  33. data/lib/feedjira/feed_utilities.rb +103 -0
  34. data/lib/feedjira/parser.rb +20 -0
  35. data/lib/feedjira/parser/atom.rb +61 -0
  36. data/lib/feedjira/parser/atom_entry.rb +34 -0
  37. data/lib/feedjira/parser/atom_feed_burner.rb +22 -0
  38. data/lib/feedjira/parser/atom_feed_burner_entry.rb +35 -0
  39. data/lib/feedjira/parser/google_docs_atom.rb +28 -0
  40. data/lib/feedjira/parser/google_docs_atom_entry.rb +29 -0
  41. data/lib/feedjira/parser/itunes_rss.rb +50 -0
  42. data/lib/feedjira/parser/itunes_rss_item.rb +41 -0
  43. data/lib/feedjira/parser/itunes_rss_owner.rb +12 -0
  44. data/lib/feedjira/parser/rss.rb +24 -0
  45. data/lib/feedjira/parser/rss_entry.rb +37 -0
  46. data/lib/feedjira/parser/rss_feed_burner.rb +23 -0
  47. data/lib/feedjira/parser/rss_feed_burner_entry.rb +43 -0
  48. data/lib/feedjira/version.rb +3 -0
  49. data/spec/feedjira/feed_entry_utilities_spec.rb +62 -0
  50. data/spec/feedjira/feed_spec.rb +762 -0
  51. data/spec/feedjira/feed_utilities_spec.rb +273 -0
  52. data/spec/feedjira/parser/atom_entry_spec.rb +86 -0
  53. data/spec/feedjira/parser/atom_feed_burner_entry_spec.rb +47 -0
  54. data/spec/feedjira/parser/atom_feed_burner_spec.rb +56 -0
  55. data/spec/feedjira/parser/atom_spec.rb +76 -0
  56. data/spec/feedjira/parser/google_docs_atom_entry_spec.rb +22 -0
  57. data/spec/feedjira/parser/google_docs_atom_spec.rb +31 -0
  58. data/spec/feedjira/parser/itunes_rss_item_spec.rb +63 -0
  59. data/spec/feedjira/parser/itunes_rss_owner_spec.rb +18 -0
  60. data/spec/feedjira/parser/itunes_rss_spec.rb +58 -0
  61. data/spec/feedjira/parser/rss_entry_spec.rb +85 -0
  62. data/spec/feedjira/parser/rss_feed_burner_entry_spec.rb +85 -0
  63. data/spec/feedjira/parser/rss_feed_burner_spec.rb +57 -0
  64. data/spec/feedjira/parser/rss_spec.rb +57 -0
  65. data/spec/sample_feeds/AmazonWebServicesBlog.xml +797 -0
  66. data/spec/sample_feeds/AmazonWebServicesBlogFirstEntryContent.xml +63 -0
  67. data/spec/sample_feeds/AtomFeedWithSpacesAroundEquals.xml +61 -0
  68. data/spec/sample_feeds/FeedBurnerUrlNoAlternate.xml +28 -0
  69. data/spec/sample_feeds/GoogleDocsList.xml +188 -0
  70. data/spec/sample_feeds/HREFConsideredHarmful.xml +314 -0
  71. data/spec/sample_feeds/HREFConsideredHarmfulFirstEntry.xml +22 -0
  72. data/spec/sample_feeds/ITunesWithSpacesInAttributes.xml +63 -0
  73. data/spec/sample_feeds/PaulDixExplainsNothing.xml +175 -0
  74. data/spec/sample_feeds/PaulDixExplainsNothingAlternate.xml +175 -0
  75. data/spec/sample_feeds/PaulDixExplainsNothingFirstEntryContent.xml +19 -0
  76. data/spec/sample_feeds/PaulDixExplainsNothingWFW.xml +174 -0
  77. data/spec/sample_feeds/SamRuby.xml +583 -0
  78. data/spec/sample_feeds/TechCrunch.xml +1515 -0
  79. data/spec/sample_feeds/TechCrunchFirstEntry.xml +9 -0
  80. data/spec/sample_feeds/TechCrunchFirstEntryDescription.xml +3 -0
  81. data/spec/sample_feeds/TenderLovemaking.xml +516 -0
  82. data/spec/sample_feeds/TenderLovemakingFirstEntry.xml +66 -0
  83. data/spec/sample_feeds/TrotterCashionHome.xml +611 -0
  84. data/spec/sample_feeds/TypePadNews.xml +368 -0
  85. data/spec/sample_feeds/atom_with_link_tag_for_url_unmarked.xml +31 -0
  86. data/spec/sample_feeds/itunes.xml +67 -0
  87. data/spec/sample_feeds/pet_atom.xml +497 -0
  88. data/spec/spec_helper.rb +88 -0
  89. metadata +229 -0
@@ -0,0 +1,66 @@
module Feedjira
  # Helpers mixed into every parsed feed entry: date handling, an id
  # fallback, in-place sanitizing, and Enumerable / Hash-style access to the
  # entry's instance variables.
  module FeedEntryUtilities

    include Enumerable

    # Returns the stored publication time. When none is stored, the update
    # time is used instead and cached as the publication time.
    def published
      @published ||= @updated
    end

    # Parses +string+ with DateTime.parse and converts the result with
    # +feed_utils_to_gm_time+.
    #
    # Returns nil, after printing a warning, when any StandardError is raised
    # while parsing or converting.
    def parse_datetime(string)
      DateTime.parse(string).feed_utils_to_gm_time
    rescue StandardError
      warn "Failed to parse date #{string.inspect}"
      nil
    end

    ##
    # Returns the id of the entry, or its url if no id is present, as some
    # formats don't support ids.
    def id
      @entry_id ||= @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time found.
    # A value that cannot be parsed is ignored, so an already stored time is
    # never compared against nil.
    def published=(val)
      parsed = parse_datetime(val)
      return unless parsed

      @published = parsed if !@published || parsed < @published
    end

    ##
    # Writer for updated. By default, we keep the most recent update time found.
    # A value that cannot be parsed is ignored, so an already stored time is
    # never compared against nil.
    def updated=(val)
      parsed = parse_datetime(val)
      return unless parsed

      @updated = parsed if !@updated || parsed > @updated
    end

    # Calls +sanitize!+ on the title, author, summary, content and image
    # values, skipping any field this entry does not respond to and any value
    # that does not respond to +sanitize!+.
    def sanitize!
      %w[title author summary content image].each do |name|
        next unless respond_to?(name)

        value = send(name)
        value.sanitize! if value.respond_to?(:sanitize!)
      end
    end

    alias_method :last_modified, :published

    # Yields each field name (instance variable name without the "@") together
    # with its value. The list of instance variables is captured on the first
    # block call and reused afterwards.
    #
    # Returns an Enumerator when no block is given.
    def each
      return enum_for(:each) unless block_given?

      @rss_fields ||= instance_variables

      @rss_fields.each do |field|
        yield(field.to_s.sub('@', ''), instance_variable_get(field))
      end
    end

    # Reads the instance variable named after +field+.
    def [](field)
      instance_variable_get("@#{field}")
    end

    # Writes +value+ to the instance variable named after +field+.
    def []=(field, value)
      instance_variable_set("@#{field}", value)
    end

  end
end
@@ -0,0 +1,103 @@
module Feedjira
  # Feed-level helpers mixed into the parser classes: optional XML
  # preprocessing at class level, plus bookkeeping for new entries and for
  # attributes that changed when a feed is refreshed.
  module FeedUtilities
    # Attributes copied from a freshly fetched feed by #update_from_feed.
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified etag)

    attr_writer :new_entries, :updated, :last_modified
    attr_accessor :etag

    # Adds the ClassMethods below to every class that includes this module.
    def self.included(base)
      base.extend ClassMethods
    end

    module ClassMethods
      # Runs +preprocess+ on +xml+ when +preprocess_xml+ is truthy, strips
      # leading whitespace and passes the result (and the block) on to the
      # next +parse+ in the ancestor chain.
      def parse(xml, &block)
        xml = preprocess(xml) if preprocess_xml
        super xml.lstrip, &block
      end

      # Default preprocessing step: returns +xml+ unchanged.
      def preprocess(xml)
        xml
      end

      # Per-class flag read by +parse+ to decide whether to preprocess.
      attr_accessor :preprocess_xml
    end

    # Returns the explicitly assigned value, or otherwise the latest
    # +published+ value among the entries (nil when no entry has one).
    def last_modified
      @last_modified ||= entries.map(&:published).compact.max
    end

    # True when the last #update_from_feed changed at least one attribute.
    def updated?
      @updated || false
    end

    # Entries collected by #update_from_feed (an empty array by default).
    def new_entries
      @new_entries ||= []
    end

    def has_new_entries?
      !new_entries.empty?
    end

    # Merges a freshly parsed +feed+ into this one: collects its new entries,
    # prepends them to +entries+, and copies every attribute listed in
    # UPDATABLE_ATTRIBUTES whose value differs.
    #
    # NOTE(review): +new_entries+ accumulates across calls and the whole list
    # is prepended each time - confirm callers reset it between updates.
    def update_from_feed(feed)
      self.new_entries += find_new_entries_for(feed)
      entries.unshift(*new_entries)

      @updated = false

      UPDATABLE_ATTRIBUTES.each do |name|
        # Call update_attribute unconditionally; folding it into `||=` would
        # skip every attribute after the first one that changed.
        changed = update_attribute(feed, name)
        @updated ||= changed
      end
    end

    # Copies attribute +name+ from +feed+ when it differs from the current
    # value. Returns true when a copy happened, false otherwise.
    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)

      if old_value != new_value
        send("#{name}=", new_value)
        true
      else
        false
      end
    end

    # Calls +sanitize!+ on every entry.
    def sanitize_entries!
      entries.each { |entry| entry.sanitize! }
    end

    private

    def find_new_entries_for(feed)
      # this implementation is a hack, which is why it's so ugly.
      # it's to get around the fact that not all feeds have a published date.
      # however, they're always ordered with the newest one first.
      # So we go through the entries just parsed and insert each one as a new entry
      # until we get to one that has the same id as the newest for the feed
      return feed.entries if entries.empty?

      latest_entry = entries.first
      found_new_entries = []
      feed.entries.each do |entry|
        if entry.entry_id.nil? && latest_entry.entry_id.nil?
          # Two nil ids would compare equal, so only the urls are compared.
          break if entry.url == latest_entry.url
        else
          break if entry.entry_id == latest_entry.entry_id || entry.url == latest_entry.url
        end
        found_new_entries << entry
      end
      found_new_entries
    end

    def existing_entry?(test_entry)
      entries.any? { |entry| entry.id == test_entry.id }
    end
  end
end
@@ -0,0 +1,20 @@
module Feedjira
  # Namespace for the format-specific parsers. Every parser constant is
  # registered with autoload against the path listed next to it.
  module Parser
    {
      :RSS                 => 'feedjira/parser/rss',
      :RSSEntry            => 'feedjira/parser/rss_entry',
      :RSSFeedBurner       => 'feedjira/parser/rss_feed_burner',
      :RSSFeedBurnerEntry  => 'feedjira/parser/rss_feed_burner_entry',

      :ITunesRSS           => 'feedjira/parser/itunes_rss',
      :ITunesRSSItem       => 'feedjira/parser/itunes_rss_item',
      :ITunesRSSOwner      => 'feedjira/parser/itunes_rss_owner',

      :GoogleDocsAtom      => 'feedjira/parser/google_docs_atom',
      :GoogleDocsAtomEntry => 'feedjira/parser/google_docs_atom_entry',

      :Atom                => 'feedjira/parser/atom',
      :AtomEntry           => 'feedjira/parser/atom_entry',
      :AtomFeedBurner      => 'feedjira/parser/atom_feed_burner',
      :AtomFeedBurnerEntry => 'feedjira/parser/atom_feed_burner_entry'
    }.each do |constant_name, path|
      autoload constant_name, path
    end
  end
end
@@ -0,0 +1,61 @@
module Feedjira
  module Parser
    # Parser for dealing with Atom feeds.
    class Atom
      include SAXMachine
      include FeedUtilities

      # SAXMachine mappings from feed-level Atom tags to attributes.
      element :title
      element :subtitle, :as => :description
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :link, :as => :links, :value => :href
      elements :link, :as => :hubs, :value => :href, :with => {:rel => "hub"}
      elements :entry, :as => :entries, :class => AtomEntry

      # Matches when +xml+ contains a <feed> tag declaring one of the two Atom
      # namespaces, quoted with either single or double quotes. Returns the
      # match position or nil.
      def self.able_to_parse?(xml) #:nodoc:
        # The quote class is ["'] only: a "|" inside a character class is a
        # literal pipe, not an alternation.
        atom_feed_tag = %r{<feed[^>]+xmlns\s?=\s?["'](http://www\.w3\.org/2005/Atom|http://purl\.org/atom/ns#)["'][^>]*>}
        atom_feed_tag =~ xml
      end

      # The text/html link when one was parsed, otherwise the last link found.
      def url
        @url || links.last
      end

      # The application/atom+xml link when one was parsed, otherwise the first
      # link found (cached once resolved).
      def feed_url
        @feed_url ||= links.first
      end

      # Preprocessing hook used by FeedUtilities' +parse+ when
      # +preprocess_xml+ is enabled.
      def self.preprocess(xml)
        Preprocessor.new(xml).to_xml
      end

      # Rewrites xhtml entry content: every non-CDATA node matching
      # 'entry > content[type="xhtml"]' gets its content replaced by its own
      # inner HTML passed through CGI.unescape_html.
      class Preprocessor
        def initialize(xml)
          @xml = xml
        end

        # Returns the processed document serialized back to XML.
        def to_xml
          process_nodes
          doc.to_xml
        end

        private

        def process_nodes
          nodes.each { |node| node.content = raw_html(node) unless node.cdata? }
        end

        def nodes
          doc.search 'entry > content[type="xhtml"]'
        end

        def raw_html(node)
          CGI.unescape_html node.inner_html
        end

        # Parsed Nokogiri document, built once per preprocessor.
        def doc
          @doc ||= Nokogiri::XML @xml
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
module Feedjira

  module Parser
    # Parser for dealing with Atom feed entries.
    class AtomEntry
      include SAXMachine
      include FeedEntryUtilities

      # Core entry fields.
      element :title
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element :name, :as => :author
      element :content
      element :summary

      # Two possible sources for the entry image.
      element :"media:content", :as => :image, :value => :url
      element :enclosure, :as => :image, :value => :href

      # Identity, dates (several tag names feed the same attribute),
      # categories and the full list of links.
      element :published
      element :id, :as => :entry_id
      element :created, :as => :published
      element :issued, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
      elements :link, :as => :links, :value => :href

      # The parsed alternate text/html link, or else the first link of the
      # entry (cached once resolved).
      def url
        return @url if @url

        @url = links.first
      end
    end

  end

end
@@ -0,0 +1,22 @@
module Feedjira

  module Parser
    # Parser for dealing with Feedburner Atom feeds.
    class AtomFeedBurner
      include SAXMachine
      include FeedUtilities

      # SAXMachine mappings from feed-level tags to attributes.
      element :title
      element :subtitle, :as => :description
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :"atom10:link", :as => :hubs, :value => :href, :with => {:rel => "hub"}
      elements :entry, :as => :entries, :class => AtomFeedBurnerEntry

      # True only when +xml+ mentions both "Atom" and "feedburner" and
      # contains neither "<rss" nor "<rdf"; false otherwise.
      def self.able_to_parse?(xml) #:nodoc:
        return false unless /Atom/ =~ xml
        return false unless /feedburner/ =~ xml

        !(/\<rss|\<rdf/ =~ xml)
      end
    end

  end

end
@@ -0,0 +1,35 @@
module Feedjira

  module Parser
    # Parser for dealing with Feedburner Atom feed entries.
    class AtomFeedBurnerEntry
      include SAXMachine
      include FeedEntryUtilities

      # Core entry fields; both the alternate link and feedburner:origLink
      # feed the url attribute.
      element :title
      element :name, :as => :author
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element :"feedburner:origLink", :as => :url
      element :summary
      element :content

      # Two possible sources for the entry image.
      element :"media:content", :as => :image, :value => :url
      element :enclosure, :as => :image, :value => :href

      # Identity, dates (several tag names feed the same attribute),
      # categories and the full list of links.
      element :published
      element :id, :as => :entry_id
      element :issued, :as => :published
      element :created, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
      elements :link, :as => :links, :value => :href

      # The parsed url, or else the first link of the entry (cached once
      # resolved).
      def url
        return @url if @url

        @url = links.first
      end

    end
  end

end
@@ -0,0 +1,28 @@
require File.expand_path('./atom', File.dirname(__FILE__))

module Feedjira
  module Parser
    # Parser for Atom feeds whose <id> points at docs.google.com.
    class GoogleDocsAtom
      include SAXMachine
      include FeedUtilities

      # SAXMachine mappings from feed-level tags to attributes.
      element :title
      element :subtitle, :as => :description
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :link, :as => :links, :value => :href
      elements :entry, :as => :entries, :class => GoogleDocsAtomEntry

      # The text/html link when one was parsed, otherwise the first link
      # found (cached once resolved).
      def url
        @url ||= links.first
      end

      # Matches when +xml+ contains an <id> element holding an http or https
      # docs.google.com URL. Returns the match position or nil.
      def self.able_to_parse?(xml) #:nodoc:
        # Dots are escaped so that only the literal host name matches.
        %r{<id>https?://docs\.google\.com/.*</id>} =~ xml
      end

      # The application/atom+xml link when one was parsed, otherwise the first
      # link found (cached once resolved).
      def feed_url
        @feed_url ||= links.first
      end
    end
  end
end
@@ -0,0 +1,29 @@
module Feedjira
  module Parser
    # Parser for the entries of a Google Docs Atom feed.
    class GoogleDocsAtomEntry
      include SAXMachine
      include FeedEntryUtilities

      # Core entry fields.
      element :title
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      element :name, :as => :author
      element :content
      element :summary

      # Identity, dates (several tag names feed the same attribute),
      # categories and the full list of links.
      element :published
      element :id, :as => :entry_id
      element :created, :as => :published
      element :issued, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
      elements :link, :as => :links, :value => :href

      # Tags from the docs: namespace.
      element :"docs:md5Checksum", :as => :checksum
      element :"docs:filename", :as => :original_filename
      element :"docs:suggestedFilename", :as => :suggested_filename

      # The parsed alternate text/html link, or else the first link of the
      # entry (cached once resolved).
      def url
        return @url if @url

        @url = links.first
      end
    end
  end
end
@@ -0,0 +1,50 @@
module Feedjira

  module Parser
    # iTunes is RSS 2.0 + some apple extensions
    # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
    class ITunesRSS
      include SAXMachine
      include FeedUtilities

      attr_accessor :feed_url

      # RSS 2.0 elements that need including
      element :copyright
      element :description
      element :language
      element :managingEditor
      element :title
      element :link, :as => :url

      # If author is not present use managingEditor on the channel
      element :"itunes:author", :as => :itunes_author
      element :"itunes:block", :as => :itunes_block
      element :"itunes:image", :value => :href, :as => :itunes_image
      element :"itunes:explicit", :as => :itunes_explicit
      element :"itunes:keywords", :as => :itunes_keywords
      # New URL for the podcast feed
      element :"itunes:new-feed-url", :as => :itunes_new_feed_url
      element :"itunes:subtitle", :as => :itunes_subtitle
      # If summary is not present, use the description tag
      element :"itunes:summary", :as => :itunes_summary

      # iTunes RSS feeds can have multiple main categories...
      # ...and multiple sub-categories per category
      # TODO subcategories not supported correctly - they are at the same level
      #   as the main categories
      elements :"itunes:category", :as => :itunes_categories, :value => :text

      elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner

      elements :item, :as => :entries, :class => ITunesRSSItem

      # Matches (case-insensitively) when +xml+ declares the iTunes podcast
      # namespace. The declaration may be quoted with single or double quotes.
      # Returns the match position or nil.
      def self.able_to_parse?(xml)
        # Dots are escaped so that only the literal namespace URL matches.
        %r{xmlns:itunes\s?=\s?["']http://www\.itunes\.com/dtds/podcast-1\.0\.dtd["']}i =~ xml
      end

    end

  end

end