feedzirra 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/Gemfile +0 -14
  4. data/README.md +2 -241
  5. data/feedzirra.gemspec +2 -8
  6. data/lib/feedzirra.rb +2 -15
  7. data/lib/feedzirra/version.rb +1 -1
  8. metadata +7 -182
  9. data/.rspec +0 -1
  10. data/.travis.yml +0 -8
  11. data/Guardfile +0 -5
  12. data/Rakefile +0 -6
  13. data/benchmarks/README.md +0 -90
  14. data/benchmarks/basic.rb +0 -31
  15. data/benchmarks/feed_list.txt +0 -10
  16. data/benchmarks/feed_xml/apple.xml +0 -149
  17. data/benchmarks/feed_xml/cnn.xml +0 -278
  18. data/benchmarks/feed_xml/daring_fireball.xml +0 -1697
  19. data/benchmarks/feed_xml/engadget.xml +0 -604
  20. data/benchmarks/feed_xml/feedzirra_commits.xml +0 -370
  21. data/benchmarks/feed_xml/gizmodo.xml +0 -2
  22. data/benchmarks/feed_xml/loop.xml +0 -441
  23. data/benchmarks/feed_xml/rails.xml +0 -1938
  24. data/benchmarks/feed_xml/white_house.xml +0 -951
  25. data/benchmarks/feed_xml/xkcd.xml +0 -2
  26. data/benchmarks/fetching_systems.rb +0 -23
  27. data/benchmarks/other_libraries.rb +0 -73
  28. data/lib/feedzirra/core_ext.rb +0 -3
  29. data/lib/feedzirra/core_ext/date.rb +0 -19
  30. data/lib/feedzirra/core_ext/string.rb +0 -9
  31. data/lib/feedzirra/core_ext/time.rb +0 -31
  32. data/lib/feedzirra/feed.rb +0 -459
  33. data/lib/feedzirra/feed_entry_utilities.rb +0 -66
  34. data/lib/feedzirra/feed_utilities.rb +0 -103
  35. data/lib/feedzirra/parser.rb +0 -20
  36. data/lib/feedzirra/parser/atom.rb +0 -61
  37. data/lib/feedzirra/parser/atom_entry.rb +0 -34
  38. data/lib/feedzirra/parser/atom_feed_burner.rb +0 -22
  39. data/lib/feedzirra/parser/atom_feed_burner_entry.rb +0 -35
  40. data/lib/feedzirra/parser/google_docs_atom.rb +0 -28
  41. data/lib/feedzirra/parser/google_docs_atom_entry.rb +0 -29
  42. data/lib/feedzirra/parser/itunes_rss.rb +0 -50
  43. data/lib/feedzirra/parser/itunes_rss_item.rb +0 -41
  44. data/lib/feedzirra/parser/itunes_rss_owner.rb +0 -12
  45. data/lib/feedzirra/parser/rss.rb +0 -24
  46. data/lib/feedzirra/parser/rss_entry.rb +0 -37
  47. data/lib/feedzirra/parser/rss_feed_burner.rb +0 -23
  48. data/lib/feedzirra/parser/rss_feed_burner_entry.rb +0 -43
  49. data/spec/feedzirra/feed_entry_utilities_spec.rb +0 -62
  50. data/spec/feedzirra/feed_spec.rb +0 -762
  51. data/spec/feedzirra/feed_utilities_spec.rb +0 -273
  52. data/spec/feedzirra/parser/atom_entry_spec.rb +0 -86
  53. data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +0 -47
  54. data/spec/feedzirra/parser/atom_feed_burner_spec.rb +0 -56
  55. data/spec/feedzirra/parser/atom_spec.rb +0 -76
  56. data/spec/feedzirra/parser/google_docs_atom_entry_spec.rb +0 -22
  57. data/spec/feedzirra/parser/google_docs_atom_spec.rb +0 -31
  58. data/spec/feedzirra/parser/itunes_rss_item_spec.rb +0 -63
  59. data/spec/feedzirra/parser/itunes_rss_owner_spec.rb +0 -18
  60. data/spec/feedzirra/parser/itunes_rss_spec.rb +0 -58
  61. data/spec/feedzirra/parser/rss_entry_spec.rb +0 -85
  62. data/spec/feedzirra/parser/rss_feed_burner_entry_spec.rb +0 -85
  63. data/spec/feedzirra/parser/rss_feed_burner_spec.rb +0 -57
  64. data/spec/feedzirra/parser/rss_spec.rb +0 -57
  65. data/spec/sample_feeds/AmazonWebServicesBlog.xml +0 -797
  66. data/spec/sample_feeds/AmazonWebServicesBlogFirstEntryContent.xml +0 -63
  67. data/spec/sample_feeds/AtomFeedWithSpacesAroundEquals.xml +0 -61
  68. data/spec/sample_feeds/FeedBurnerUrlNoAlternate.xml +0 -28
  69. data/spec/sample_feeds/GoogleDocsList.xml +0 -188
  70. data/spec/sample_feeds/HREFConsideredHarmful.xml +0 -314
  71. data/spec/sample_feeds/HREFConsideredHarmfulFirstEntry.xml +0 -22
  72. data/spec/sample_feeds/ITunesWithSpacesInAttributes.xml +0 -63
  73. data/spec/sample_feeds/PaulDixExplainsNothing.xml +0 -175
  74. data/spec/sample_feeds/PaulDixExplainsNothingAlternate.xml +0 -175
  75. data/spec/sample_feeds/PaulDixExplainsNothingFirstEntryContent.xml +0 -19
  76. data/spec/sample_feeds/PaulDixExplainsNothingWFW.xml +0 -174
  77. data/spec/sample_feeds/SamRuby.xml +0 -583
  78. data/spec/sample_feeds/TechCrunch.xml +0 -1515
  79. data/spec/sample_feeds/TechCrunchFirstEntry.xml +0 -9
  80. data/spec/sample_feeds/TechCrunchFirstEntryDescription.xml +0 -3
  81. data/spec/sample_feeds/TenderLovemaking.xml +0 -516
  82. data/spec/sample_feeds/TenderLovemakingFirstEntry.xml +0 -66
  83. data/spec/sample_feeds/TrotterCashionHome.xml +0 -611
  84. data/spec/sample_feeds/TypePadNews.xml +0 -368
  85. data/spec/sample_feeds/atom_with_link_tag_for_url_unmarked.xml +0 -31
  86. data/spec/sample_feeds/itunes.xml +0 -67
  87. data/spec/sample_feeds/pet_atom.xml +0 -497
  88. data/spec/spec_helper.rb +0 -88
@@ -1,66 +0,0 @@
1
- module Feedzirra
2
- module FeedEntryUtilities
3
-
4
- include Enumerable
5
-
6
- def published
7
- @published ||= @updated
8
- end
9
-
10
- def parse_datetime(string)
11
- begin
12
- DateTime.parse(string).feed_utils_to_gm_time
13
- rescue
14
- warn "Failed to parse date #{string.inspect}"
15
- nil
16
- end
17
- end
18
-
19
- ##
20
- # Returns the id of the entry or its url if not id is present, as some formats don't support it
21
- def id
22
- @entry_id ||= @url
23
- end
24
-
25
- ##
26
- # Writer for published. By default, we keep the "oldest" publish time found.
27
- def published=(val)
28
- parsed = parse_datetime(val)
29
- @published = parsed if !@published || parsed < @published
30
- end
31
-
32
- ##
33
- # Writer for updated. By default, we keep the most recent update time found.
34
- def updated=(val)
35
- parsed = parse_datetime(val)
36
- @updated = parsed if !@updated || parsed > @updated
37
- end
38
-
39
- def sanitize!
40
- %w[title author summary content image].each do |name|
41
- if self.respond_to?(name) && self.send(name).respond_to?(:sanitize!)
42
- self.send(name).send :sanitize!
43
- end
44
- end
45
- end
46
-
47
- alias_method :last_modified, :published
48
-
49
- def each
50
- @rss_fields ||= self.instance_variables
51
-
52
- @rss_fields.each do |field|
53
- yield(field.to_s.sub('@', ''), self.instance_variable_get(field))
54
- end
55
- end
56
-
57
- def [](field)
58
- self.instance_variable_get("@#{field.to_s}")
59
- end
60
-
61
- def []=(field, value)
62
- self.instance_variable_set("@#{field.to_s}", value)
63
- end
64
-
65
- end
66
- end
@@ -1,103 +0,0 @@
1
- module Feedzirra
2
- module FeedUtilities
3
- UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified etag)
4
-
5
- attr_writer :new_entries, :updated, :last_modified
6
- attr_accessor :etag
7
-
8
- def self.included(base)
9
- base.extend ClassMethods
10
- end
11
-
12
- module ClassMethods
13
- def parse(xml, &block)
14
- xml = preprocess(xml) if preprocess_xml
15
- super xml.lstrip, &block
16
- end
17
-
18
- def preprocess(xml)
19
- # noop
20
- xml
21
- end
22
-
23
- def preprocess_xml=(value)
24
- @preprocess_xml = value
25
- end
26
-
27
- def preprocess_xml
28
- @preprocess_xml
29
- end
30
- end
31
-
32
- def last_modified
33
- @last_modified ||= begin
34
- entry = entries.reject {|e| e.published.nil? }.sort_by { |entry| entry.published if entry.published }.last
35
- entry ? entry.published : nil
36
- end
37
- end
38
-
39
- def updated?
40
- @updated || false
41
- end
42
-
43
- def new_entries
44
- @new_entries ||= []
45
- end
46
-
47
- def has_new_entries?
48
- new_entries.size > 0
49
- end
50
-
51
- def update_from_feed(feed)
52
- self.new_entries += find_new_entries_for(feed)
53
- self.entries.unshift(*self.new_entries)
54
-
55
- @updated = false
56
-
57
- UPDATABLE_ATTRIBUTES.each do |name|
58
- @updated ||= update_attribute(feed, name)
59
- end
60
- end
61
-
62
- def update_attribute(feed, name)
63
- old_value, new_value = send(name), feed.send(name)
64
-
65
- if old_value != new_value
66
- send("#{name}=", new_value)
67
- true
68
- else
69
- false
70
- end
71
- end
72
-
73
- def sanitize_entries!
74
- entries.each {|entry| entry.sanitize!}
75
- end
76
-
77
- private
78
-
79
- def find_new_entries_for(feed)
80
- # this implementation is a hack, which is why it's so ugly.
81
- # it's to get around the fact that not all feeds have a published date.
82
- # however, they're always ordered with the newest one first.
83
- # So we go through the entries just parsed and insert each one as a new entry
84
- # until we get to one that has the same id as the the newest for the feed
85
- return feed.entries if self.entries.length == 0
86
- latest_entry = self.entries.first
87
- found_new_entries = []
88
- feed.entries.each do |entry|
89
- if entry.entry_id.nil? && latest_entry.entry_id.nil?
90
- break if entry.url == latest_entry.url
91
- else
92
- break if entry.entry_id == latest_entry.entry_id || entry.url == latest_entry.url
93
- end
94
- found_new_entries << entry
95
- end
96
- found_new_entries
97
- end
98
-
99
- def existing_entry?(test_entry)
100
- entries.any? { |entry| entry.id == test_entry.id }
101
- end
102
- end
103
- end
@@ -1,20 +0,0 @@
1
- module Feedzirra
2
- module Parser
3
- autoload :RSS, 'feedzirra/parser/rss'
4
- autoload :RSSEntry, 'feedzirra/parser/rss_entry'
5
- autoload :RSSFeedBurner, 'feedzirra/parser/rss_feed_burner'
6
- autoload :RSSFeedBurnerEntry, 'feedzirra/parser/rss_feed_burner_entry'
7
-
8
- autoload :ITunesRSS, 'feedzirra/parser/itunes_rss'
9
- autoload :ITunesRSSItem, 'feedzirra/parser/itunes_rss_item'
10
- autoload :ITunesRSSOwner, 'feedzirra/parser/itunes_rss_owner'
11
-
12
- autoload :GoogleDocsAtom, 'feedzirra/parser/google_docs_atom'
13
- autoload :GoogleDocsAtomEntry, 'feedzirra/parser/google_docs_atom_entry'
14
-
15
- autoload :Atom, 'feedzirra/parser/atom'
16
- autoload :AtomEntry, 'feedzirra/parser/atom_entry'
17
- autoload :AtomFeedBurner, 'feedzirra/parser/atom_feed_burner'
18
- autoload :AtomFeedBurnerEntry, 'feedzirra/parser/atom_feed_burner_entry'
19
- end
20
- end
@@ -1,61 +0,0 @@
1
- module Feedzirra
2
- module Parser
3
- # Parser for dealing with Atom feeds.
4
- class Atom
5
- include SAXMachine
6
- include FeedUtilities
7
- element :title
8
- element :subtitle, :as => :description
9
- element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
10
- element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
11
- elements :link, :as => :links, :value => :href
12
- elements :link, :as => :hubs, :value => :href, :with => {:rel => "hub"}
13
- elements :entry, :as => :entries, :class => AtomEntry
14
-
15
- def self.able_to_parse?(xml) #:nodoc:
16
- /\<feed[^\>]+xmlns\s?=\s?[\"|\'](http:\/\/www\.w3\.org\/2005\/Atom|http:\/\/purl\.org\/atom\/ns\#)[\"|\'][^\>]*\>/ =~ xml
17
- end
18
-
19
- def url
20
- @url || links.last
21
- end
22
-
23
- def feed_url
24
- @feed_url ||= links.first
25
- end
26
-
27
- def self.preprocess(xml)
28
- Preprocessor.new(xml).to_xml
29
- end
30
-
31
- class Preprocessor
32
- def initialize(xml)
33
- @xml = xml
34
- end
35
-
36
- def to_xml
37
- process_nodes
38
- doc.to_xml
39
- end
40
-
41
- private
42
-
43
- def process_nodes
44
- nodes.each { |node| node.content = raw_html(node) unless node.cdata? }
45
- end
46
-
47
- def nodes
48
- doc.search 'entry > content[type="xhtml"]'
49
- end
50
-
51
- def raw_html(node)
52
- CGI.unescape_html node.inner_html
53
- end
54
-
55
- def doc
56
- @doc ||= Nokogiri::XML @xml
57
- end
58
- end
59
- end
60
- end
61
- end
@@ -1,34 +0,0 @@
1
- module Feedzirra
2
-
3
- module Parser
4
- # Parser for dealing with Atom feed entries.
5
- class AtomEntry
6
- include SAXMachine
7
- include FeedEntryUtilities
8
-
9
- element :title
10
- element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
11
- element :name, :as => :author
12
- element :content
13
- element :summary
14
-
15
- element :"media:content", :as => :image, :value => :url
16
- element :enclosure, :as => :image, :value => :href
17
-
18
- element :published
19
- element :id, :as => :entry_id
20
- element :created, :as => :published
21
- element :issued, :as => :published
22
- element :updated
23
- element :modified, :as => :updated
24
- elements :category, :as => :categories, :value => :term
25
- elements :link, :as => :links, :value => :href
26
-
27
- def url
28
- @url ||= links.first
29
- end
30
- end
31
-
32
- end
33
-
34
- end
@@ -1,22 +0,0 @@
1
- module Feedzirra
2
-
3
- module Parser
4
- # Parser for dealing with Feedburner Atom feeds.
5
- class AtomFeedBurner
6
- include SAXMachine
7
- include FeedUtilities
8
- element :title
9
- element :subtitle, :as => :description
10
- element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
11
- element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
12
- elements :"atom10:link", :as => :hubs, :value => :href, :with => {:rel => "hub"}
13
- elements :entry, :as => :entries, :class => AtomFeedBurnerEntry
14
-
15
- def self.able_to_parse?(xml) #:nodoc:
16
- ((/Atom/ =~ xml) && (/feedburner/ =~ xml) && !(/\<rss|\<rdf/ =~ xml)) || false
17
- end
18
- end
19
-
20
- end
21
-
22
- end
@@ -1,35 +0,0 @@
1
- module Feedzirra
2
-
3
- module Parser
4
- # Parser for dealing with Feedburner Atom feed entries.
5
- class AtomFeedBurnerEntry
6
- include SAXMachine
7
- include FeedEntryUtilities
8
-
9
- element :title
10
- element :name, :as => :author
11
- element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
12
- element :"feedburner:origLink", :as => :url
13
- element :summary
14
- element :content
15
-
16
- element :"media:content", :as => :image, :value => :url
17
- element :enclosure, :as => :image, :value => :href
18
-
19
- element :published
20
- element :id, :as => :entry_id
21
- element :issued, :as => :published
22
- element :created, :as => :published
23
- element :updated
24
- element :modified, :as => :updated
25
- elements :category, :as => :categories, :value => :term
26
- elements :link, :as => :links, :value => :href
27
-
28
- def url
29
- @url ||= links.first
30
- end
31
-
32
- end
33
- end
34
-
35
- end
@@ -1,28 +0,0 @@
1
- require File.expand_path('./atom', File.dirname(__FILE__))
2
-
3
- module Feedzirra
4
- module Parser
5
- class GoogleDocsAtom
6
- include SAXMachine
7
- include FeedUtilities
8
- element :title
9
- element :subtitle, :as => :description
10
- element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
11
- element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
12
- elements :link, :as => :links, :value => :href
13
- elements :entry, :as => :entries, :class => GoogleDocsAtomEntry
14
-
15
- def url
16
- @url ||= links.first
17
- end
18
-
19
- def self.able_to_parse?(xml) #:nodoc:
20
- %r{<id>https?://docs.google.com/.*\</id\>} =~ xml
21
- end
22
-
23
- def feed_url
24
- @feed_url ||= links.first
25
- end
26
- end
27
- end
28
- end
@@ -1,29 +0,0 @@
1
- module Feedzirra
2
- module Parser
3
- class GoogleDocsAtomEntry
4
- include SAXMachine
5
- include FeedEntryUtilities
6
-
7
- element :title
8
- element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
9
- element :name, :as => :author
10
- element :content
11
- element :summary
12
- element :published
13
- element :id, :as => :entry_id
14
- element :created, :as => :published
15
- element :issued, :as => :published
16
- element :updated
17
- element :modified, :as => :updated
18
- elements :category, :as => :categories, :value => :term
19
- elements :link, :as => :links, :value => :href
20
- element :"docs:md5Checksum", :as => :checksum
21
- element :"docs:filename", :as => :original_filename
22
- element :"docs:suggestedFilename", :as => :suggested_filename
23
-
24
- def url
25
- @url ||= links.first
26
- end
27
- end
28
- end
29
- end
@@ -1,50 +0,0 @@
1
- module Feedzirra
2
-
3
- module Parser
4
- # iTunes is RSS 2.0 + some apple extensions
5
- # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
6
- class ITunesRSS
7
- include SAXMachine
8
- include FeedUtilities
9
-
10
- attr_accessor :feed_url
11
-
12
- # RSS 2.0 elements that need including
13
- element :copyright
14
- element :description
15
- element :language
16
- element :managingEditor
17
- element :title
18
- element :link, :as => :url
19
-
20
- # If author is not present use managingEditor on the channel
21
- element :"itunes:author", :as => :itunes_author
22
- element :"itunes:block", :as => :itunes_block
23
- element :"itunes:image", :value => :href, :as => :itunes_image
24
- element :"itunes:explicit", :as => :itunes_explicit
25
- element :"itunes:keywords", :as => :itunes_keywords
26
- # New URL for the podcast feed
27
- element :"itunes:new-feed-url", :as => :itunes_new_feed_url
28
- element :"itunes:subtitle", :as => :itunes_subtitle
29
- # If summary is not present, use the description tag
30
- element :"itunes:summary", :as => :itunes_summary
31
-
32
- # iTunes RSS feeds can have multiple main categories...
33
- # ...and multiple sub-categories per category
34
- # TODO subcategories not supported correctly - they are at the same level
35
- # as the main categories
36
- elements :"itunes:category", :as => :itunes_categories, :value => :text
37
-
38
- elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner
39
-
40
- elements :item, :as => :entries, :class => ITunesRSSItem
41
-
42
- def self.able_to_parse?(xml)
43
- /xmlns:itunes\s?=\s?\"http:\/\/www.itunes.com\/dtds\/podcast-1.0.dtd\"/i =~ xml
44
- end
45
-
46
- end
47
-
48
- end
49
-
50
- end