astro-feedzirra 0.0.8.20090419
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +187 -0
- data/Rakefile +53 -0
- data/lib/core_ext/date.rb +21 -0
- data/lib/core_ext/string.rb +9 -0
- data/lib/feedzirra/atom.rb +22 -0
- data/lib/feedzirra/atom_entry.rb +29 -0
- data/lib/feedzirra/atom_feed_burner.rb +22 -0
- data/lib/feedzirra/atom_feed_burner_entry.rb +30 -0
- data/lib/feedzirra/feed.rb +321 -0
- data/lib/feedzirra/feed_entry_utilities.rb +45 -0
- data/lib/feedzirra/feed_utilities.rb +71 -0
- data/lib/feedzirra/itunes_rss.rb +46 -0
- data/lib/feedzirra/itunes_rss_item.rb +28 -0
- data/lib/feedzirra/itunes_rss_owner.rb +8 -0
- data/lib/feedzirra/rss.rb +23 -0
- data/lib/feedzirra/rss_entry.rb +35 -0
- data/lib/feedzirra.rb +36 -0
- data/spec/feedzirra/atom_entry_spec.rb +45 -0
- data/spec/feedzirra/atom_feed_burner_entry_spec.rb +42 -0
- data/spec/feedzirra/atom_feed_burner_spec.rb +39 -0
- data/spec/feedzirra/atom_spec.rb +35 -0
- data/spec/feedzirra/feed_entry_utilities_spec.rb +52 -0
- data/spec/feedzirra/feed_spec.rb +543 -0
- data/spec/feedzirra/feed_utilities_spec.rb +149 -0
- data/spec/feedzirra/itunes_rss_item_spec.rb +48 -0
- data/spec/feedzirra/itunes_rss_owner_spec.rb +18 -0
- data/spec/feedzirra/itunes_rss_spec.rb +50 -0
- data/spec/feedzirra/rss_entry_spec.rb +41 -0
- data/spec/feedzirra/rss_spec.rb +41 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +58 -0
- metadata +142 -0
@@ -0,0 +1,321 @@
|
|
1
|
+
module Feedzirra
  # Raised when no registered parser class is able to handle a feed's XML.
  class NoParserAvailable < StandardError; end

  # Front door for fetching and parsing feeds. All methods are class methods;
  # HTTP is done with curb (Curl::Easy / Curl::Multi) and the XML is handed to
  # whichever registered parser class (see feed_classes) recognizes it.
  class Feed
    USER_AGENT = "feedzirra http://github.com/pauldix/feedzirra/tree/master"

    # Takes a raw XML feed and attempts to parse it. If no parser is available a Feedzirra::NoParserAvailable exception is raised.
    #
    # === Parameters
    # [xml<String>] The XML that you would like parsed.
    # === Returns
    # An instance of the determined feed type. By default a Feedzirra::Atom, Feedzirra::AtomFeedBurner, Feedzirra::ITunesRSS, or Feedzirra::RSS object.
    # === Raises
    # Feedzirra::NoParserAvailable : If no valid parser classes could be found for the feed.
    def self.parse(xml)
      if parser = determine_feed_parser_for_xml(xml)
        parser.parse(xml)
      else
        raise NoParserAvailable.new("No valid parser for XML.")
      end
    end

    # Determines the correct parser class to use for parsing the feed.
    #
    # === Parameters
    # [xml<String>] The XML that you would like determine the parser for.
    # === Returns
    # The parser class that can handle the XML, or nil when none matches.
    def self.determine_feed_parser_for_xml(xml)
      # Detection only needs the prologue/root element, so avoid running
      # every parser's regex over a potentially large document.
      start_of_doc = xml.slice(0, 1000)
      feed_classes.detect {|klass| klass.able_to_parse?(start_of_doc)}
    end

    # Adds a new feed parsing class that will be used for parsing.
    #
    # === Parameters
    # [klass<Constant>] The class/constant that you want to register.
    # === Returns
    # A updated array of feed parser class names.
    def self.add_feed_class(klass)
      # unshift so user-registered classes take priority over the built-ins
      feed_classes.unshift klass
    end

    # Provides a list of registered feed parsing classes.
    #
    # === Returns
    # A array of class names.
    def self.feed_classes
      @feed_classes ||= [ITunesRSS, RSS, AtomFeedBurner, Atom]
    end

    # Makes all entry types look for the passed in element to parse. This is actually just a call to
    # element (a SAXMachine call) in the class.
    #
    # === Parameters
    # [element_tag<String>]
    # [options<Hash>] Valid keys are same as with SAXMachine
    def self.add_common_feed_entry_element(element_tag, options = {})
      # need to think of a better way to do this. will break for people who want this behavior
      # across their added classes
      [RSSEntry, AtomFeedBurnerEntry, AtomEntry].each do |klass|
        klass.send(:element, element_tag, options)
      end
    end

    # Makes all entry types look for the passed in elements to parse. This is actually just a call to
    # elements (a SAXMachine call) in the class.
    #
    # === Parameters
    # [element_tag<String>]
    # [options<Hash>] Valid keys are same as with SAXMachine
    def self.add_common_feed_entry_elements(element_tag, options = {})
      # need to think of a better way to do this. will break for people who want this behavior
      # across their added classes
      [RSSEntry, AtomFeedBurnerEntry, AtomEntry].each do |klass|
        klass.send(:elements, element_tag, options)
      end
    end

    # Fetches and returns the raw XML for each URL provided.
    #
    # === Parameters
    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
    # [options<Hash>] Valid keys for this argument as as followed:
    #                 :user_agent - String that overrides the default user agent.
    #                 :if_modified_since - Time object representing when the feed was last updated.
    #                 :if_none_match - String that's normally an etag for the request that was stored previously.
    #                 :http_authentication - Array of [username, password] for basic auth.
    #
    # NOTE: unlike fetch_and_parse, no :on_success / :on_failure callbacks are
    # invoked here; a failed URL is reported via its numeric response code in
    # the returned value.
    # === Returns
    # A String of XML if a single URL is passed.
    #
    # A Hash if multiple URL's are passed. The key will be the URL, and the value the XML.
    def self.fetch_raw(urls, options = {})
      url_queue = [*urls]
      multi = Curl::Multi.new
      responses = {}
      url_queue.each do |url|
        easy = Curl::Easy.new(url) do |curl|
          curl.headers["User-Agent"]        = (options[:user_agent] || USER_AGENT)
          curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
          curl.headers["If-None-Match"]     = options[:if_none_match] if options.has_key?(:if_none_match)
          curl.headers["Accept-encoding"]   = 'gzip, deflate'
          curl.follow_location = true
          curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)

          curl.on_success do |c|
            responses[url] = decode_content(c)
          end
          curl.on_failure do |c|
            responses[url] = c.response_code
          end
        end
        multi.add(easy)
      end

      multi.perform
      return urls.is_a?(String) ? responses.values.first : responses
    end

    # Fetches and returns the parsed XML for each URL provided.
    #
    # === Parameters
    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
    # [options<Hash>] Valid keys for this argument as as followed:
    #                 * :user_agent - String that overrides the default user agent.
    #                 * :if_modified_since - Time object representing when the feed was last updated.
    #                 * :if_none_match - String, an etag for the request that was stored previously.
    #                 * :on_success - Block that gets executed after a successful request.
    #                 * :on_failure - Block that gets executed after a failed request.
    # === Returns
    # A Feed object if a single URL is passed.
    #
    # A Hash if multiple URL's are passed. The key will be the URL, and the value the Feed object.
    def self.fetch_and_parse(urls, options = {})
      url_queue = [*urls]
      multi = Curl::Multi.new
      responses = {}

      # I broke these down so I would only try to do 30 simultaneously because
      # I was getting weird errors when doing a lot. As one finishes it pops another off the queue.
      url_queue.slice!(0, 30).each do |url|
        add_url_to_multi(multi, url, url_queue, responses, options)
      end

      multi.perform
      return urls.is_a?(String) ? responses.values.first : responses
    end

    # Decodes the XML document if it was compressed.
    #
    # === Parameters
    # [c<Curl::Easy>] The Curl::Easy response object from the request.
    # === Returns
    # A decoded string of XML.
    def self.decode_content(c)
      if c.header_str.match(/Content-Encoding: gzip/)
        begin
          gz = Zlib::GzipReader.new(StringIO.new(c.body_str))
          xml = gz.read
          gz.close
        rescue Zlib::GzipFile::Error
          # Maybe this is not gzipped?
          xml = c.body_str
        end
      elsif c.header_str.match(/Content-Encoding: deflate/)
        xml = Zlib::Inflate.inflate(c.body_str)
      else
        xml = c.body_str
      end

      xml
    end

    # Updates each feed for each Feed object provided.
    #
    # === Parameters
    # [feeds<Feed> or <Array>] A single feed object, or an array of feed objects.
    # [options<Hash>] Valid keys for this argument as as followed:
    #                 * :user_agent - String that overrides the default user agent.
    #                 * :on_success - Block that gets executed after a successful request.
    #                 * :on_failure - Block that gets executed after a failed request.
    # === Returns
    # A updated Feed object if a single URL is passed.
    #
    # A Hash if multiple Feeds are passed. The key will be the URL, and the value the updated Feed object.
    def self.update(feeds, options = {})
      feed_queue = [*feeds]
      multi = Curl::Multi.new
      responses = {}

      # Same 30-at-a-time throttling as fetch_and_parse; completed requests
      # pull the next feed off the queue in their callbacks.
      feed_queue.slice!(0, 30).each do |feed|
        add_feed_to_multi(multi, feed, feed_queue, responses, options)
      end

      multi.perform
      return responses.size == 1 ? responses.values.first : responses.values
    end

    # An abstraction for adding a feed by URL to the passed Curb::multi stack.
    #
    # === Parameters
    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added too.
    # [url<String>] The URL of the feed that you would like to be fetched.
    # [url_queue<Array>] An array of URLs that are queued for request.
    # [responses<Hash>] Existing responses that you want the response from the request added to.
    # [options<Hash>] Valid keys for this argument as as followed:
    #                 * :user_agent - String that overrides the default user agent.
    #                 * :on_success - Block that gets executed after a successful request.
    #                 * :on_failure - Block that gets executed after a failed request.
    # === Returns
    # The updated Curl::Multi object with the request details added to it's stack.
    def self.add_url_to_multi(multi, url, url_queue, responses, options)
      easy = Curl::Easy.new(url) do |curl|
        curl.headers["User-Agent"]        = (options[:user_agent] || USER_AGENT)
        curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
        curl.headers["If-None-Match"]     = options[:if_none_match] if options.has_key?(:if_none_match)
        curl.headers["Accept-encoding"]   = 'gzip, deflate'
        curl.follow_location = true
        curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)

        curl.on_success do |c|
          # keep the pipeline full: start the next queued URL before parsing
          add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
          xml = decode_content(c)
          klass = determine_feed_parser_for_xml(xml)

          if klass
            feed = klass.parse(xml)
            feed.feed_url = c.last_effective_url
            feed.etag = etag_from_header(c.header_str)
            feed.last_modified = last_modified_from_header(c.header_str)
            responses[url] = feed
            options[:on_success].call(url, feed) if options.has_key?(:on_success)
          else
            # puts "Error determining parser for #{url} - #{c.last_effective_url}"
            # raise NoParserAvailable.new("no valid parser for content.") (this would unfortunately fail the whole 'multi', so it's not really useable)
            options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
          end
        end

        curl.on_failure do |c|
          add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
          responses[url] = c.response_code
          options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
        end
      end
      multi.add(easy)
    end

    # An abstraction for adding a feed by a Feed object to the passed Curb::multi stack.
    #
    # === Parameters
    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added too.
    # [feed<Feed>] A feed object that you would like to be fetched.
    # [feed_queue<Array>] An array of feed objects that are queued for request.
    # [responses<Hash>] Existing responses that you want the response from the request added to.
    # [options<Hash>] Valid keys for this argument as as followed:
    #                 * :user_agent - String that overrides the default user agent.
    #                 * :on_success - Block that gets executed after a successful request.
    #                 * :on_failure - Block that gets executed after a failed request.
    # === Returns
    # The updated Curl::Multi object with the request details added to it's stack.
    def self.add_feed_to_multi(multi, feed, feed_queue, responses, options)
      easy = Curl::Easy.new(feed.feed_url) do |curl|
        curl.headers["User-Agent"]        = (options[:user_agent] || USER_AGENT)
        # Conditional-GET headers come from the feed's previous fetch.
        curl.headers["If-Modified-Since"] = feed.last_modified.httpdate if feed.last_modified
        curl.headers["If-None-Match"]     = feed.etag if feed.etag
        curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
        curl.follow_location = true

        curl.on_success do |c|
          add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
          updated_feed = Feed.parse(c.body_str)
          updated_feed.feed_url = c.last_effective_url
          updated_feed.etag = etag_from_header(c.header_str)
          updated_feed.last_modified = last_modified_from_header(c.header_str)
          feed.update_from_feed(updated_feed)
          responses[feed.feed_url] = feed
          options[:on_success].call(feed) if options.has_key?(:on_success)
        end

        curl.on_failure do |c|
          add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
          response_code = c.response_code
          if response_code == 304 # it's not modified. this isn't an error condition
            responses[feed.feed_url] = feed
            options[:on_success].call(feed) if options.has_key?(:on_success)
          else
            # BUGFIX: key by feed_url (the fetched URL), not feed.url (the
            # site link) — keeps failure keys consistent with every other path.
            responses[feed.feed_url] = c.response_code
            options[:on_failure].call(feed, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
          end
        end
      end
      multi.add(easy)
    end

    # Determines the etag from the request headers.
    #
    # === Parameters
    # [header<String>] Raw request header returned from the request
    # === Returns
    # A string of the etag or nil if it cannot be found in the headers.
    def self.etag_from_header(header)
      header =~ /.*ETag:\s(.*)\r/
      $1
    end

    # Determines the last modified date from the request headers.
    #
    # === Parameters
    # [header<String>] Raw request header returned from the request
    # === Returns
    # A Time object of the last modified date or nil if it cannot be found in the headers.
    def self.last_modified_from_header(header)
      header =~ /.*Last-Modified:\s(.*)\r/
      Time.parse($1) if $1
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Feedzirra
  # Common behaviour mixed into the entry parser classes (RSSEntry,
  # AtomEntry, AtomFeedBurnerEntry): date normalization, id fallback and
  # sanitization of the html-bearing fields.
  module FeedEntryUtilities
    # Returns the published date, falling back to the updated date when the
    # feed did not provide an explicit publish time.
    def published
      @published || @updated
    end

    # Parses +string+ into a GMT time via DateTime.
    # Returns nil (and prints a warning) when the string cannot be parsed.
    def parse_datetime(string)
      begin
        DateTime.parse(string).feed_utils_to_gm_time
      rescue
        puts "DATE CAN'T BE PARSED: #{string}"
        nil
      end
    end

    ##
    # Returns the id of the entry or its url if no id is present, as some formats don't support it.
    def id
      @id || @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time found.
    def published=(val)
      parsed = parse_datetime(val)
      # Guard against unparseable dates: parse_datetime returns nil and
      # comparing nil with < would raise NoMethodError.
      @published = parsed if parsed && (!@published || parsed < @published)
    end

    ##
    # Writer for updated. By default, we keep the most recent update time found.
    def updated=(val)
      parsed = parse_datetime(val)
      @updated = parsed if parsed && (!@updated || parsed > @updated)
    end

    # Sanitizes, in place, whichever of the html-bearing fields are present.
    def sanitize!
      self.title.sanitize! if self.title
      self.author.sanitize! if self.author
      self.summary.sanitize! if self.summary
      self.content.sanitize! if self.content
    end

    alias_method :last_modified, :published
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Feedzirra
  # Common behaviour mixed into the feed parser classes (RSS, Atom, etc.):
  # merging freshly fetched feeds into existing ones, tracking new entries
  # and the derived last-modified time.
  module FeedUtilities
    # Attributes copied over by update_from_feed when they change.
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified)

    attr_writer :new_entries, :updated, :last_modified
    attr_accessor :etag

    # Last-modified time: the explicitly set value (e.g. from HTTP headers)
    # or, failing that, the published time of the newest entry. nil when
    # neither is available.
    def last_modified
      @last_modified ||= begin
        entry = entries.reject {|e| e.published.nil? }.sort_by { |entry| entry.published if entry.published }.last
        entry ? entry.published : nil
      end
    end

    # True once update_from_feed has detected at least one changed attribute.
    def updated?
      @updated
    end

    def new_entries
      @new_entries ||= []
    end

    def has_new_entries?
      new_entries.size > 0
    end

    # Merges a freshly fetched +feed+ into self: prepends entries not seen
    # before and copies over any UPDATABLE_ATTRIBUTES that changed, marking
    # the feed updated when they did.
    def update_from_feed(feed)
      self.new_entries += find_new_entries_for(feed)
      self.entries.unshift(*self.new_entries)

      updated! if UPDATABLE_ATTRIBUTES.any? { |name| update_attribute(feed, name) }
    end

    # Copies feed.name onto self when it differs; returns truthy iff changed.
    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)

      if old_value != new_value
        send("#{name}=", new_value)
      end
    end

    def sanitize_entries!
      entries.each {|entry| entry.sanitize!}
    end

    private

    def updated!
      @updated = true
    end

    def find_new_entries_for(feed)
      # this implementation is a hack, which is why it's so ugly.
      # it's to get around the fact that not all feeds have a published date.
      # however, they're always ordered with the newest one first.
      # So we go through the entries just parsed and insert each one as a new entry
      # until we get to one that has the same url as the the newest for the feed
      latest_entry = self.entries.first
      # A feed with no existing entries has nothing to compare against, so
      # everything fetched is new (also avoids calling .url on nil below).
      return feed.entries if latest_entry.nil?
      found_new_entries = []
      feed.entries.each do |entry|
        break if entry.url == latest_entry.url
        found_new_entries << entry
      end
      found_new_entries
    end

    def existing_entry?(test_entry)
      entries.any? { |entry| entry.url == test_entry.url }
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Feedzirra
  # iTunes is RSS 2.0 + some apple extensions
  # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
  class ITunesRSS
    include SAXMachine
    include FeedUtilities

    # Set after fetching; not part of the parsed XML.
    attr_accessor :feed_url

    # RSS 2.0 elements that need including
    element :copyright
    element :description
    element :language
    element :managingEditor
    element :title
    element :link, :as => :url

    # If author is not present use managingEditor on the channel
    element :"itunes:author", :as => :itunes_author
    element :"itunes:block", :as => :itunes_block
    element :"itunes:image", :value => :href, :as => :itunes_image
    element :"itunes:explicit", :as => :itunes_explicit
    element :"itunes:keywords", :as => :itunes_keywords
    # New URL for the podcast feed
    element :"itunes:new-feed-url", :as => :itunes_new_feed_url
    element :"itunes:subtitle", :as => :itunes_subtitle
    # If summary is not present, use the description tag
    element :"itunes:summary", :as => :itunes_summary

    # iTunes RSS feeds can have multiple main categories...
    # ...and multiple sub-categories per category
    # TODO subcategories not supported correctly - they are at the same level
    # as the main categories
    elements :"itunes:category", :as => :itunes_categories, :value => :text

    elements :"itunes:owner", :as => :itunes_owners, :class => ITunesRSSOwner

    elements :item, :as => :entries, :class => ITunesRSSItem

    # Matches when the document declares the iTunes podcast namespace.
    # Dots are escaped (the old pattern let them match any character) and
    # whitespace is allowed around '=', which XML permits.
    def self.able_to_parse?(xml)
      xml =~ /xmlns:itunes\s*=\s*"http:\/\/www\.itunes\.com\/dtds\/podcast-1\.0\.dtd"/
    end

  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Feedzirra
  # iTunes extensions to the standard RSS2.0 item
  # Source: http://www.apple.com/itunes/whatson/podcasts/specs.html
  class ITunesRSSItem
    include SAXMachine
    # NOTE(review): this entry class includes FeedUtilities while RSSEntry
    # includes FeedEntryUtilities, so pubDate mapped to :published below is
    # stored as a plain string rather than parsed through published= —
    # confirm whether this inconsistency is intentional.
    include FeedUtilities
    element :author
    element :guid
    element :title
    element :link, :as => :url
    element :description, :as => :summary
    element :pubDate, :as => :published

    # If author is not present use author tag on the item
    element :"itunes:author", :as => :itunes_author
    element :"itunes:block", :as => :itunes_block
    element :"itunes:duration", :as => :itunes_duration
    element :"itunes:explicit", :as => :itunes_explicit
    element :"itunes:keywords", :as => :itunes_keywords
    element :"itunes:subtitle", :as => :itunes_subtitle
    # If summary is not present, use the description tag
    element :"itunes:summary", :as => :itunes_summary
    # The enclosure element is read three times, once per attribute of interest.
    element :enclosure, :value => :length, :as => :enclosure_length
    element :enclosure, :value => :type, :as => :enclosure_type
    element :enclosure, :value => :url, :as => :enclosure_url
  end

end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Feedzirra
  # == Summary
  # Parser for dealing with RSS feeds.
  #
  # == Attributes
  # * title
  # * feed_url
  # * url
  # * entries
  class RSS
    include SAXMachine
    include FeedUtilities
    element :title
    element :link, :as => :url
    elements :item, :as => :entries, :class => RSSEntry

    # Set after fetching; not part of the parsed XML.
    attr_accessor :feed_url

    # Matches documents whose root tag starts with <rss or <rdf.
    # The alternation is grouped: the old /\<rss|rdf/ matched a bare "rdf"
    # substring anywhere in the prologue, not just an <rdf... tag.
    def self.able_to_parse?(xml) #:nodoc:
      xml =~ /\<(rss|rdf)/
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Feedzirra
  # == Summary
  # Parser for dealing with RDF feed entries.
  #
  # == Attributes
  # * title
  # * url
  # * author
  # * content
  # * summary
  # * published
  # * categories
  class RSSEntry
    include SAXMachine
    include FeedEntryUtilities
    element :title
    element :link, :as => :url

    element :"dc:creator", :as => :author
    element :"content:encoded", :as => :content
    element :description, :as => :summary

    # Several tag spellings all feed into :published; the published= writer
    # from FeedEntryUtilities keeps the oldest value seen.
    element :pubDate, :as => :published
    element :"dc:date", :as => :published
    # Capitalized variant kept deliberately: XML element names are
    # case-sensitive, and some feeds emit dc:Date.
    element :"dc:Date", :as => :published
    element :"dcterms:created", :as => :published


    element :"dcterms:modified", :as => :updated
    element :issued, :as => :published
    elements :category, :as => :categories

    element :guid, :as => :id
  end
end
|
data/lib/feedzirra.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# Make lib/ itself requireable so the core_ext/ and feedzirra/ requires below resolve.
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))

gem 'activesupport'

# Standard-library and gem dependencies.
require 'zlib'
require 'curb'
require 'sax-machine'
require 'dryopteris'
require 'uri'
require 'active_support/basic_object'
require 'active_support/core_ext/object'
require 'active_support/core_ext/time'

# Core-class extensions used by the parsers.
require 'core_ext/date'
require 'core_ext/string'

# Shared mixins must load before the classes that include them.
require 'feedzirra/feed_utilities'
require 'feedzirra/feed_entry_utilities'
require 'feedzirra/feed'

require 'feedzirra/push_parser'

# Entry classes load before the feed classes, which reference them in their
# SAX `elements ... :class =>` declarations.
require 'feedzirra/rss_entry'
require 'feedzirra/itunes_rss_owner'
require 'feedzirra/itunes_rss_item'
require 'feedzirra/atom_entry'
require 'feedzirra/atom_feed_burner_entry'

require 'feedzirra/rss'
require 'feedzirra/itunes_rss'
require 'feedzirra/atom'
require 'feedzirra/atom_feed_burner'

module Feedzirra
  VERSION = "0.0.8"
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../spec_helper'

# Unit specs for Feedzirra::AtomEntry, exercised through Feedzirra::Atom.parse
# on the sample_atom_feed fixture provided by spec_helper.
describe Feedzirra::AtomEntry do
  before(:each) do
    # I don't really like doing it this way because these unit tests should only rely on AtomEntry,
    # but this is actually how it should work. You would never just pass entry xml straight to the AtomEntry.
    @entry = Feedzirra::Atom.parse(sample_atom_feed).entries.first
  end

  it "should parse the title" do
    @entry.title.should == "AWS Job: Architect & Designer Position in Turkey"
  end

  it "should parse the url" do
    @entry.url.should == "http://aws.typepad.com/aws/2009/01/aws-job-architect-designer-position-in-turkey.html"
  end

  it "should parse the author" do
    @entry.author.should == "AWS Editor"
  end

  it "should parse the content" do
    @entry.content.should == sample_atom_entry_content
  end

  it "should provide a summary" do
    @entry.summary.should == "Late last year an entrepreneur from Turkey visited me at Amazon HQ in Seattle. We talked about his plans to use AWS as part of his new social video portal startup. I won't spill any beans before he's ready to..."
  end

  it "should parse the published date" do
    @entry.published.to_s.should == "Fri Jan 16 18:21:00 UTC 2009"
  end

  it "should parse the categories" do
    @entry.categories.should == ['Turkey', 'Seattle']
  end

  it "should parse the updated date" do
    @entry.updated.to_s.should == "Fri Jan 16 18:21:00 UTC 2009"
  end

  it "should parse the id" do
    @entry.id.should == "tag:typepad.com,2003:post-61484736"
  end
end
|