Spectives-feedzirra 0.0.28
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +169 -0
- data/README.textile +205 -0
- data/Rakefile +56 -0
- data/lib/core_ext/date.rb +21 -0
- data/lib/core_ext/string.rb +9 -0
- data/lib/feedzirra/feed.rb +334 -0
- data/lib/feedzirra/feed_entry_utilities.rb +45 -0
- data/lib/feedzirra/feed_utilities.rb +71 -0
- data/lib/feedzirra/parser/atom.rb +35 -0
- data/lib/feedzirra/parser/atom_entry.rb +41 -0
- data/lib/feedzirra/parser/itunes_category.rb +12 -0
- data/lib/feedzirra/parser/mrss_category.rb +11 -0
- data/lib/feedzirra/parser/mrss_content.rb +48 -0
- data/lib/feedzirra/parser/mrss_copyright.rb +10 -0
- data/lib/feedzirra/parser/mrss_credit.rb +11 -0
- data/lib/feedzirra/parser/mrss_group.rb +37 -0
- data/lib/feedzirra/parser/mrss_hash.rb +10 -0
- data/lib/feedzirra/parser/mrss_player.rb +11 -0
- data/lib/feedzirra/parser/mrss_rating.rb +10 -0
- data/lib/feedzirra/parser/mrss_restriction.rb +11 -0
- data/lib/feedzirra/parser/mrss_text.rb +13 -0
- data/lib/feedzirra/parser/mrss_thumbnail.rb +11 -0
- data/lib/feedzirra/parser/rss.rb +83 -0
- data/lib/feedzirra/parser/rss_entry.rb +83 -0
- data/lib/feedzirra/parser/rss_image.rb +15 -0
- data/lib/feedzirra.rb +44 -0
- data/spec/benchmarks/feed_benchmarks.rb +98 -0
- data/spec/benchmarks/feedzirra_benchmarks.rb +40 -0
- data/spec/benchmarks/fetching_benchmarks.rb +28 -0
- data/spec/benchmarks/parsing_benchmark.rb +30 -0
- data/spec/benchmarks/updating_benchmarks.rb +33 -0
- data/spec/feedzirra/feed_entry_utilities_spec.rb +52 -0
- data/spec/feedzirra/feed_spec.rb +546 -0
- data/spec/feedzirra/feed_utilities_spec.rb +149 -0
- data/spec/feedzirra/parser/atom_entry_spec.rb +49 -0
- data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +42 -0
- data/spec/feedzirra/parser/atom_feed_burner_spec.rb +39 -0
- data/spec/feedzirra/parser/atom_spec.rb +43 -0
- data/spec/feedzirra/parser/mrss_content_spec.rb +32 -0
- data/spec/feedzirra/parser/rss_entry_spec.rb +154 -0
- data/spec/feedzirra/parser/rss_spec.rb +93 -0
- data/spec/sample_feeds/run_against_sample.rb +20 -0
- data/spec/spec_helper.rb +62 -0
- metadata +154 -0
@@ -0,0 +1,334 @@
|
|
1
|
+
module Feedzirra
|
2
|
+
class NoParserAvailable < StandardError; end
|
3
|
+
|
4
|
+
class Feed
|
5
|
+
USER_AGENT = "feedzirra http://github.com/pauldix/feedzirra/tree/master"
|
6
|
+
|
7
|
+
# Takes a raw XML feed and attempts to parse it. If no parser is available a
# Feedzirra::NoParserAvailable exception is raised.
#
# === Parameters
# [xml<String>] The XML that you would like parsed.
# === Returns
# An instance of the determined feed type. By default a Feedzirra::Atom, Feedzirra::AtomFeedBurner, Feedzirra::RDF, or Feedzirra::RSS object.
# === Raises
# Feedzirra::NoParserAvailable : If no valid parser classes could be found for the feed.
def self.parse(xml)
  parser = determine_feed_parser_for_xml(xml)
  # Guard clause: bail out early when no registered parser recognizes the XML.
  raise NoParserAvailable.new("No valid parser for XML.") unless parser
  parser.parse(xml)
end
|
22
|
+
|
23
|
+
# Determines the correct parser class to use for parsing the feed.
#
# === Parameters
# [xml<String>] The XML that you would like determine the parser for.
# === Returns
# The class name of the parser that can handle the XML, or nil when none matches.
def self.determine_feed_parser_for_xml(xml)
  # Only the document head is inspected; 2000 bytes is enough to see the root element.
  snippet = xml.slice(0, 2000)
  feed_classes.find { |parser_class| parser_class.able_to_parse?(snippet) }
end
|
33
|
+
|
34
|
+
# Adds a new feed parsing class that will be used for parsing.
# The class is placed at the front of the list so it takes priority
# over the built-in parsers.
#
# === Parameters
# [klass<Constant>] The class/constant that you want to register.
# === Returns
# An updated array of feed parser class names.
def self.add_feed_class(klass)
  feed_classes.insert(0, klass)
end
|
43
|
+
|
44
|
+
# Provides a list of registered feed parsing classes, memoized on first use.
# Order matters: classes earlier in the list win when several could parse a feed.
#
# === Returns
# An array of class names.
def self.feed_classes
  @feed_classes ||= [Feedzirra::Parser::RSS, Feedzirra::Parser::AtomFeedBurner, Feedzirra::Parser::Atom]
end
|
55
|
+
|
56
|
+
# Makes all entry types look for the passed in element to parse. This is actually just a call to
# element (a SAXMachine call) in the class.
#
# NOTE: this only affects the entry classes paired with the feed classes that
# are registered at call time; parsers added afterwards are not touched.
#
# === Parameters
# [element_tag<String>] The tag name to register on every entry class.
# [options<Hash>] Valid keys are same as with SAXMachine.
def self.add_common_feed_entry_element(element_tag, options = {})
  feed_classes.each do |klass|
    # Resolve "<FeedClass>Entry" (e.g. Feedzirra::Parser::RSSEntry) by constant
    # lookup instead of eval — const_get is safer and does not spin up the parser.
    entry_klass = Object.const_get("#{klass}Entry")
    entry_klass.send(:element, element_tag, options)
  end
end
|
69
|
+
|
70
|
+
# Fetches and returns the raw XML for each URL provided.
#
# === Parameters
# [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
# [options<Hash>] Valid keys for this argument are as follows:
#   :user_agent - String that overrides the default user agent.
#   :if_modified_since - Time object representing when the feed was last updated.
#   :if_none_match - String that's normally an etag for the request that was stored previously.
#   :compress - if present, asks the server for gzip/deflate encoded bodies.
#   :http_authentication - two-element array joined into "user:password".
#   :max_redirects / :timeout - passed straight through to Curl::Easy.
# === Returns
# A String of XML if a single URL is passed. On failure the value is the
# numeric HTTP response code instead of a String.
#
# A Hash if multiple URLs are passed. The key will be the URL, and the value the XML.
def self.fetch_raw(urls, options = {})
  url_queue = [*urls]        # normalizes a single URL into a one-element array
  multi = Curl::Multi.new    # all requests run concurrently on one multi handle
  responses = {}
  url_queue.each do |url|
    easy = Curl::Easy.new(url) do |curl|
      curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
      # Conditional-GET headers, only sent when the caller supplies cache state.
      curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
      curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
      curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
      curl.follow_location = true
      curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)

      curl.max_redirects = options[:max_redirects] if options[:max_redirects]
      curl.timeout = options[:timeout] if options[:timeout]

      curl.on_success do |c|
        # NOTE(review): some curb versions appear to yield an Array of handles
        # here; this picks out the Curl::Easy one — confirm against the curb
        # version this gem pins.
        c = c.select { |e| e.kind_of? Curl::Easy }.first if(c.kind_of? Array)
        responses[url] = decode_content(c)
      end
      curl.on_failure do |c|
        c = c.select { |e| e.kind_of? Curl::Easy }.first if(c.kind_of? Array)
        # On failure the hash value is the HTTP status code, not XML.
        responses[url] = c.response_code
      end
    end
    multi.add(easy)
  end

  multi.perform   # blocks until every queued request has completed
  urls.is_a?(String) ? responses.values.first : responses
end
|
115
|
+
|
116
|
+
# Fetches and returns the parsed XML for each URL provided.
#
# === Parameters
# [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
# [options<Hash>] Valid keys for this argument are as follows:
#   * :user_agent - String that overrides the default user agent.
#   * :if_modified_since - Time object representing when the feed was last updated.
#   * :if_none_match - String, an etag for the request that was stored previously.
#   * :on_success - Block that gets executed after a successful request.
#   * :on_failure - Block that gets executed after a failed request.
# === Returns
# A Feed object if a single URL is passed.
#
# A Hash if multiple URLs are passed. The key will be the URL, and the value the Feed object.
def self.fetch_and_parse(urls, options = {})
  url_queue = [*urls]
  multi = Curl::Multi.new
  responses = {}

  # Only 30 requests are in flight at once to avoid errors seen with larger
  # batches; as each one finishes its callback pops another URL off the queue
  # (see add_url_to_multi).
  url_queue.slice!(0, 30).each do |url|
    add_url_to_multi(multi, url, url_queue, responses, options)
  end

  multi.perform
  return urls.is_a?(String) ? responses.values.first : responses
end
|
144
|
+
|
145
|
+
# Decodes the XML document if it was compressed.
#
# === Parameters
# [c<Curl::Easy>] The Curl::Easy response object from the request (any object
#                 responding to +header_str+ and +body_str+ works).
# === Returns
# A decoded string of XML.
def self.decode_content(c)
  # HTTP header names/values are case-insensitive, so match accordingly
  # (previously "content-encoding: GZIP" style headers were missed).
  if c.header_str.match(/Content-Encoding: gzip/i)
    begin
      gz = Zlib::GzipReader.new(StringIO.new(c.body_str))
      xml = gz.read
      gz.close
    rescue Zlib::GzipFile::Error
      # The header lied — body is not actually gzipped; use it verbatim.
      xml = c.body_str
    end
  elsif c.header_str.match(/Content-Encoding: deflate/i)
    xml = Zlib::Inflate.inflate(c.body_str)
  else
    xml = c.body_str
  end

  xml
end
|
169
|
+
|
170
|
+
# Updates each feed for each Feed object provided.
#
# === Parameters
# [feeds<Feed> or <Array>] A single feed object, or an array of feed objects.
# [options<Hash>] Valid keys for this argument are as follows:
#   * :user_agent - String that overrides the default user agent.
#   * :on_success - Block that gets executed after a successful request.
#   * :on_failure - Block that gets executed after a failed request.
# === Returns
# An updated Feed object if a single feed is passed.
#
# An Array of updated Feed objects if multiple feeds are passed.
# NOTE(review): on failure the corresponding value is the HTTP response code,
# not a Feed — see add_feed_to_multi.
def self.update(feeds, options = {})
  feed_queue = [*feeds]
  multi = Curl::Multi.new
  responses = {}

  # Cap concurrency at 30; completed requests pull the next feed off the
  # queue inside add_feed_to_multi's callbacks.
  feed_queue.slice!(0, 30).each do |feed|
    add_feed_to_multi(multi, feed, feed_queue, responses, options)
  end

  multi.perform
  responses.size == 1 ? responses.values.first : responses.values
end
|
194
|
+
|
195
|
+
# An abstraction for adding a feed by URL to the passed Curb::multi stack.
#
# === Parameters
# [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
# [url<String>] The URL of the feed that you would like to be fetched.
# [url_queue<Array>] An array of URLs that are queued for request.
# [responses<Hash>] Existing responses that you want the response from the request added to.
# [options<Hash>] Valid keys for this argument are as follows:
#   * :user_agent - String that overrides the default user agent.
#   * :on_success - Block that gets executed after a successful request.
#   * :on_failure - Block that gets executed after a failed request.
# === Returns
# The updated Curl::Multi object with the request details added to its stack.
def self.add_url_to_multi(multi, url, url_queue, responses, options)
  easy = Curl::Easy.new(url) do |curl|
    curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
    # Conditional-GET headers, only sent when the caller supplies cache state.
    curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
    curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
    curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
    curl.follow_location = true
    curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)

    curl.max_redirects = options[:max_redirects] if options[:max_redirects]
    curl.timeout = options[:timeout] if options[:timeout]

    curl.on_success do |c|
      c = c.select { |e| e.kind_of? Curl::Easy }.first if(c.kind_of? Array)
      # Keep the pipeline full: queue the next URL before parsing this one.
      add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
      xml = decode_content(c)
      klass = determine_feed_parser_for_xml(xml)

      if klass
        begin
          feed = klass.parse(xml)
          feed.feed_url = c.last_effective_url
          feed.etag = etag_from_header(c.header_str)
          feed.last_modified = last_modified_from_header(c.header_str)
          responses[url] = feed
          options[:on_success].call(url, feed) if options.has_key?(:on_success)
        rescue StandardError => e
          # Was `rescue Exception`, which also swallowed SystemExit and
          # SignalException; StandardError covers parse failures.
          options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
        end
      else
        # Raising NoParserAvailable here would unfortunately abort the whole
        # 'multi', so an unparseable feed is reported through :on_failure instead.
        options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
      end
    end

    curl.on_failure do |c|
      c = c.select { |e| e.kind_of? Curl::Easy }.first if(c.kind_of? Array)
      add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
      # On failure the hash value is the HTTP status code, not a Feed.
      responses[url] = c.response_code
      options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
    end
  end
  multi.add(easy)
end
|
254
|
+
|
255
|
+
# An abstraction for adding a feed by a Feed object to the passed Curb::multi stack.
#
# === Parameters
# [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
# [feed<Feed>] A feed object that you would like to be fetched.
# [feed_queue<Array>] An array of feed objects that are queued for request.
# [responses<Hash>] Existing responses that you want the response from the request added to.
# [options<Hash>] Valid keys for this argument are as follows:
#   * :user_agent - String that overrides the default user agent.
#   * :on_success - Block that gets executed after a successful request.
#   * :on_failure - Block that gets executed after a failed request.
# === Returns
# The updated Curl::Multi object with the request details added to its stack.
def self.add_feed_to_multi(multi, feed, feed_queue, responses, options)
  easy = Curl::Easy.new(feed.feed_url) do |curl|
    curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
    # Conditional GET driven by the state stored on the feed itself.
    curl.headers["If-Modified-Since"] = feed.last_modified.httpdate if feed.last_modified
    curl.headers["If-None-Match"] = feed.etag if feed.etag
    curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
    curl.follow_location = true

    curl.max_redirects = options[:max_redirects] if options[:max_redirects]
    curl.timeout = options[:timeout] if options[:timeout]

    curl.on_success do |c|
      c = c.select { |e| e.kind_of? Curl::Easy }.first if(c.kind_of? Array)
      begin
        # Keep the pipeline full: queue the next feed before parsing this one.
        add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
        updated_feed = Feed.parse(c.body_str)
        updated_feed.feed_url = c.last_effective_url
        updated_feed.etag = etag_from_header(c.header_str)
        updated_feed.last_modified = last_modified_from_header(c.header_str)
        feed.update_from_feed(updated_feed)
        responses[feed.feed_url] = feed
        options[:on_success].call(feed) if options.has_key?(:on_success)
      rescue StandardError => e
        # Was `rescue Exception`, which also swallowed SystemExit and
        # SignalException; StandardError covers parse failures.
        options[:on_failure].call(feed, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
      end
    end

    curl.on_failure do |c|
      c = c.select { |e| e.kind_of? Curl::Easy }.first if(c.kind_of? Array)
      add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
      response_code = c.response_code
      if response_code == 304 # it's not modified. this isn't an error condition
        responses[feed.feed_url] = feed
        options[:on_success].call(feed) if options.has_key?(:on_success)
      else
        # Fix: this was keyed by feed.url (the site link), while the success
        # and 304 branches key by feed.feed_url — making failed feeds
        # impossible to look up consistently.
        responses[feed.feed_url] = c.response_code
        options[:on_failure].call(feed, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
      end
    end
  end
  multi.add(easy)
end
|
311
|
+
|
312
|
+
# Determines the etag from the request headers.
#
# === Parameters
# [header<String>] Raw request header returned from the request
# === Returns
# A string of the etag (quotes included, exactly as sent) or nil if it
# cannot be found in the headers.
def self.etag_from_header(header)
  match_data = /.*ETag:\s(.*)\r/.match(header)
  match_data && match_data[1]
end
|
322
|
+
|
323
|
+
# Determines the last modified date from the request headers.
#
# === Parameters
# [header<String>] Raw request header returned from the request
# === Returns
# A Time object of the last modified date or nil if it cannot be found in the headers.
def self.last_modified_from_header(header)
  match_data = /.*Last-Modified:\s(.*)\r/.match(header)
  Time.parse(match_data[1]) if match_data
end
|
333
|
+
end
|
334
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Feedzirra
  # Mixin included by the per-format entry parser classes. Provides shared
  # date handling, an id fallback, and sanitization helpers.
  module FeedEntryUtilities
    # Falls back to the update time when no publish time was supplied.
    def published
      @published || @updated
    end

    # Parses +string+ into a GMT time; returns nil (and logs to stdout)
    # when the value cannot be parsed.
    # NOTE: relies on DateTime#feed_utils_to_gm_time from core_ext/date.rb.
    def parse_datetime(string)
      DateTime.parse(string).feed_utils_to_gm_time
    rescue
      puts "DATE CAN'T BE PARSED: #{string}"
      nil
    end

    ##
    # Returns the id of the entry, falling back to its url since some
    # formats don't supply an id.
    def id
      @id || @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time found.
    def published=(val)
      parsed = parse_datetime(val)
      @published = parsed if !@published || parsed < @published
    end

    ##
    # Writer for updated. By default, we keep the most recent update time found.
    def updated=(val)
      parsed = parse_datetime(val)
      @updated = parsed if !@updated || parsed > @updated
    end

    # Sanitizes each text field in place when it is present.
    # NOTE: relies on String#sanitize! from core_ext/string.rb.
    def sanitize!
      %w(title author summary content).each do |field|
        value = send(field)
        value.sanitize! if value
      end
    end

    alias_method :last_modified, :published
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module Feedzirra
  # Mixin included by the feed parser classes. Tracks updates between
  # successive fetches of the same feed: which entries are new and which
  # top-level attributes changed.
  module FeedUtilities
    # Attributes compared by update_from_feed to decide whether the feed changed.
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified)

    attr_writer :new_entries, :updated, :last_modified
    attr_accessor :etag

    # Most recent publish time among the entries (memoized), or nil when
    # no entry carries a publish time.
    def last_modified
      @last_modified ||= begin
        # After reject, every remaining entry has a non-nil published time.
        entry = entries.reject { |e| e.published.nil? }.sort_by { |e| e.published }.last
        entry ? entry.published : nil
      end
    end

    # True once update_from_feed detected any attribute change.
    def updated?
      @updated
    end

    # Entries discovered by the most recent update_from_feed call.
    def new_entries
      @new_entries ||= []
    end

    def has_new_entries?
      new_entries.size > 0
    end

    # Merges +feed+ (a freshly fetched parse of the same feed) into self:
    # prepends any new entries and refreshes the updatable attributes.
    def update_from_feed(feed)
      self.new_entries += find_new_entries_for(feed)
      self.entries.unshift(*self.new_entries)

      updated! if UPDATABLE_ATTRIBUTES.any? { |name| update_attribute(feed, name) }
    end

    # Copies +name+ from +feed+ when it differs; returns truthy iff changed.
    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)

      if old_value != new_value
        send("#{name}=", new_value)
      end
    end

    def sanitize_entries!
      entries.each { |entry| entry.sanitize! }
    end

    private

    def updated!
      @updated = true
    end

    def find_new_entries_for(feed)
      # Not all feeds have published dates, but entries are always ordered
      # newest-first, so collect incoming entries until we reach the URL of
      # our current newest entry.
      latest_entry = self.entries.first
      # Fix: a feed with no existing entries previously crashed here with
      # NoMethodError (nil.url); in that case every incoming entry is new.
      return feed.entries.dup if latest_entry.nil?
      found_new_entries = []
      feed.entries.each do |entry|
        break if entry.url == latest_entry.url
        found_new_entries << entry
      end
      found_new_entries
    end

    def existing_entry?(test_entry)
      entries.any? { |entry| entry.url == test_entry.url }
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # Parser for dealing with Atom feeds.
    #
    # == Attributes
    # * title
    # * feed_url
    # * url
    # * entries
    class Atom
      include SAXMachine
      include FeedUtilities
      element :title
      # The typed links pick out the HTML page and the feed itself; the
      # untyped collection keeps every <link> href as a fallback pool.
      element :link, :as => :url, :value => :href, :with => {:type => "text/html"}
      element :link, :as => :feed_url, :value => :href, :with => {:type => "application/atom+xml"}
      elements :link, :as => :links, :value => :href
      elements :entry, :as => :entries, :class => AtomEntry

      # Heuristic sniff on the head of the document; matched against the
      # first 2000 bytes only (see Feed.determine_feed_parser_for_xml).
      def self.able_to_parse?(xml) #:nodoc:
        xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})/
      end

      # Falls back to the last <link> seen when no text/html link was present.
      def url
        @url || links.last
      end

      # Falls back to the first <link> seen when no atom+xml link was present.
      def feed_url
        @feed_url || links.first
      end
    end
  end

end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module Feedzirra

  module Parser
    # == Summary
    # Parser for dealing with Atom feed entries.
    #
    # == Attributes
    # * title
    # * url
    # * author
    # * content
    # * summary
    # * published
    # * categories
    class AtomEntry
      include SAXMachine
      include FeedEntryUtilities
      element :title
      element :link, :as => :url, :value => :href, :with => {:type => "text/html", :rel => "alternate"}
      # FeedBurner rewrites entry links; origLink restores the original URL.
      element :"feedburner:origLink", :as => :url
      element :name, :as => :author
      element :content
      element :summary
      element :published
      element :id
      # Older Atom drafts used <created>/<issued>/<modified>; map them onto
      # the modern accessors so FeedEntryUtilities' date logic applies.
      element :created, :as => :published
      element :issued, :as => :published
      element :updated
      element :modified, :as => :updated
      elements :category, :as => :categories, :value => :term
      elements :link, :as => :links, :value => :href
      elements :link, :as => :enclosure_links, :value => :href, :with => {:rel => "enclosure"}

      # Falls back to the first <link> seen when no alternate text/html
      # link (or feedburner:origLink) was present.
      def url
        @url || links.first
      end
    end

  end

end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Feedzirra
  module Parser
    # Parser for a Media RSS <media:category> element: captures the element
    # text plus its scheme and label attributes.
    class MRSSCategory
      include SAXMachine

      element :'media:category', :as => :category
      element :'media:category', :value => :scheme, :as => :scheme
      element :'media:category', :value => :label, :as => :label
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/mrss_credit'
require File.dirname(__FILE__) + '/mrss_restriction'
require File.dirname(__FILE__) + '/mrss_category'
require File.dirname(__FILE__) + '/mrss_copyright'
require File.dirname(__FILE__) + '/mrss_hash'
require File.dirname(__FILE__) + '/mrss_player'
require File.dirname(__FILE__) + '/mrss_rating'
require File.dirname(__FILE__) + '/mrss_restriction'
require File.dirname(__FILE__) + '/mrss_text'
require File.dirname(__FILE__) + '/mrss_thumbnail'

module Feedzirra
  module Parser
    # Parser for a Media RSS <media:content> element: one attribute accessor
    # per attribute defined by the MRSS specification, plus the optional
    # child elements.
    class MRSSContent
      include SAXMachine

      element :'media:content', :as => :url, :value => :url
      element :'media:content', :as => :content_type, :value => :type
      element :'media:content', :as => :medium, :value => :medium
      element :'media:content', :as => :duration, :value => :duration
      element :'media:content', :as => :isDefault, :value => :isDefault
      element :'media:content', :as => :expression, :value => :expression
      element :'media:content', :as => :bitrate, :value => :bitrate
      element :'media:content', :as => :framerate, :value => :framerate
      # Fix: samplingrate previously read the (nonexistent) "sampling"
      # attribute; the MRSS attribute is "samplingrate".
      element :'media:content', :as => :samplingrate, :value => :samplingrate
      # Fix: channels previously read the "duration" attribute (copy-paste bug).
      element :'media:content', :as => :channels, :value => :channels
      element :'media:content', :as => :height, :value => :height
      element :'media:content', :as => :width, :value => :width
      element :'media:content', :as => :lang, :value => :lang
      element :'media:content', :as => :fileSize, :value => :fileSize

      # optional elements
      element :'media:title', :as => :media_title
      element :'media:keywords', :as => :media_keywords
      element :'media:description', :as => :media_description

      element :'media:thumbnail', :as => :media_thumbnail, :class => MRSSThumbnail
      element :'media:rating', :as => :rating, :class => MRSSRating
      element :'media:category', :as => :media_category, :class => MRSSCategory
      element :'media:hash', :as => :media_hash, :class => MRSSHash
      element :'media:player', :as => :media_player, :class => MRSSPlayer
      elements :'media:credit', :as => :credits, :class => MRSSCredit
      element :'media:copyright', :as => :copyright, :class => MRSSCopyright
      element :'media:restriction', :as => :media_restriction, :class => MRSSRestriction
      element :'media:text', :as => :text, :class => MRSSText
    end
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/mrss_content'
require File.dirname(__FILE__) + '/mrss_credit'
require File.dirname(__FILE__) + '/mrss_restriction'
require File.dirname(__FILE__) + '/mrss_group'
require File.dirname(__FILE__) + '/mrss_category'
require File.dirname(__FILE__) + '/mrss_copyright'
require File.dirname(__FILE__) + '/mrss_hash'
require File.dirname(__FILE__) + '/mrss_player'
require File.dirname(__FILE__) + '/mrss_rating'
# NOTE(review): mrss_restriction is required twice (here and above) — harmless,
# since require is idempotent, but one of the two could be dropped.
require File.dirname(__FILE__) + '/mrss_restriction'
require File.dirname(__FILE__) + '/mrss_text'
require File.dirname(__FILE__) + '/mrss_thumbnail'

module Feedzirra
  module Parser
    # Parser for a Media RSS <media:group> element: a container for several
    # <media:content> variants of the same media object, with the optional
    # metadata elements shared by the group.
    class MRSSGroup
      include SAXMachine

      elements :'media:content', :as => :media_content, :class => MRSSContent

      # optional elements
      element :'media:title', :as => :media_title
      element :'media:keywords', :as => :media_keywords
      element :'media:description', :as => :media_description

      element :'media:thumbnail', :as => :media_thumbnail, :class => MRSSThumbnail
      element :'media:rating', :as => :rating, :class => MRSSRating
      element :'media:category', :as => :media_category, :class => MRSSCategory
      element :'media:hash', :as => :media_hash, :class => MRSSHash
      element :'media:player', :as => :media_player, :class => MRSSPlayer
      elements :'media:credit', :as => :credits, :class => MRSSCredit
      element :'media:copyright', :as => :copyright, :class => MRSSCopyright
      element :'media:restriction', :as => :media_restriction, :class => MRSSRestriction
      element :'media:text', :as => :text, :class => MRSSText
    end
  end
end
|