Spectives-logophobia-feedzirra 0.0.31
- data/README.rdoc +169 -0
- data/README.textile +205 -0
- data/Rakefile +56 -0
- data/lib/core_ext/date.rb +21 -0
- data/lib/core_ext/string.rb +9 -0
- data/lib/feedzirra.rb +44 -0
- data/lib/feedzirra/feed.rb +333 -0
- data/lib/feedzirra/feed_entry_utilities.rb +45 -0
- data/lib/feedzirra/feed_utilities.rb +71 -0
- data/lib/feedzirra/parser/atom.rb +35 -0
- data/lib/feedzirra/parser/atom_entry.rb +41 -0
- data/lib/feedzirra/parser/itunes_category.rb +12 -0
- data/lib/feedzirra/parser/mrss_category.rb +11 -0
- data/lib/feedzirra/parser/mrss_content.rb +48 -0
- data/lib/feedzirra/parser/mrss_copyright.rb +10 -0
- data/lib/feedzirra/parser/mrss_credit.rb +11 -0
- data/lib/feedzirra/parser/mrss_group.rb +37 -0
- data/lib/feedzirra/parser/mrss_hash.rb +10 -0
- data/lib/feedzirra/parser/mrss_player.rb +11 -0
- data/lib/feedzirra/parser/mrss_rating.rb +10 -0
- data/lib/feedzirra/parser/mrss_restriction.rb +11 -0
- data/lib/feedzirra/parser/mrss_text.rb +13 -0
- data/lib/feedzirra/parser/mrss_thumbnail.rb +11 -0
- data/lib/feedzirra/parser/rss.rb +83 -0
- data/lib/feedzirra/parser/rss_entry.rb +83 -0
- data/lib/feedzirra/parser/rss_image.rb +15 -0
- data/spec/benchmarks/feed_benchmarks.rb +98 -0
- data/spec/benchmarks/feedzirra_benchmarks.rb +40 -0
- data/spec/benchmarks/fetching_benchmarks.rb +28 -0
- data/spec/benchmarks/parsing_benchmark.rb +30 -0
- data/spec/benchmarks/updating_benchmarks.rb +33 -0
- data/spec/feedzirra/feed_entry_utilities_spec.rb +52 -0
- data/spec/feedzirra/feed_spec.rb +546 -0
- data/spec/feedzirra/feed_utilities_spec.rb +149 -0
- data/spec/feedzirra/parser/atom_entry_spec.rb +49 -0
- data/spec/feedzirra/parser/atom_feed_burner_entry_spec.rb +42 -0
- data/spec/feedzirra/parser/atom_feed_burner_spec.rb +39 -0
- data/spec/feedzirra/parser/atom_spec.rb +43 -0
- data/spec/feedzirra/parser/mrss_content_spec.rb +32 -0
- data/spec/feedzirra/parser/rss_entry_spec.rb +154 -0
- data/spec/feedzirra/parser/rss_spec.rb +93 -0
- data/spec/sample_feeds/run_against_sample.rb +20 -0
- data/spec/spec_helper.rb +62 -0
- metadata +155 -0
data/lib/feedzirra.rb
ADDED
@@ -0,0 +1,44 @@
$LOAD_PATH.unshift(File.dirname(__FILE__)) unless $LOAD_PATH.include?(File.dirname(__FILE__))

gem 'activesupport'

require 'zlib'
require 'curb'
require 'sax-machine'
require 'dryopteris'
require 'uri'
require 'active_support/basic_object'
require 'active_support/core_ext/object'
require 'active_support/core_ext/time'


require 'core_ext/date'
require 'core_ext/string'

require 'feedzirra/feed_utilities'
require 'feedzirra/feed_entry_utilities'
require 'feedzirra/feed'

require 'feedzirra/parser/mrss_content'
require 'feedzirra/parser/mrss_credit'
require 'feedzirra/parser/mrss_restriction'
require 'feedzirra/parser/mrss_group'
require 'feedzirra/parser/mrss_category'
require 'feedzirra/parser/mrss_copyright'
require 'feedzirra/parser/mrss_hash'
require 'feedzirra/parser/mrss_player'
require 'feedzirra/parser/mrss_rating'
require 'feedzirra/parser/mrss_restriction'
require 'feedzirra/parser/mrss_text'
require 'feedzirra/parser/mrss_thumbnail'
require 'feedzirra/parser/rss_entry'
require 'feedzirra/parser/rss_image'
require 'feedzirra/parser/itunes_category'
require 'feedzirra/parser/atom_entry'

require 'feedzirra/parser/rss'
require 'feedzirra/parser/atom'

module Feedzirra
  VERSION = "0.0.31"
end
data/lib/feedzirra/feed.rb
ADDED
@@ -0,0 +1,333 @@
module Feedzirra
  class NoParserAvailable < StandardError; end

  class Feed
    USER_AGENT = "feedzirra http://github.com/pauldix/feedzirra/tree/master"

    # Takes a raw XML feed and attempts to parse it. If no parser is available a Feedzirra::NoParserAvailable exception is raised.
    #
    # === Parameters
    # [xml<String>] The XML that you would like parsed.
    # === Returns
    # An instance of the determined feed type. By default a Feedzirra::Atom, Feedzirra::AtomFeedBurner, Feedzirra::RDF, or Feedzirra::RSS object.
    # === Raises
    # Feedzirra::NoParserAvailable : If no valid parser classes could be found for the feed.
    def self.parse(xml)
      if parser = determine_feed_parser_for_xml(xml)
        parser.parse(xml)
      else
        raise NoParserAvailable.new("No valid parser for XML.")
      end
    end

    # Determines the correct parser class to use for parsing the feed.
    #
    # === Parameters
    # [xml<String>] The XML that you would like to determine the parser for.
    # === Returns
    # The class name of the parser that can handle the XML.
    def self.determine_feed_parser_for_xml(xml)
      start_of_doc = xml.slice(0, 2000)
      feed_classes.detect {|klass| klass.able_to_parse?(start_of_doc)}
    end

    # Adds a new feed parsing class that will be used for parsing.
    #
    # === Parameters
    # [klass<Constant>] The class/constant that you want to register.
    # === Returns
    # An updated array of feed parser class names.
    def self.add_feed_class(klass)
      feed_classes.unshift klass
    end

    # Provides a list of registered feed parsing classes.
    #
    # === Returns
    # An array of class names.
    def self.feed_classes
      @feed_classes ||= [
        Feedzirra::Parser::RSS,
        Feedzirra::Parser::Atom
      ]
    end

    # Makes all entry types look for the passed in element to parse. This is actually just a call to
    # element (a SAXMachine call) in the class.
    #
    # === Parameters
    # [element_tag<String>]
    # [options<Hash>] Valid keys are the same as with SAXMachine.
    def self.add_common_feed_entry_element(element_tag, options = {})
      # need to think of a better way to do this. will break for people who want this behavior
      # across their added classes
      feed_classes.map{|k| eval("#{k}Entry") }.each do |klass|
        klass.send(:element, element_tag, options)
      end
    end

    # Fetches and returns the raw XML for each URL provided.
    #
    # === Parameters
    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
    # [options<Hash>] Valid keys for this argument are as follows:
    #   :user_agent - String that overrides the default user agent.
    #   :if_modified_since - Time object representing when the feed was last updated.
    #   :if_none_match - String that's normally an etag for the request that was stored previously.
    #   :on_success - Block that gets executed after a successful request.
    #   :on_failure - Block that gets executed after a failed request.
    # === Returns
    # A String of XML if a single URL is passed.
    #
    # A Hash if multiple URLs are passed. The key will be the URL, and the value the XML.
    def self.fetch_raw(urls, options = {})
      url_queue = [*urls]
      multi = Curl::Multi.new
      responses = {}
      url_queue.each do |url|
        easy = Curl::Easy.new(url) do |curl|
          curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
          curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
          curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
          curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
          curl.follow_location = true
          curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)

          curl.max_redirects = options[:max_redirects] if options[:max_redirects]
          curl.timeout = options[:timeout] if options[:timeout]

          curl.on_success do |c|
            c = c.select { |e| e.kind_of? Curl::Easy }.first if c.kind_of?(Array)
            responses[url] = decode_content(c)
          end
          curl.on_failure do |c|
            c = c.select { |e| e.kind_of? Curl::Easy }.first if c.kind_of?(Array)
            responses[url] = c.response_code
          end
        end
        multi.add(easy)
      end

      multi.perform
      urls.is_a?(String) ? responses.values.first : responses
    end

    # Fetches and returns the parsed XML for each URL provided.
    #
    # === Parameters
    # [urls<String> or <Array>] A single feed URL, or an array of feed URLs.
    # [options<Hash>] Valid keys for this argument are as follows:
    # * :user_agent - String that overrides the default user agent.
    # * :if_modified_since - Time object representing when the feed was last updated.
    # * :if_none_match - String, an etag for the request that was stored previously.
    # * :on_success - Block that gets executed after a successful request.
    # * :on_failure - Block that gets executed after a failed request.
    # === Returns
    # A Feed object if a single URL is passed.
    #
    # A Hash if multiple URLs are passed. The key will be the URL, and the value the Feed object.
    def self.fetch_and_parse(urls, options = {})
      url_queue = [*urls]
      multi = Curl::Multi.new
      responses = {}

      # I broke these down so I would only try to do 30 simultaneously because
      # I was getting weird errors when doing a lot. As one finishes it pops another off the queue.
      url_queue.slice!(0, 30).each do |url|
        add_url_to_multi(multi, url, url_queue, responses, options)
      end

      multi.perform
      return urls.is_a?(String) ? responses.values.first : responses
    end

    # Decodes the XML document if it was compressed.
    #
    # === Parameters
    # [curl_request<Curl::Easy>] The Curl::Easy response object from the request.
    # === Returns
    # A decoded string of XML.
    def self.decode_content(c)
      if c.header_str.match(/Content-Encoding: gzip/)
        begin
          gz = Zlib::GzipReader.new(StringIO.new(c.body_str))
          xml = gz.read
          gz.close
        rescue Zlib::GzipFile::Error
          # Maybe this is not gzipped?
          xml = c.body_str
        end
      elsif c.header_str.match(/Content-Encoding: deflate/)
        xml = Zlib::Inflate.inflate(c.body_str)
      else
        xml = c.body_str
      end

      xml
    end

    # Updates each feed for each Feed object provided.
    #
    # === Parameters
    # [feeds<Feed> or <Array>] A single feed object, or an array of feed objects.
    # [options<Hash>] Valid keys for this argument are as follows:
    # * :user_agent - String that overrides the default user agent.
    # * :on_success - Block that gets executed after a successful request.
    # * :on_failure - Block that gets executed after a failed request.
    # === Returns
    # An updated Feed object if a single URL is passed.
    #
    # A Hash if multiple Feeds are passed. The key will be the URL, and the value the updated Feed object.
    def self.update(feeds, options = {})
      feed_queue = [*feeds]
      multi = Curl::Multi.new
      responses = {}

      feed_queue.slice!(0, 30).each do |feed|
        add_feed_to_multi(multi, feed, feed_queue, responses, options)
      end

      multi.perform
      responses.size == 1 ? responses.values.first : responses.values
    end

    # An abstraction for adding a feed by URL to the passed Curb::multi stack.
    #
    # === Parameters
    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
    # [url<String>] The URL of the feed that you would like to be fetched.
    # [url_queue<Array>] An array of URLs that are queued for request.
    # [responses<Hash>] Existing responses that you want the response from the request added to.
    # [feeds<String> or <Array>] A single feed object, or an array of feed objects.
    # [options<Hash>] Valid keys for this argument are as follows:
    # * :user_agent - String that overrides the default user agent.
    # * :on_success - Block that gets executed after a successful request.
    # * :on_failure - Block that gets executed after a failed request.
    # === Returns
    # The updated Curl::Multi object with the request details added to its stack.
    def self.add_url_to_multi(multi, url, url_queue, responses, options)
      easy = Curl::Easy.new(url) do |curl|
        curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
        curl.headers["If-Modified-Since"] = options[:if_modified_since].httpdate if options.has_key?(:if_modified_since)
        curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
        curl.headers["Accept-encoding"] = 'gzip, deflate' if options.has_key?(:compress)
        curl.follow_location = true
        curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)

        curl.max_redirects = options[:max_redirects] if options[:max_redirects]
        curl.timeout = options[:timeout] if options[:timeout]

        curl.on_success do |c|
          c = c.select { |e| e.kind_of? Curl::Easy }.first if c.kind_of?(Array)
          add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
          xml = decode_content(c)
          klass = determine_feed_parser_for_xml(xml)

          if klass
            begin
              feed = klass.parse(xml)
              feed.feed_url = c.last_effective_url
              feed.etag = etag_from_header(c.header_str)
              feed.last_modified = last_modified_from_header(c.header_str)
              responses[url] = feed
              options[:on_success].call(url, feed) if options.has_key?(:on_success)
            rescue Exception => e
              options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
            end
          else
            # puts "Error determining parser for #{url} - #{c.last_effective_url}"
            # raise NoParserAvailable.new("no valid parser for content.") (this would unfortunately fail the whole 'multi', so it's not really usable)
            options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
          end
        end

        curl.on_failure do |c|
          c = c.select { |e| e.kind_of? Curl::Easy }.first if c.kind_of?(Array)
          add_url_to_multi(multi, url_queue.shift, url_queue, responses, options) unless url_queue.empty?
          responses[url] = c.response_code
          options[:on_failure].call(url, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
        end
      end
      multi.add(easy)
    end

    # An abstraction for adding a feed by a Feed object to the passed Curb::multi stack.
    #
    # === Parameters
    # [multi<Curl::Multi>] The Curl::Multi object that the request should be added to.
    # [feed<Feed>] A feed object that you would like to be fetched.
    # [feed_queue<Array>] An array of feed objects that are queued for request.
    # [responses<Hash>] Existing responses that you want the response from the request added to.
    # [feeds<String> or <Array>] A single feed object, or an array of feed objects.
    # [options<Hash>] Valid keys for this argument are as follows:
    # * :user_agent - String that overrides the default user agent.
    # * :on_success - Block that gets executed after a successful request.
    # * :on_failure - Block that gets executed after a failed request.
    # === Returns
    # The updated Curl::Multi object with the request details added to its stack.
    def self.add_feed_to_multi(multi, feed, feed_queue, responses, options)
      easy = Curl::Easy.new(feed.feed_url) do |curl|
        curl.headers["User-Agent"] = (options[:user_agent] || USER_AGENT)
        curl.headers["If-Modified-Since"] = feed.last_modified.httpdate if feed.last_modified
        curl.headers["If-None-Match"] = feed.etag if feed.etag
        curl.userpwd = options[:http_authentication].join(':') if options.has_key?(:http_authentication)
        curl.follow_location = true

        curl.max_redirects = options[:max_redirects] if options[:max_redirects]
        curl.timeout = options[:timeout] if options[:timeout]

        curl.on_success do |c|
          c = c.select { |e| e.kind_of? Curl::Easy }.first if c.kind_of?(Array)
          begin
            add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
            updated_feed = Feed.parse(c.body_str)
            updated_feed.feed_url = c.last_effective_url
            updated_feed.etag = etag_from_header(c.header_str)
            updated_feed.last_modified = last_modified_from_header(c.header_str)
            feed.update_from_feed(updated_feed)
            responses[feed.feed_url] = feed
            options[:on_success].call(feed) if options.has_key?(:on_success)
          rescue Exception => e
            options[:on_failure].call(feed, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
          end
        end

        curl.on_failure do |c|
          c = c.select { |e| e.kind_of? Curl::Easy }.first if c.kind_of?(Array)
          add_feed_to_multi(multi, feed_queue.shift, feed_queue, responses, options) unless feed_queue.empty?
          response_code = c.response_code
          if response_code == 304 # it's not modified. this isn't an error condition
            responses[feed.feed_url] = feed
            options[:on_success].call(feed) if options.has_key?(:on_success)
          else
            responses[feed.feed_url] = c.response_code
            options[:on_failure].call(feed, c.response_code, c.header_str, c.body_str) if options.has_key?(:on_failure)
          end
        end
      end
      multi.add(easy)
    end

    # Determines the etag from the request headers.
    #
    # === Parameters
    # [header<String>] Raw request header returned from the request
    # === Returns
    # A string of the etag or nil if it cannot be found in the headers.
    def self.etag_from_header(header)
      header =~ /.*ETag:\s(.*)\r/
      $1
    end

    # Determines the last modified date from the request headers.
    #
    # === Parameters
    # [header<String>] Raw request header returned from the request
    # === Returns
    # A Time object of the last modified date or nil if it cannot be found in the headers.
    def self.last_modified_from_header(header)
      header =~ /.*Last-Modified:\s(.*)\r/
      Time.parse($1) if $1
    end
  end
end
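Usage sketch (illustrative, not part of the gem source): the calls below exercise the Feed class defined above, using the options its RDoc documents; the URLs and file path are placeholders.

require 'feedzirra'

# Parse XML you already have; raises Feedzirra::NoParserAvailable if neither
# the RSS nor the Atom parser claims the document.
feed = Feedzirra::Feed.parse(File.read("some_feed.xml"))

# Fetch and parse several URLs at once. With an Array the result is a Hash
# keyed by URL; failed requests map to the HTTP response code instead.
feeds = Feedzirra::Feed.fetch_and_parse(
  ["http://example.com/a.xml", "http://example.com/b.xml"],
  :user_agent => "my-reader/1.0",
  :on_success => lambda { |url, parsed| puts "#{url}: #{parsed.title}" },
  :on_failure => lambda { |url, code, headers, body| warn "#{url} failed with #{code}" }
)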
data/lib/feedzirra/feed_entry_utilities.rb
ADDED
@@ -0,0 +1,45 @@
module Feedzirra
  module FeedEntryUtilities
    def published
      @published || @updated
    end

    def parse_datetime(string)
      begin
        DateTime.parse(string).feed_utils_to_gm_time
      rescue
        puts "DATE CAN'T BE PARSED: #{string}"
        nil
      end
    end

    ##
    # Returns the id of the entry, or its url if no id is present, as some formats don't support it.
    def id
      @id || @url
    end

    ##
    # Writer for published. By default, we keep the "oldest" publish time found.
    def published=(val)
      parsed = parse_datetime(val)
      @published = parsed if !@published || parsed < @published
    end

    ##
    # Writer for updated. By default, we keep the most recent update time found.
    def updated=(val)
      parsed = parse_datetime(val)
      @updated = parsed if !@updated || parsed > @updated
    end

    def sanitize!
      self.title.sanitize! if self.title
      self.author.sanitize! if self.author
      self.summary.sanitize! if self.summary
      self.content.sanitize! if self.content
    end

    alias_method :last_modified, :published
  end
end
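Illustrative sketch (DemoEntry is not a gem class): FeedEntryUtilities is mixed into the SAX-backed entry classes such as Feedzirra::Parser::RSSEntry; a bare class is enough to show the fallback and "oldest published / newest updated" behaviour defined above.

require 'feedzirra'

class DemoEntry
  include Feedzirra::FeedEntryUtilities
  attr_accessor :url, :title, :author, :summary, :content
end

entry = DemoEntry.new
entry.url = "http://example.com/post/1"
entry.published = "Mon, 05 Jan 2009 12:00:00 GMT"
entry.published = "Tue, 06 Jan 2009 12:00:00 GMT" # ignored: the oldest publish time is kept
entry.updated   = "Wed, 07 Jan 2009 12:00:00 GMT" # kept: the newest update time wins

entry.id        # => "http://example.com/post/1", falling back to url when no id was parsed
entry.published # => Time for 2009-01-05 12:00:00 GMT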
data/lib/feedzirra/feed_utilities.rb
ADDED
@@ -0,0 +1,71 @@
module Feedzirra
  module FeedUtilities
    UPDATABLE_ATTRIBUTES = %w(title feed_url url last_modified)

    attr_writer :new_entries, :updated, :last_modified
    attr_accessor :etag

    def last_modified
      @last_modified ||= begin
        entry = entries.reject {|e| e.published.nil? }.sort_by { |entry| entry.published if entry.published }.last
        entry ? entry.published : nil
      end
    end

    def updated?
      @updated
    end

    def new_entries
      @new_entries ||= []
    end

    def has_new_entries?
      new_entries.size > 0
    end

    def update_from_feed(feed)
      self.new_entries += find_new_entries_for(feed)
      self.entries.unshift(*self.new_entries)

      updated! if UPDATABLE_ATTRIBUTES.any? { |name| update_attribute(feed, name) }
    end

    def update_attribute(feed, name)
      old_value, new_value = send(name), feed.send(name)

      if old_value != new_value
        send("#{name}=", new_value)
      end
    end

    def sanitize_entries!
      entries.each {|entry| entry.sanitize!}
    end

    private

    def updated!
      @updated = true
    end

    def find_new_entries_for(feed)
      # this implementation is a hack, which is why it's so ugly.
      # it's to get around the fact that not all feeds have a published date.
      # however, they're always ordered with the newest one first.
      # So we go through the entries just parsed and insert each one as a new entry
      # until we get to one that has the same url as the newest for the feed
      latest_entry = self.entries.first
      found_new_entries = []
      feed.entries.each do |entry|
        break if entry.url == latest_entry.url
        found_new_entries << entry
      end
      found_new_entries
    end

    def existing_entry?(test_entry)
      entries.any? { |entry| entry.url == test_entry.url }
    end
  end
end
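Usage sketch (illustrative; Feedzirra::Feed.update does essentially this internally): refreshing a previously parsed feed through FeedUtilities#update_from_feed; the URL is a placeholder.

require 'feedzirra'

feed = Feedzirra::Feed.fetch_and_parse("http://example.com/feed.xml")

# Later, fetch the same feed again and merge the newer copy into the old object.
newer = Feedzirra::Feed.fetch_and_parse("http://example.com/feed.xml")
feed.update_from_feed(newer)

feed.updated?         # => true if title, url, feed_url or last_modified changed
feed.has_new_entries? # => true if entries were published since the first fetch
feed.new_entries      # => just the entries that update_from_feed prepended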