feedtools 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ == FeedTools 0.1.0
2
+ * basic support for rss, atom, cdf
3
+ * basic caching using active record
4
+ * support for etags
data/README ADDED
@@ -0,0 +1,13 @@
1
+ FeedTools was designed to be a simple XML feed parser, generator, and translator with a built-in
2
+ caching system.
3
+
4
+ == Example
5
+ slashdot_feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
6
+ slashdot_feed.title
7
+ => "Slashdot"
8
+ slashdot_feed.description
9
+ => "News for nerds, stuff that matters"
10
+ slashdot_feed.link
11
+ => "http://slashdot.org/"
12
+ slashdot_feed.items.first.find_node("slash:hitparade/text()").to_s
13
+ => "43,37,28,23,11,3,1"
@@ -0,0 +1,30 @@
1
+ require 'rbconfig'
2
+ require 'find'
3
+ require 'ftools'
4
+
5
include Config

# This was adapted from rdoc's install.rb by way of Log4r.
#
# Works out the site library directory ($sitedir) and copies the contents of
# lib/ into it, preserving the directory layout.
# NOTE(review): 'ftools' and the top-level Config module only exist on old
# Rubies; a port to FileUtils / RbConfig would be needed for current versions.

$sitedir = CONFIG["sitelibdir"]
unless $sitedir
  version = CONFIG["MAJOR"] + "." + CONFIG["MINOR"]
  $libdir = File.join(CONFIG["libdir"], "ruby", version)
  $sitedir = $:.find { |x| x =~ /site_ruby/ }
  if !$sitedir
    $sitedir = File.join($libdir, "site_ruby")
  elsif !$sitedir.include?(version)
    # String#=~ raises a TypeError when handed a String, so the version check
    # is done with a plain substring test instead of `!~ Regexp.quote(version)`.
    $sitedir = File.join($sitedir, version)
  end
end

# the actual gruntwork
Dir.chdir("lib")

Find.find("feed_tools", "feed_tools.rb") do |f|
  destination = File.join($sitedir, *f.split(/\//))
  if File.directory?(f)
    # Find yields a directory before its contents, so the target directory
    # exists by the time the files inside it are installed.
    File::makedirs(destination)
  elsif f[-3..-1] == ".rb"
    File::install(f, destination, 0644, true)
  end
  # Anything else (non-Ruby files) is skipped; previously such files were
  # turned into directories at the destination.
end
@@ -0,0 +1,1126 @@
1
+ #--
2
+ # Copyright (c) 2005 Robert Aman
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
+ FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] || ENV['RAILS_ENV'] || 'production'
25
+
26
+ $:.unshift(File.dirname(__FILE__))
27
+ $:.unshift(File.dirname(__FILE__) + "/../../activerecord/lib")
28
+
29
+ begin
30
+ require 'active_record'
31
+ rescue LoadError
32
+ require 'rubygems'
33
+ require_gem 'activerecord'
34
+ end
35
+
36
+ begin
37
+ require 'rubygems'
38
+ require 'builder'
39
+ rescue LoadError
40
+ # RubyGems is not available, use included Builder
41
+ $:.unshift(File.dirname(__FILE__) + "/feed_tools/vendor")
42
+ require 'feed_tools/vendor/builder'
43
+ end
44
+
45
+ require 'open-uri'
46
+ require 'time'
47
+ require 'rexml/document'
48
+ require 'yaml'
49
+ require 'cgi'
50
+
51
+ module FeedTools
52
+ class Feed < ActiveRecord::Base
53
+ include REXML
54
+
55
+ has_many :feed_items_unsorted, :class_name => "FeedItem"
56
+
57
+ def initialize
58
+ @live = false
59
+ @feed_items_unsorted = nil
60
+ super
61
+ end
62
+
63
+ # Loads the feed specified by the url, pulling the data from the cache if it hasn't expired
64
+ # Be aware that this method translates from the feed: and rss: pseudo-protocols to the
65
+ # http: protocol as needed. This means that if you pass in a feed url that looks like
66
+ # 'feed://www.anywhere.com/feed.xml' it will end up being stored in the cache as
67
+ # 'http://www.anywhere.com/feed.xml' instead. This does affect the usage of methods like
68
+ # find_by_url, but otherwise should be fairly transparent.
69
+ def Feed.open(url)
70
+ # deal with all of the ugly possibilities involved in the rss: and feed: pseudo-protocols
71
+ if (url =~ /feed:/) == 0
72
+ url = url.gsub(/feed:\/\/http:\/\/\//, "http://")
73
+ url = url.gsub(/feed:\/\/http:\/\//, "http://")
74
+ url = url.gsub(/feed:http:\/\/\//, "http://")
75
+ url = url.gsub(/feed:http:\/\//, "http://")
76
+ url = url.gsub(/feed:\/\/\//, "http://")
77
+ url = url.gsub(/feed:\/\//, "http://")
78
+ url = url.gsub(/feed:\//, "http://")
79
+ url = url.gsub(/feed:/, "http://")
80
+ end
81
+ if (url =~ /rss:/) == 0
82
+ url = url.gsub(/rss:\/\/http:\/\/\//, "http://")
83
+ url = url.gsub(/rss:\/\/http:\/\//, "http://")
84
+ url = url.gsub(/rss:http:\/\/\//, "http://")
85
+ url = url.gsub(/rss:http:\/\//, "http://")
86
+ url = url.gsub(/rss:\/\/\//, "http://")
87
+ url = url.gsub(/rss:\/\//, "http://")
88
+ url = url.gsub(/rss:\//, "http://")
89
+ url = url.gsub(/rss:/, "http://")
90
+ end
91
+
92
+ feed = nil
93
+ begin
94
+ feed = Feed.find_by_url(url)
95
+ rescue ActiveRecord::StatementInvalid
96
+ # make sure that the necessary tables are present and recover if possible
97
+ FeedTools::Feed.prepare_connection
98
+ unless FeedTools::Feed.cache_exists?
99
+ FeedTools::Feed.create_cache
100
+ end
101
+ feed = Feed.find_by_url(url)
102
+ end
103
+ unless feed.nil?
104
+ feed.update_if_needed
105
+ else
106
+ feed = Feed.new
107
+ feed.url = url
108
+ feed.load_remote_feed
109
+ end
110
+ return feed
111
+ end
112
+
113
+ # Checks if the feed has expired and updates if it has
114
+ def update_if_needed
115
+ if expired?
116
+ load_remote_feed
117
+ end
118
+ end
119
+
120
+ # Verifies that the table structure exists
121
+ def Feed.cache_exists?
122
+ return Feed.table_exists? && FeedItem.table_exists?
123
+ end
124
+
125
+ # Verifies that the required fields exist; additional ones added by the user are fine
126
+ def Feed.table_exists?
127
+ begin
128
+ connection.execute "select id, url, link, image_link, title, description, " +
129
+ "tags, last_updated, etag, time_to_live from feeds limit 1"
130
+ rescue ActiveRecord::StatementInvalid
131
+ return false
132
+ end
133
+ return true
134
+ end
135
+
136
+ # Generates the table structure if necessary
137
+ def Feed.create_cache
138
+ unless Feed.cache_exists?
139
+ feed_items_mysql = <<-SQL_END
140
+ CREATE TABLE `feed_items` (
141
+ `id` int(6) unsigned NOT NULL auto_increment,
142
+ `feed_id` int(6) unsigned NOT NULL default '0',
143
+ `link` varchar(255) default NULL,
144
+ `title` varchar(255) default NULL,
145
+ `author` varchar(255) default NULL,
146
+ `description` text default NULL,
147
+ `time` datetime NOT NULL default '0000-00-00 00:00:00',
148
+ `tags` varchar(255) default NULL,
149
+ PRIMARY KEY (`id`)
150
+ ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
151
+ SQL_END
152
+ feed_items_sqlite = <<-SQL_END
153
+ CREATE TABLE 'feed_items' (
154
+ 'id' INTEGER PRIMARY KEY NOT NULL,
155
+ 'feed_id' INTEGER NOT NULL,
156
+ 'link' VARCHAR(255) DEFAULT NULL,
157
+ 'title' VARCHAR(255) DEFAULT NULL,
158
+ 'author' VARCHAR(255) DEFAULT NULL,
159
+ 'description' TEXT DEFAULT NULL,
160
+ 'time' DATETIME DEFAULT NULL,
161
+ 'tags' VARCHAR(255) DEFAULT NULL
162
+ );
163
+ SQL_END
164
+ feed_items_psql = <<-SQL_END
165
+ CREATE TABLE feed_items (
166
+ id SERIAL PRIMARY KEY NOT NULL,
167
+ feed_id int REFERENCES feeds,
168
+ link varchar(255) default NULL,
169
+ title varchar(255) default NULL,
170
+ author varchar(255) default NULL,
171
+ description text default NULL,
172
+ time datetime default NULL,
173
+ tags varchar(255) default NULL
174
+ );
175
+ SQL_END
176
+ unless FeedItem.table_exists?
177
+ table_creation_sql = nil
178
+ if configurations["adapter"] == "mysql"
179
+ table_creation_sql = feed_items_mysql
180
+ elsif configurations["adapter"] == "sqlite"
181
+ table_creation_sql = feed_items_sqlite
182
+ elsif configurations["adapter"] == "postgresql"
183
+ table_creation_sql = feeds_psql
184
+ end
185
+ if table_creation_sql.nil?
186
+ raise "Could not build feed_items table."
187
+ else
188
+ connection.execute table_creation_sql
189
+ end
190
+ end
191
+ feeds_mysql = <<-SQL_END
192
+ CREATE TABLE `feeds` (
193
+ `id` int(6) unsigned NOT NULL auto_increment,
194
+ `url` varchar(255) NOT NULL default '',
195
+ `link` varchar(255) NOT NULL default '',
196
+ `image_link` varchar(255) default NULL,
197
+ `title` varchar(255) default NULL,
198
+ `description` text default NULL,
199
+ `tags` varchar(255) default NULL,
200
+ `last_updated` datetime default NULL,
201
+ `etag` varchar(255) default NULL,
202
+ `time_to_live` int(4) default NULL,
203
+ PRIMARY KEY (`id`)
204
+ ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
205
+ SQL_END
206
+ feeds_sqlite = <<-SQL_END
207
+ CREATE TABLE 'feeds' (
208
+ 'id' INTEGER PRIMARY KEY NOT NULL,
209
+ 'url' VARCHAR(255) DEFAULT NULL,
210
+ 'link' VARCHAR(255) DEFAULT NULL,
211
+ 'image_link' VARCHAR(255) DEFAULT NULL,
212
+ 'title' VARCHAR(255) DEFAULT NULL,
213
+ 'description' TEXT DEFAULT NULL,
214
+ 'tags' VARCHAR(255) DEFAULT NULL,
215
+ 'last_updated' DATETIME DEFAULT NULL,
216
+ 'etag' VARCHAR(255) DEFAULT NULL,
217
+ 'time_to_live' INTEGER DEFAULT NULL
218
+ );
219
+ SQL_END
220
+ feeds_psql = <<-SQL_END
221
+ CREATE TABLE feeds (
222
+ id SERIAL PRIMARY KEY NOT NULL,
223
+ url varchar(255) default NULL,
224
+ link varchar(255) default NULL,
225
+ image_link varchar(255) default NULL,
226
+ title varchar(255) default NULL,
227
+ description text default NULL,
228
+ tags varchar(255) default NULL,
229
+ last_updated datetime default NULL,
230
+ etag varchar(255) default NULL,
231
+ time_to_live int default NULL
232
+ );
233
+ SQL_END
234
+ unless Feed.table_exists?
235
+ table_creation_sql = nil
236
+ if configurations["adapter"] == "mysql"
237
+ table_creation_sql = feeds_mysql
238
+ elsif configurations["adapter"] == "sqlite"
239
+ table_creation_sql = feeds_sqlite
240
+ elsif configurations["adapter"] == "postgresql"
241
+ table_creation_sql = feeds_psql
242
+ end
243
+ if table_creation_sql.nil?
244
+ raise "Could not build feed_items table."
245
+ else
246
+ connection.execute table_creation_sql
247
+ end
248
+ end
249
+ end
250
+ end
251
+
252
    # Removes all feed entries from the cache: every FeedItem row is deleted
    # first, then every Feed row.
    # This could obviously be a very dangerous operation if you use the cache for more than simply
    # caching the feeds.
    def Feed.clear_cache
      FeedItem.delete_all
      Feed.delete_all
    end
259
+
260
    # Removes all feed items from the cache and resets the last updated time for all feeds
    # to NULL, which expired? treats as "expired". The Feed rows themselves are kept.
    # This is probably much safer than the clear_cache method
    def Feed.expire_cache
      FeedItem.delete_all
      Feed.update_all("last_updated = NULL")
    end
266
+
267
+ # Removes all feed items older than the specified number of seconds
268
+ def Feed.purge_cache(purge_time=1.week)
269
+ purge_date = (Time.now - purge_time).strftime("%Y-%m-%d %H:%M:%S")
270
+ FeedItem.delete_all("time < '#{purge_date}'")
271
+ end
272
+
273
+ # If ActiveRecord is not already connected, attempts to find a configuration file and use
274
+ # it to open a connection for ActiveRecord.
275
+ # This method is probably unnecessary for anything but testing and debugging purposes.
276
+ def Feed.prepare_connection
277
+ begin
278
+ ActiveRecord::Base.connection
279
+ rescue
280
+ possible_config_files = [
281
+ "./config/database.yml",
282
+ "./database.yml"
283
+ ]
284
+ database_config_file = nil
285
+ for file in possible_config_files
286
+ if File.exists? file
287
+ database_config_file = file
288
+ break
289
+ end
290
+ end
291
+ database_config_hash = File.open(database_config_file) do |file|
292
+ config_hash = YAML::load(file)
293
+ unless config_hash[FEED_TOOLS_ENV].nil?
294
+ config_hash = config_hash[FEED_TOOLS_ENV]
295
+ end
296
+ config_hash
297
+ end
298
+ ActiveRecord::Base.configurations = database_config_hash
299
+ ActiveRecord::Base.establish_connection(database_config_hash)
300
+ ActiveRecord::Base.connection
301
+ end
302
+ end
303
+
304
+ def Feed.cache_enabled?
305
+ return true
306
+ end
307
+
308
+ def title
309
+ return (self["title"] or "Untitled Feed")
310
+ end
311
+
312
+ # Optional feed attribute.
313
+ # If you want to use it, the database table needs to have a language field added, otherwise
314
+ # it will just default to "en-US".
315
+ def language
316
+ begin
317
+ return (self["language"] or "en-US")
318
+ rescue
319
+ return "en-US"
320
+ end
321
+ end
322
+
323
+ def live?
324
+ if @live
325
+ return true
326
+ else
327
+ return false
328
+ end
329
+ end
330
+
331
+ def expired?
332
+ return last_updated == nil || (last_updated + time_to_live) < Time.now
333
+ end
334
+
335
    # Forces this feed to expire.
    # Deletes this feed's cached items, drops the in-memory item list, and
    # back-dates last_updated to 1980 so that expired? reports true; the change
    # is then saved.
    def expire
      FeedItem.delete_all("feed_id = '#{self.id}'")
      @feed_items_unsorted = nil
      self.last_updated = Time.mktime(1980)
      self.save
    end
342
+
343
+ # The ammount of time in seconds between the last time the feed was updated and the next
344
+ # valid time to retrieve a remote feed.
345
+ def time_to_live
346
+ return self['time_to_live'].nil? ? 1.hour : self['time_to_live'].hour
347
+ end
348
+
349
+ def tag_list
350
+ return tags.nil? ? nil : tags[1..-2].split("|")
351
+ end
352
+
353
+ def tag_list=(new_tag_list)
354
+ self.tags = "|" + (new_tag_list.map { |x| x.strip }).join("|") + "|"
355
+ end
356
+
357
+ def tag_string
358
+ return (tags.nil? ? nil : tags[1..-2]).split("|").join(", ")
359
+ end
360
+
361
+ def tag_string=(new_tag_string)
362
+ self.tags = "|" + (new_tag_string.split(",").map { |x| x.strip }).join("|") + "|"
363
+ end
364
+
365
+ # Returns a list of the feed_items, sorted by date
366
+ def feed_items
367
+ begin
368
+ if @feed_items_unsorted.nil?
369
+ @feed_items_unsorted = feed_items_unsorted
370
+ end
371
+ return @feed_items_unsorted.sort do |a,b|
372
+ b.time <=> a.time
373
+ end
374
+ rescue
375
+ unless @feed_items_unsorted.nil?
376
+ return @feed_items_unsorted
377
+ else
378
+ return feed_items_unsorted
379
+ end
380
+ end
381
+ end
382
+
383
+ # Attempts to load the feed from the remote location. Requires the url to be set.
384
+ # If an etag has been set, attempts to use it to prevent unnecessary reloading of identical
385
+ # content.
386
+ def load_remote_feed
387
+ @live = true
388
+ self.last_updated = Time.now
389
+ if (etag != nil)
390
+ # TODO: verify that the etag code works as intended
391
+ # -> may need to check what gets returned when the
392
+ # etag is matched
393
+ # =================================================
394
+ open(url, "If-None-Match" => @etag ) do |http|
395
+ etag = http.meta['etag']
396
+ parse_feed(http.read)
397
+ end
398
+ else
399
+ open(url) do |http|
400
+ etag = http.meta['etag']
401
+ parse_feed(http.read)
402
+ end
403
+ end
404
+ end
405
+
406
+ def parse_feed_hook(feed_data)
407
+ return nil
408
+ end
409
+
410
+ def parse_feed(feed_data)
411
+ root_node = Document.new(feed_data).root
412
+ metadata_node = XPath.first(root_node, "channel")
413
+ if metadata_node == nil
414
+ metadata_node = root_node
415
+ end
416
+
417
+ # get the feed title
418
+ title = XPath.first(metadata_node, "title/text()").to_s
419
+
420
+ # is the title escaped?
421
+ if XPath.first(metadata_node, "title/@mode").to_s == "escaped"
422
+ title = CGI.unescapeHTML(title)
423
+ end
424
+
425
+ # get the feed link
426
+ link = XPath.first(metadata_node, "link[@rel='alternate' @type='text/html']/@href").to_s
427
+ if link == ""
428
+ link = XPath.first(metadata_node, "link[@rel='alternate']/@href").to_s
429
+ end
430
+ if link == ""
431
+ link = XPath.first(metadata_node, "link/@href").to_s
432
+ end
433
+ if link == ""
434
+ link = XPath.first(metadata_node, "link/text()").to_s
435
+ end
436
+ if link == ""
437
+ # The ordering here is somewhat incorrect, but the more correct ordering would
438
+ # introduce much more serious problems, so I've chosen to go with the lesser of two
439
+ # evils. (The completely correct implementation would require a vestigial 'base' method
440
+ # on the Feed class to fully support CDF files. This method will support almost all CDF
441
+ # files without any unnecessary methods.) But given that this only exists to support
442
+ # CDF files, it's not a big deal. It's not like CDF files really exist in the wild.
443
+ # (The assumption this ordering makes is that the 'base' attribute points to a valid
444
+ # location, hopefully the same as the 'href' location. Chances are pretty good that this
445
+ # is true.)
446
+ link = XPath.first(metadata_node, "@base").to_s
447
+ end
448
+ if link == ""
449
+ link = XPath.first(metadata_node, "@href").to_s
450
+ end
451
+
452
+ # get the feed description
453
+ description = XPath.first(metadata_node, "description/text()").to_s
454
+ if description != ""
455
+ if XPath.first(metadata_node, "description/@encoding").to_s != ""
456
+ description = "[Embedded data objects are not supported.]"
457
+ else
458
+ description = CGI.unescapeHTML(description)
459
+ end
460
+ end
461
+ if description == ""
462
+ description = XPath.first(metadata_node, "tagline/text()").to_s
463
+ if description != "" && XPath.first(metadata_node, "tagline/@mode").to_s == "escaped"
464
+ description = CGI.unescapeHTML(description)
465
+ end
466
+ end
467
+ if description == "" && XPath.first(metadata_node, "tagline") == nil
468
+ description = XPath.first(metadata_node, "info/text()").to_s
469
+ if description != "" && XPath.first(metadata_node, "info/@mode").to_s == "escaped"
470
+ description = CGI.unescapeHTML(description)
471
+ end
472
+ end
473
+ if description == ""
474
+ description = CGI.unescapeHTML(XPath.first(metadata_node, "abstract/text()").to_s)
475
+ end
476
+
477
+ # get the image link
478
+ image_link = XPath.first(metadata_node, "image/url/text()").to_s
479
+ if image_link == ""
480
+ image_link = XPath.first(metadata_node, "image/@rdf:resource").to_s
481
+ end
482
+ if image_link == ""
483
+ image_link = XPath.first(metadata_node, "link[@type='image/jpeg']/@href").to_s
484
+ end
485
+ if image_link == ""
486
+ image_link = XPath.first(metadata_node, "link[@type='image/gif']/@href").to_s
487
+ end
488
+ if image_link == ""
489
+ image_link = XPath.first(metadata_node, "link[@type='image/png']/@href").to_s
490
+ end
491
+ if image_link == ""
492
+ image_link = XPath.first(metadata_node, "logo[@style='image']/@href").to_s
493
+ end
494
+ if image_link == ""
495
+ image_link = XPath.first(metadata_node, "logo/@href").to_s
496
+ end
497
+
498
+ # get the feed time to live (expressed in hours)
499
+ feed_time_to_live = nil
500
+ update_frequency = XPath.first(metadata_node, "syn:updateFrequency/text()").to_s
501
+ if update_frequency != ""
502
+ update_period = XPath.first(metadata_node, "syn:updatePeriod/text()").to_s
503
+ if update_period == "daily"
504
+ feed_time_to_live = update_frequency.to_i * 24
505
+ elsif update_period == "weekly"
506
+ feed_time_to_live = update_frequency.to_i * 24 * 7
507
+ elsif update_period == "monthly"
508
+ feed_time_to_live = update_frequency.to_i * 24 * 30
509
+ elsif update_period == "yearly"
510
+ feed_time_to_live = update_frequency.to_i * 24 * 365
511
+ else
512
+ # hourly
513
+ feed_time_to_live = update_frequency.to_i
514
+ end
515
+ end
516
+ if feed_time_to_live == nil
517
+ # expressed in minutes
518
+ update_frequency = XPath.first(metadata_node, "ttl/text()").to_s
519
+ if update_frequency != ""
520
+ feed_time_to_live = (update_frequency.to_i / 60)
521
+ end
522
+ end
523
+
524
+ # TODO: handle time_to_live for CDF files
525
+ # =======================================
526
+
527
+ # get the feed items
528
+ items = XPath.match(root_node, "item")
529
+ if items == nil || items == []
530
+ items = XPath.match(metadata_node, "item")
531
+ end
532
+ if items == nil || items == []
533
+ items = XPath.match(metadata_node, "entry")
534
+ end
535
+
536
+ # set all of the properties
537
+ if title != ""
538
+ self.title = title
539
+ else
540
+ self.title = nil
541
+ end
542
+ if link != ""
543
+ self.link = link
544
+ else
545
+ self.link = nil
546
+ end
547
+ if description != ""
548
+ self.description = description
549
+ else
550
+ self.description = nil
551
+ end
552
+ if image_link != ""
553
+ self.image_link = image_link
554
+ else
555
+ self.image_link = nil
556
+ end
557
+ if feed_time_to_live != nil
558
+ self.time_to_live = feed_time_to_live
559
+ else
560
+ self.time_to_live = nil
561
+ end
562
+
563
+ parse_feed_hook(feed_data)
564
+ if Feed.cache_enabled?
565
+ save
566
+ end
567
+
568
+ # check and make sure we don't have any cached feed_items with a nil link
569
+ # if we do, we need to start from scratch to avoid duplicates
570
+ for item_link in feed_items.map { |item| item.link }
571
+ if item_link.nil?
572
+ FeedItem.delete_all("feed_id = '#{self.id}'")
573
+ break
574
+ end
575
+ end
576
+
577
+ # parse the feed items
578
+ @feed_items_unsorted = []
579
+ if items != nil
580
+ for item_node in items
581
+ @feed_items_unsorted << handle_feed_item(item_node.to_s)
582
+ end
583
+ end
584
+ return self
585
+ end
586
+
587
+ # Locates the feed item in the database based on the supplied item xml data.
588
+ def find_feed_item_by_data(item_data)
589
+ item_node = Document.new(item_data).root
590
+
591
+ # get the link
592
+ item_link = XPath.first(item_node, "link[@rel='alternate']/@href").to_s
593
+ if item_link == ""
594
+ item_link = XPath.first(item_node, "link/@href").to_s
595
+ end
596
+ if item_link == ""
597
+ item_link = XPath.first(item_node, "link/text()").to_s
598
+ end
599
+ if item_link == ""
600
+ item_link = XPath.first(item_node, "@rdf:about").to_s
601
+ end
602
+ if item_link == ""
603
+ item_link = XPath.first(item_node, "guid/text()").to_s
604
+ end
605
+ item_title = XPath.first(item_node, "title/text()").to_s
606
+
607
+ feed_item = FeedItem.find_by_feed_id_and_link(self.id, item_link)
608
+ unless feed_item.nil?
609
+ # Some blogging tools alter the title of an item when the number of comments change (for
610
+ # example, TextPattern) and many email feed dumps use the same link for multiple
611
+ # items (for example, GMail). We try to take both of these cases into account here.
612
+ existing_title = feed_item.title
613
+ item_title = item_title.gsub(/\[\d*\]/,"").strip
614
+ existing_title = existing_title.gsub(/\[\d*\]/,"").strip
615
+ item_title = item_title.gsub(/\(\d*\)/,"").strip
616
+ existing_title = existing_title.gsub(/\(\d*\)/,"").strip
617
+ item_title = item_title.gsub(/\{\d*\}/,"").strip
618
+ existing_title = existing_title.gsub(/\{\d*\}/,"").strip
619
+ if existing_title != item_title
620
+ feed_item = nil
621
+ end
622
+ end
623
+ return feed_item
624
+ end
625
+
626
+ def handle_feed_item(item_data)
627
+ feed_item = find_feed_item_by_data(item_data)
628
+ if feed_item.nil?
629
+ feed_item = FeedItem.new
630
+ end
631
+ feed_item.feed = self
632
+ feed_item.parse_item(item_data)
633
+ return feed_item
634
+ end
635
+
636
+ def build_feed_hook(feed_type, version, xml_builder)
637
+ return nil
638
+ end
639
+
640
+ def build_feed(feed_type, version=0.0, xml_builder=Builder::XmlMarkup.new(:indent => 2))
641
+ if feed_type == "rss" && version == 0.0
642
+ version = 1.0
643
+ elsif feed_type == "atom" && version == 0.0
644
+ version = 0.3
645
+ end
646
+ if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1)
647
+ # RDF-based rss format
648
+ return xml_builder.tag!("rdf:RDF") do
649
+ xml_builder.channel("rdf:about" => CGI.escapeHTML(link)) do
650
+ unless title.nil? || title == ""
651
+ xml_builder.title(title)
652
+ else
653
+ xml_builder.title
654
+ end
655
+ unless link.nil? || link == ""
656
+ xml_builder.link(link)
657
+ else
658
+ xml_builder.link
659
+ end
660
+ unless image_link.nil? || image_link == ""
661
+ xml_builder.image("rdf:resource" => CGI.escapeHTML(image_link))
662
+ end
663
+ unless description.nil? || description == ""
664
+ xml_builder.description(description)
665
+ else
666
+ xml_builder.description
667
+ end
668
+ unless language.nil? || language == ""
669
+ xml_builder.tag!("dc:language", language)
670
+ end
671
+ xml_builder.tag!("syn:updatePeriod", "hourly")
672
+ xml_builder.tag!("syn:updateFrequency", (time_to_live / 1.hour).to_s)
673
+ xml_builder.tag!("syn:updateBase", Time.mktime(1970).iso8601)
674
+ xml_builder.items do
675
+ xml_builder.tag!("rdf:Seq") do
676
+ unless feed_items.nil?
677
+ for item in feed_items
678
+ if item.link.nil?
679
+ raise "Cannot generate an rdf-based feed with a nil item link field."
680
+ end
681
+ xml_builder.tag!("rdf:li", "rdf:resource" => CGI.escapeHTML(item.link))
682
+ end
683
+ end
684
+ end
685
+ end
686
+ build_feed_hook(feed_type, version, xml_builder)
687
+ end
688
+ unless image_link.nil? || image_link == ""
689
+ xml_builder.image("rdf:about" => CGI.escapeHTML(image_link)) do
690
+ unless title.nil? || title == ""
691
+ xml_builder.title(title)
692
+ else
693
+ xml_builder.title
694
+ end
695
+ unless image_link.nil? || image_link == ""
696
+ xml_builder.url(image_link)
697
+ end
698
+ unless link.nil? || link == ""
699
+ xml_builder.link(link)
700
+ else
701
+ xml_builder.link
702
+ end
703
+ end
704
+ end
705
+ unless feed_items.nil?
706
+ for item in feed_items
707
+ item.build_feed_item(feed_type, version, xml_builder)
708
+ end
709
+ end
710
+ end
711
+ elsif feed_type == "rss"
712
+ # normal rss format
713
+ return xml_builder.rss("version" => version.to_s) do
714
+ unless title.nil? || title == ""
715
+ xml_builder.title(title)
716
+ end
717
+ unless link.nil? || link == ""
718
+ xml_builder.link(link)
719
+ end
720
+ unless description.nil? || description == ""
721
+ xml_builder.description(description)
722
+ end
723
+ xml_builder.ttl((time_to_live / 1.minute).to_s)
724
+ xml_builder.generator("http://www.sporkmonger.com/projects/feedtools")
725
+ build_feed_hook(feed_type, version, xml_builder)
726
+ unless feed_items.nil?
727
+ for item in feed_items
728
+ item.build_feed_item(feed_type, version, xml_builder)
729
+ end
730
+ end
731
+ end
732
+ elsif feed_type == "atom"
733
+ # normal atom format
734
+ return xml_builder.feed("xmlns" => "http://purl.org/atom/ns#",
735
+ "version" => version.to_s,
736
+ "xml:lang" => language) do
737
+ unless title.nil? || title == ""
738
+ xml_builder.title(title,
739
+ "mode" => "escaped",
740
+ "type" => "text/html")
741
+ end
742
+ unless link.nil? || link == ""
743
+ xml_builder.link("href" => link,
744
+ "rel" => "alternate",
745
+ "type" => "text/html",
746
+ "title" => title)
747
+ end
748
+ unless description.nil? || description == ""
749
+ xml_builder.tagline(description,
750
+ "mode" => "escaped",
751
+ "type" => "text/html")
752
+ end
753
+ xml_builder.generator("FeedTools",
754
+ "url" => "http://www.sporkmonger.com/projects/feedtools")
755
+ build_feed_hook(feed_type, version, xml_builder)
756
+ unless feed_items.nil?
757
+ for item in feed_items
758
+ item.build_feed_item(feed_type, version, xml_builder)
759
+ end
760
+ end
761
+ end
762
+ end
763
+ end
764
+
765
+ # Saves the current state of the feed to the database unless the feed lacks a remote location
766
+ def save
767
+ unless url.nil? || url == ""
768
+ super
769
+ end
770
+ end
771
+ end
772
+
773
+ class FeedItem < ActiveRecord::Base
774
+ include REXML
775
+
776
+ # Verifies that the required fields exist; additional ones added by the user are fine
777
+ def FeedItem.table_exists?
778
+ begin
779
+ connection.execute "select id, feed_id, link, title, author, description, " +
780
+ "time, tags from feed_items limit 1"
781
+ rescue ActiveRecord::StatementInvalid
782
+ return false
783
+ end
784
+ return true
785
+ end
786
+
787
+ def feed
788
+ if @feed != nil
789
+ return @feed
790
+ elsif @feed_id != nil
791
+ @feed = Feed.find_by_id(self.feed_id)
792
+ return @feed
793
+ else
794
+ return nil
795
+ end
796
+ end
797
+
798
    # Associates this item with +new_feed+: stores the feed's id in the
    # feed_id attribute and keeps the feed object itself in @feed so that
    # the feed reader can return it without a lookup.
    def feed=(new_feed)
      self.feed_id = new_feed.id
      @feed = new_feed
    end
802
+
803
+ def title
804
+ return (self['title'] or "Untitled Entry")
805
+ end
806
+
807
+ def tag_list
808
+ return tags.nil? ? nil : tags[1..-2].split("|")
809
+ end
810
+
811
+ def tag_list=(new_tag_list)
812
+ self.tags = "|" + (new_tag_list.map { |x| x.strip }).join("|") + "|"
813
+ end
814
+
815
+ def tag_string
816
+ return (tags.nil? ? nil : tags[1..-2]).split("|").join(", ")
817
+ end
818
+
819
+ def tag_string=(new_tag_string)
820
+ self.tags = "|" + (new_tag_string.split(",").map { |x| x.strip }).join("|") + "|"
821
+ end
822
+
823
+ def parse_feed_item_hook(item_data)
824
+ return nil
825
+ end
826
+
827
+ def parse_item(item_data)
828
+ item_node = Document.new(item_data).root
829
+
830
+ # get the feed base, in case the feed items use relative paths
831
+ base = feed.link
832
+
833
+ # get the link
834
+ link = XPath.first(item_node, "link[@rel='alternate']/@href").to_s
835
+ if link == ""
836
+ link = XPath.first(item_node, "link/@href").to_s
837
+ end
838
+ if link == ""
839
+ link = XPath.first(item_node, "link/text()").to_s
840
+ end
841
+ if link == ""
842
+ link = XPath.first(item_node, "@rdf:about").to_s
843
+ end
844
+ if link == ""
845
+ link = XPath.first(item_node, "guid/text()").to_s
846
+ end
847
+ if link != ""
848
+ link = CGI.unescapeHTML(link)
849
+ end
850
+ if link != "" && (link =~ /http:\/\//) != 0 && (link =~ /https:\/\//) != 0
851
+ # ensure that we don't end up with 'http://www.foobar.com//path/to/entry'
852
+ # future-proofed this so that it doesn't break when Ruby 1.9/2.0 starts
853
+ # returning single character Strings instead of FixNums
854
+ if (base[-1] == 47 && link[0] == 47) || (base[-1] == "/" && link[0] == "/")
855
+ link = link[1..-1]
856
+ end
857
+ # prepend the base to the link since they seem to have used a relative path
858
+ link = base + link
859
+ end
860
+
861
+ title = XPath.first(item_node, "title/text()").to_s
862
+ if title != ""
863
+ # some blogging tools (notably TextPattern I believe) include the number of
864
+ # comments in a post in the title... this is ugly, so we're gonna strip them out
865
+ title = title.gsub(/\[\d*\]/,"").strip
866
+ end
867
+
868
+ # get the item author
869
+ author = CGI.unescapeHTML(XPath.first(item_node, "author/name/text()").to_s)
870
+ if author == ""
871
+ author = CGI.unescapeHTML(XPath.first(item_node, "dc:creator/text()").to_s)
872
+ end
873
+ if author == ""
874
+ author = CGI.unescapeHTML(XPath.first(item_node, "author/text()").to_s)
875
+ end
876
+
877
+ # get the item content
878
+ description = ""
879
+ body = XPath.first(item_node, "xhtml:body")
880
+ if body == nil
881
+ body = XPath.first(item_node, "body")
882
+ end
883
+ if body != nil
884
+ description = body.inner_xml
885
+ end
886
+ if description == ""
887
+ description = CGI.unescapeHTML(XPath.first(item_node, "content:encoded/text()").to_s)
888
+ end
889
+ if description == ""
890
+ description = XPath.first(item_node, "description/text()").to_s
891
+ if description != ""
892
+ if XPath.first(item_node, "description/@encoding").to_s != ""
893
+ description = "[Embedded data objects are not supported.]"
894
+ else
895
+ description = CGI.unescapeHTML(description)
896
+ end
897
+ end
898
+ end
899
+ if description == ""
900
+ description = XPath.first(item_node,"content/text()").to_s
901
+ if description != "" && XPath.first(item_node, "content/@mode").to_s == "escaped"
902
+ description = CGI.unescapeHTML(description)
903
+ end
904
+ end
905
+
906
+ # get the item time
907
+ time = XPath.first(item_node, "pubDate/text()").to_s
908
+ if time == ""
909
+ time = XPath.first(item_node, "dc:date/text()").to_s
910
+ end
911
+ if time == ""
912
+ time = XPath.first(item_node, "issued/text()").to_s
913
+ end
914
+
915
+ # get the item tags
916
+ tags_array = []
917
+ if tags_array == nil || tags_array.size == 0
918
+ tags_array = []
919
+ tag_list = XPath.match(item_node, "dc:subject/rdf:Bag/rdf:li/text()")
920
+ if tag_list.size > 1
921
+ for tag in tag_list
922
+ tags_array << tag.to_s.downcase.strip
923
+ end
924
+ end
925
+ end
926
+ if tags_array == nil || tags_array.size == 0
927
+ tags_array = []
928
+ tag_list = XPath.match(item_node, "category/text()")
929
+ if tag_list.size > 1
930
+ for tag in tag_list
931
+ tags_array << tag.to_s.downcase.strip
932
+ end
933
+ end
934
+ end
935
+ if tags_array == nil || tags_array.size == 0
936
+ tags_array = []
937
+ tag_list = XPath.match(item_node, "dc:subject/text()")
938
+ if tag_list.size > 1
939
+ for tag in tag_list
940
+ tags_array << tag.to_s.downcase.strip
941
+ end
942
+ end
943
+ end
944
+ if tags_array == nil || tags_array.size == 0
945
+ tags_array = XPath.first(item_node,
946
+ "category/text()").to_s.downcase.split(" ")
947
+ end
948
+ if tags_array == nil || tags_array.size == 0
949
+ begin
950
+ tags_array = XPath.first(item_node,
951
+ "dc:subject/text()").to_s.downcase.split(" ")
952
+ rescue
953
+ tags_array = []
954
+ end
955
+ end
956
+ if tags_array == nil || tags_array.size == 0
957
+ tags_array = []
958
+ rdf_bag = XPath.match(item_node,
959
+ "taxo:topics/rdf:Bag/rdf:li")
960
+ if rdf_bag != nil && rdf_bag.size > 0
961
+ for tag_node in rdf_bag
962
+ begin
963
+ tag_url = XPath.first(tag_node, "@resource").to_s
964
+ tag_match = tag_url.scan(/\/(tag|tags)\/(\w+)/)
965
+ if tag_match.size > 0
966
+ tags_array << tag_match.first.last.downcase.strip
967
+ end
968
+ rescue
969
+ end
970
+ end
971
+ end
972
+ end
973
+
974
+ # set all of the properties
975
+ if link != ""
976
+ self.link = link
977
+ else
978
+ self.link = nil
979
+ end
980
+ if title != ""
981
+ self.title = title
982
+ end
983
+ if description != ""
984
+ self.description = description.strip
985
+ end
986
+ if time != ""
987
+ self.time = Time.parse(time) rescue Time.now
988
+ elsif @time == nil
989
+ self.time = Time.now
990
+ end
991
+ if tags_array.size > 0
992
+ self.tag_list = tags_array
993
+ end
994
+ parse_feed_item_hook(item_data)
995
+ if Feed.cache_enabled?
996
+ save
997
+ end
998
+ return self
999
+ end
1000
+
1001
# Hook taking the same three arguments as build_feed_item. The default
# implementation does nothing and returns nil.
# NOTE(review): presumably intended to be overridden to emit additional
# elements through xml_builder -- confirm against callers.
def build_feed_item_hook(feed_type, version, xml_builder)
  nil
end
1003
+
1004
# Writes this feed item to +xml_builder+ and returns the result of the
# outermost builder call.
#
# feed_type::   "rss" or "atom". Any other value writes nothing and
#               returns nil.
# version::     when feed_type is "rss" and version equals 0.9, 1.0 or 1.1
#               the RDF-based layout is used; every other "rss" version uses
#               the plain layout. Not consulted for "atom" here.
# xml_builder:: receiver of the element calls; defaults to a new
#               Builder::XmlMarkup with :indent => 2.
#
# Raises RuntimeError when the RDF-based layout is requested and link is nil.
#
# NOTE(review): build_feed_item_hook was defined next to this method but was
# never invoked from it; it is now called as the last step inside each
# item/entry block. Confirm nothing else already invokes it.
def build_feed_item(feed_type, version, xml_builder=Builder::XmlMarkup.new(:indent => 2))
  if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1)
    # RDF-based rss format
    if link.nil?
      raise "Cannot generate an rdf-based feed item with a nil link field."
    end
    return xml_builder.item("rdf:about" => CGI.escapeHTML(link)) do
      # This layout always writes title, link and description, falling back
      # to an empty element when the value is missing.
      if title.nil? || title == ""
        xml_builder.title
      else
        xml_builder.title(title)
      end
      if link.nil? || link == ""
        xml_builder.link
      else
        xml_builder.link(link)
      end
      if description.nil? || description == ""
        xml_builder.description
      else
        xml_builder.description(description)
      end
      xml_builder.tag!("dc:date", time.iso8601) unless time.nil?
      build_feed_item_tag_bag(xml_builder)
      build_feed_item_hook(feed_type, version, xml_builder)
    end
  elsif feed_type == "rss"
    # normal rss format: missing values are simply left out
    return xml_builder.item do
      xml_builder.title(title) unless title.nil? || title == ""
      xml_builder.link(link) unless link.nil? || link == ""
      xml_builder.description(description) unless description.nil? || description == ""
      xml_builder.pubDate(time.rfc822) unless time.nil?
      build_feed_item_tag_bag(xml_builder)
      build_feed_item_hook(feed_type, version, xml_builder)
    end
  elsif feed_type == "atom"
    # normal atom format
    return xml_builder.entry("xmlns" => "http://purl.org/atom/ns#") do
      has_title = !(title.nil? || title == "")
      if has_title
        xml_builder.title(title,
          "mode" => "escaped",
          "type" => "text/html")
      end
      unless link.nil? || link == ""
        link_attributes = {
          "href" => link,
          "rel" => "alternate",
          "type" => "text/html"
        }
        # Only advertise a link title when there is one; a nil title used to
        # be passed straight through as an attribute value.
        link_attributes["title"] = title if has_title
        xml_builder.link(link_attributes)
      end
      unless description.nil? || description == ""
        xml_builder.content(description,
          "mode" => "escaped",
          "type" => "text/html")
      end
      xml_builder.issued(time.iso8601) unless time.nil?
      unless tags.nil? || tag_list.nil?
        tag_list.each { |tag| xml_builder.category(tag) }
      end
      build_feed_item_hook(feed_type, version, xml_builder)
    end
  end
end

# Writes the item's tags as dc:subject/rdf:Bag/rdf:li elements; writes
# nothing when tags or tag_list is nil (a nil tag_list used to raise).
def build_feed_item_tag_bag(xml_builder)
  return if tags.nil? || tag_list.nil?
  xml_builder.tag!("dc:subject") do
    xml_builder.tag!("rdf:Bag") do
      tag_list.each { |tag| xml_builder.tag!("rdf:li", tag) }
    end
  end
end
private :build_feed_item_tag_bag
1094
+
1095
# Persists the feed item by delegating to the inherited save, but only when
# the item belongs to a feed that has a non-empty url. Returns nil without
# saving otherwise.
def save
  owner = feed
  return if owner.nil?
  location = owner.url
  return if location.nil? || location == ""
  super
end
1102
+ end
1103
+ end
1104
+
1105
module REXML
  class Element
    # Small REXML extension used when parsing xhtml feed items: returns the
    # concatenated to_s output of every child node, or "" when the element
    # has no children.
    def inner_xml
      children.inject("") { |markup, node| markup << node.to_s }
    end
  end
end
1117
+
1118
# Runs when the file is loaded: prepares the cache connection and creates the
# cache if it does not exist yet. Any StandardError is deliberately ignored.
begin
  FeedTools::Feed.prepare_connection
  FeedTools::Feed.create_cache unless FeedTools::Feed.cache_exists?
rescue
  # Nothing can be done until someone sets up the database connection.
  # We'll just assume for now that the user will take care of that.
end