feedtools 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ == FeedTools 0.1.0
2
+ * basic support for rss, atom, cdf
3
+ * basic caching using active record
4
+ * support for etags
data/README ADDED
@@ -0,0 +1,13 @@
1
+ FeedTools was designed to be a simple XML feed parser, generator, and translator with a built-in
2
+ caching system.
3
+
4
+ == Example
5
+ slashdot_feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
6
+ slashdot_feed.title
7
+ => "Slashdot"
8
+ slashdot_feed.description
9
+ => "News for nerds, stuff that matters"
10
+ slashdot_feed.link
11
+ => "http://slashdot.org/"
12
+ slashdot_feed.items.first.find_node("slash:hitparade/text()").to_s
13
+ => "43,37,28,23,11,3,1"
@@ -0,0 +1,30 @@
1
require 'rbconfig'
require 'find'
require 'fileutils'

# Installs the FeedTools library files into the Ruby site library directory.
# This was adapted from rdoc's install.rb by way of Log4r.

# Older Rubies expose the build configuration as Config::CONFIG, newer ones
# as RbConfig::CONFIG; use whichever one is defined.
ruby_config = defined?(RbConfig) ? RbConfig::CONFIG : Config::CONFIG

$sitedir = ruby_config["sitelibdir"]
unless $sitedir
  version = ruby_config["MAJOR"] + "." + ruby_config["MINOR"]
  $libdir = File.join(ruby_config["libdir"], "ruby", version)
  $sitedir = $:.find { |x| x =~ /site_ruby/ }
  if !$sitedir
    $sitedir = File.join($libdir, "site_ruby")
  elsif !$sitedir.include?(version)
    # A plain substring test: matching a String against a String with !~
    # raises a TypeError instead of answering the question.
    $sitedir = File.join($sitedir, version)
  end
end

# the actual gruntwork
Dir.chdir("lib")

Find.find("feed_tools", "feed_tools.rb") do |f|
  destination = File.join($sitedir, *f.split(/\//))
  if f[-3..-1] == ".rb"
    # Ruby source files are copied with mode 0644.
    FileUtils.install(f, destination, :mode => 0644, :verbose => true)
  else
    # Everything else found is treated as a directory to create.
    FileUtils.mkdir_p(destination)
  end
end
@@ -0,0 +1,1126 @@
1
+ #--
2
+ # Copyright (c) 2005 Robert Aman
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #++
23
+
24
+ FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] || ENV['RAILS_ENV'] || 'production'
25
+
26
+ $:.unshift(File.dirname(__FILE__))
27
+ $:.unshift(File.dirname(__FILE__) + "/../../activerecord/lib")
28
+
29
+ begin
30
+ require 'active_record'
31
+ rescue LoadError
32
+ require 'rubygems'
33
+ require_gem 'activerecord'
34
+ end
35
+
36
+ begin
37
+ require 'rubygems'
38
+ require 'builder'
39
+ rescue LoadError
40
+ # RubyGems is not available, use included Builder
41
+ $:.unshift(File.dirname(__FILE__) + "/feed_tools/vendor")
42
+ require 'feed_tools/vendor/builder'
43
+ end
44
+
45
+ require 'open-uri'
46
+ require 'time'
47
+ require 'rexml/document'
48
+ require 'yaml'
49
+ require 'cgi'
50
+
51
+ module FeedTools
52
+ class Feed < ActiveRecord::Base
53
+ include REXML
54
+
55
+ has_many :feed_items_unsorted, :class_name => "FeedItem"
56
+
57
# Builds a new feed object that is not marked as live and has no cached
# item list.
#
# Any arguments (and block) are forwarded unchanged to the superclass
# constructor, so both Feed.new and Feed.new(attributes) work.
def initialize(*args)
  @live = false
  @feed_items_unsorted = nil
  super
end
62
+
63
# Loads the feed specified by the url, pulling the data from the cache if it hasn't expired.
#
# Be aware that this method translates from the feed: and rss: pseudo-protocols to the
# http: protocol as needed. This means that if you pass in a feed url that looks like
# 'feed://www.anywhere.com/feed.xml' it will end up being stored in the cache as
# 'http://www.anywhere.com/feed.xml' instead. This does affect the usage of methods like
# find_by_url, but otherwise should be fairly transparent.
#
# Only a pseudo-protocol at the very start of the url is rewritten, and an explicit
# https scheme after the pseudo-protocol is kept ('feed:https://host/' => 'https://host/').
#
# url:: the location of the feed
#
# Returns the cached or freshly loaded Feed.
def self.open(url)
  # Handles all of: feed://http:///, feed://http://, feed:http:///, feed:http://,
  # feed:///, feed://, feed:/ and feed: (and the same forms for rss:).
  url = url.sub(/\A(?:feed|rss):(?:(?:\/\/)?(https?):\/\/\/?|\/{0,3})/) do
    "#{$1 || 'http'}://"
  end

  feed = nil
  begin
    feed = Feed.find_by_url(url)
  rescue ActiveRecord::StatementInvalid
    # make sure that the necessary tables are present and recover if possible
    FeedTools::Feed.prepare_connection
    FeedTools::Feed.create_cache unless FeedTools::Feed.cache_exists?
    feed = Feed.find_by_url(url)
  end

  if feed.nil?
    # Nothing cached for this url yet: build a new record and fetch it.
    feed = Feed.new
    feed.url = url
    feed.load_remote_feed
  else
    feed.update_if_needed
  end
  feed
end
112
+
113
# Reloads the feed from its remote location, but only when expired? says
# the cached copy is stale. Returns nil when no reload was needed.
def update_if_needed
  load_remote_feed if expired?
end
119
+
120
# Reports whether both cache tables (feeds and feed items) are present
# with the required structure.
def self.cache_exists?
  [Feed, FeedItem].all? { |model| model.table_exists? }
end
124
+
125
# Checks that the feeds table has every required column by selecting them
# all; columns added by the user do not matter.
#
# Returns true when the probe query runs, false when it raises
# ActiveRecord::StatementInvalid.
def self.table_exists?
  probe_sql = "select id, url, link, image_link, title, description, " +
    "tags, last_updated, etag, time_to_live from feeds limit 1"
  connection.execute probe_sql
  true
rescue ActiveRecord::StatementInvalid
  false
end
135
+
136
# Generates the table structure if necessary.
#
# Does nothing when both cache tables already exist. Otherwise each missing
# table is created with the statement matching the configured adapter
# ("mysql", "sqlite" or "postgresql"). The feeds table is created before the
# feed_items table because the PostgreSQL feed_items definition references it.
#
# Raises a RuntimeError when no statement is known for the configured adapter.
def self.create_cache
  return if Feed.cache_exists?

  adapter = configurations["adapter"]

  feeds_mysql = <<-SQL_END
    CREATE TABLE `feeds` (
      `id` int(6) unsigned NOT NULL auto_increment,
      `url` varchar(255) NOT NULL default '',
      `link` varchar(255) NOT NULL default '',
      `image_link` varchar(255) default NULL,
      `title` varchar(255) default NULL,
      `description` text default NULL,
      `tags` varchar(255) default NULL,
      `last_updated` datetime default NULL,
      `etag` varchar(255) default NULL,
      `time_to_live` int(4) default NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
  SQL_END
  feeds_sqlite = <<-SQL_END
    CREATE TABLE 'feeds' (
      'id' INTEGER PRIMARY KEY NOT NULL,
      'url' VARCHAR(255) DEFAULT NULL,
      'link' VARCHAR(255) DEFAULT NULL,
      'image_link' VARCHAR(255) DEFAULT NULL,
      'title' VARCHAR(255) DEFAULT NULL,
      'description' TEXT DEFAULT NULL,
      'tags' VARCHAR(255) DEFAULT NULL,
      'last_updated' DATETIME DEFAULT NULL,
      'etag' VARCHAR(255) DEFAULT NULL,
      'time_to_live' INTEGER DEFAULT NULL
    );
  SQL_END
  # PostgreSQL has no "datetime" type; "timestamp" is its equivalent.
  feeds_psql = <<-SQL_END
    CREATE TABLE feeds (
      id SERIAL PRIMARY KEY NOT NULL,
      url varchar(255) default NULL,
      link varchar(255) default NULL,
      image_link varchar(255) default NULL,
      title varchar(255) default NULL,
      description text default NULL,
      tags varchar(255) default NULL,
      last_updated timestamp default NULL,
      etag varchar(255) default NULL,
      time_to_live int default NULL
    );
  SQL_END

  feed_items_mysql = <<-SQL_END
    CREATE TABLE `feed_items` (
      `id` int(6) unsigned NOT NULL auto_increment,
      `feed_id` int(6) unsigned NOT NULL default '0',
      `link` varchar(255) default NULL,
      `title` varchar(255) default NULL,
      `author` varchar(255) default NULL,
      `description` text default NULL,
      `time` datetime NOT NULL default '0000-00-00 00:00:00',
      `tags` varchar(255) default NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
  SQL_END
  feed_items_sqlite = <<-SQL_END
    CREATE TABLE 'feed_items' (
      'id' INTEGER PRIMARY KEY NOT NULL,
      'feed_id' INTEGER NOT NULL,
      'link' VARCHAR(255) DEFAULT NULL,
      'title' VARCHAR(255) DEFAULT NULL,
      'author' VARCHAR(255) DEFAULT NULL,
      'description' TEXT DEFAULT NULL,
      'time' DATETIME DEFAULT NULL,
      'tags' VARCHAR(255) DEFAULT NULL
    );
  SQL_END
  feed_items_psql = <<-SQL_END
    CREATE TABLE feed_items (
      id SERIAL PRIMARY KEY NOT NULL,
      feed_id int REFERENCES feeds,
      link varchar(255) default NULL,
      title varchar(255) default NULL,
      author varchar(255) default NULL,
      description text default NULL,
      time timestamp default NULL,
      tags varchar(255) default NULL
    );
  SQL_END

  feeds_sql = {
    "mysql" => feeds_mysql,
    "sqlite" => feeds_sqlite,
    "postgresql" => feeds_psql
  }
  feed_items_sql = {
    "mysql" => feed_items_mysql,
    "sqlite" => feed_items_sqlite,
    "postgresql" => feed_items_psql
  }

  unless Feed.table_exists?
    table_creation_sql = feeds_sql[adapter]
    raise "Could not build feeds table." if table_creation_sql.nil?
    connection.execute table_creation_sql
  end
  unless FeedItem.table_exists?
    table_creation_sql = feed_items_sql[adapter]
    raise "Could not build feed_items table." if table_creation_sql.nil?
    connection.execute table_creation_sql
  end
end
251
+
252
# Empties the cache completely: every feed item is deleted first, then
# every feed.
#
# This is a dangerous operation if the cache tables hold anything beyond
# plain cached feeds, since nothing is spared.
def self.clear_cache
  FeedItem.delete_all
  Feed.delete_all
end
259
+
260
# Deletes every cached feed item and clears last_updated on all feeds,
# leaving the feed rows themselves in place.
#
# A gentler alternative to clear_cache.
def self.expire_cache
  FeedItem.delete_all
  Feed.update_all("last_updated = NULL")
end
266
+
267
# Deletes every feed item whose time is older than purge_time seconds
# before now.
#
# purge_time:: maximum age, in seconds, of the items to keep
def self.purge_cache(purge_time=1.week)
  cutoff = Time.now - purge_time
  FeedItem.delete_all("time < '#{cutoff.strftime("%Y-%m-%d %H:%M:%S")}'")
end
272
+
273
# If ActiveRecord is not already connected, attempts to find a configuration file and use
# it to open a connection for ActiveRecord.
#
# The files ./config/database.yml and ./database.yml are tried in that order. If the
# loaded configuration has an entry for FEED_TOOLS_ENV, that entry is used; otherwise the
# whole file is taken as the connection settings.
#
# This method is probably unnecessary for anything but testing and debugging purposes.
#
# Returns the ActiveRecord connection. Raises a RuntimeError naming the searched paths
# when no configuration file can be found.
def self.prepare_connection
  ActiveRecord::Base.connection
rescue
  possible_config_files = [
    "./config/database.yml",
    "./database.yml"
  ]
  database_config_file = possible_config_files.find { |file| File.exist?(file) }
  if database_config_file.nil?
    raise "Could not find a database configuration file; looked in: " +
      possible_config_files.join(", ")
  end

  database_config_hash = File.open(database_config_file) { |file| YAML::load(file) }
  unless database_config_hash[FEED_TOOLS_ENV].nil?
    database_config_hash = database_config_hash[FEED_TOOLS_ENV]
  end

  ActiveRecord::Base.configurations = database_config_hash
  ActiveRecord::Base.establish_connection(database_config_hash)
  ActiveRecord::Base.connection
end
303
+
304
# Tells whether parsed feeds are saved to the cache. Always true here.
def self.cache_enabled?
  true
end
307
+
308
# The stored title, or "Untitled Feed" when none is stored.
def title
  self["title"] || "Untitled Feed"
end
311
+
312
# Optional feed attribute.
# Falls back to "en-US" when the language attribute is empty or when reading
# it raises, which is what happens if the table has no language field.
def language
  self["language"] || "en-US"
rescue
  "en-US"
end
322
+
323
# True once load_remote_feed has run on this object, false otherwise.
# Always answers with a strict boolean.
def live?
  @live ? true : false
end
330
+
331
# A feed is expired when it has never been updated, or when its last
# update plus its time to live lies in the past.
def expired?
  return true if last_updated == nil
  (last_updated + time_to_live) < Time.now
end
334
+
335
# Forces this feed to expire: its cached items are deleted, the in-memory
# item list is dropped, and last_updated is pushed back to 1980 before
# the record is saved.
def expire
  FeedItem.delete_all("feed_id = '#{self.id}'")
  @feed_items_unsorted = nil
  self.last_updated = Time.mktime(1980)
  save
end
342
+
343
# The amount of time in seconds between the last time the feed was updated and the next
# valid time to retrieve a remote feed.
# The stored value is a number of hours; one hour is used when nothing is stored.
def time_to_live
  stored_hours = self['time_to_live']
  stored_hours.nil? ? 1.hour : stored_hours.hour
end
348
+
349
# The tags as an array, or nil when no tags are stored.
# The stored form is "|a|b|"; the outer pipes are dropped before splitting.
def tag_list
  return nil if tags.nil?
  tags[1..-2].split("|")
end
352
+
353
# Stores the given tags in the pipe-delimited form "|a|b|", stripping
# surrounding whitespace from each one.
def tag_list=(new_tag_list)
  stripped = new_tag_list.map { |tag| tag.strip }
  self.tags = "|" + stripped.join("|") + "|"
end
356
+
357
# The tags as a single comma-separated string ("a, b"), or nil when no
# tags are stored, matching what tag_list returns in that case.
def tag_string
  return nil if tags.nil?
  tags[1..-2].split("|").join(", ")
end
360
+
361
# Parses a comma-separated tag string and stores it in the pipe-delimited
# form "|a|b|", stripping whitespace around each tag.
def tag_string=(new_tag_string)
  pieces = new_tag_string.split(",").map { |tag| tag.strip }
  self.tags = "|" + pieces.join("|") + "|"
end
364
+
365
# Returns the feed items ordered by time, newest first.
#
# The unsorted list is read from the association on first use and kept in
# memory. If anything goes wrong (for example the items cannot be
# compared), the unsorted list is returned instead.
def feed_items
  @feed_items_unsorted = feed_items_unsorted if @feed_items_unsorted.nil?
  @feed_items_unsorted.sort { |a, b| b.time <=> a.time }
rescue
  @feed_items_unsorted.nil? ? feed_items_unsorted : @feed_items_unsorted
end
382
+
383
# Attempts to load the feed from the remote location. Requires the url to be set.
#
# Marks the feed as live and stamps last_updated before fetching. If an etag
# is stored it is sent as an If-None-Match header, and the etag of the
# response is stored on the feed so the next request can use it.
#
# When the server answers 304 Not Modified the cached content is kept and
# only the new last_updated time is saved. Any other HTTP error is re-raised.
#
# The url is always fetched through open-uri, so it is never interpreted
# as a local path or a pipe command.
#
# Returns the feed itself.
def load_remote_feed
  @live = true
  self.last_updated = Time.now

  request_headers = {}
  request_headers["If-None-Match"] = etag unless etag.nil?

  begin
    OpenURI.open_uri(url, request_headers) do |http|
      self.etag = http.meta['etag']
      parse_feed(http.read)
    end
  rescue OpenURI::HTTPError => error
    # open-uri reports 304 as an error; it only means the cache is still valid.
    not_modified = error.io.respond_to?(:status) && error.io.status.first == "304"
    raise unless not_modified
    save if Feed.cache_enabled?
    self
  end
end
405
+
406
# Called by parse_feed with the raw feed data after the standard attributes
# have been set and before the feed is saved. Does nothing and returns nil.
def parse_feed_hook(feed_data)
  nil
end
409
+
410
# Parses raw feed XML and copies what it finds onto this feed.
#
# Sets title, link, description, image_link and time_to_live (each becomes
# nil when the feed does not provide it), calls parse_feed_hook, saves the
# feed when the cache is enabled, and rebuilds the in-memory item list by
# passing every <item> (or <entry>) element to handle_feed_item.
#
# feed_data:: the feed document as a string
#
# Returns the feed itself.
def parse_feed(feed_data)
  root_node = Document.new(feed_data).root
  # Metadata lives in <channel> when there is one, otherwise on the root element.
  metadata_node = XPath.first(root_node, "channel") || root_node

  # String value of the first path that yields a non-empty result, or "".
  first_value = lambda do |paths|
    found = ""
    paths.each do |path|
      found = XPath.first(metadata_node, path).to_s
      break unless found == ""
    end
    found
  end

  # Text of the named child element, HTML-unescaped when its mode is "escaped".
  mode_aware_text = lambda do |element_name|
    text = XPath.first(metadata_node, "#{element_name}/text()").to_s
    if text != "" && XPath.first(metadata_node, "#{element_name}/@mode").to_s == "escaped"
      text = CGI.unescapeHTML(text)
    end
    text
  end

  blank_to_nil = lambda { |value| value == "" ? nil : value }

  # get the feed title
  feed_title = mode_aware_text.call("title")

  # get the feed link
  #
  # The ordering of the last two entries is somewhat incorrect, but the more correct
  # ordering would introduce much more serious problems. (The completely correct
  # implementation would require a vestigial 'base' method on the Feed class to fully
  # support CDF files.) The assumption this ordering makes is that the 'base' attribute
  # points to a valid location, hopefully the same as the 'href' location.
  feed_link = first_value.call([
    "link[@rel='alternate' and @type='text/html']/@href",
    "link[@rel='alternate']/@href",
    "link/@href",
    "link/text()",
    "@base",
    "@href"
  ])

  # get the feed description
  feed_description = XPath.first(metadata_node, "description/text()").to_s
  if feed_description != ""
    if XPath.first(metadata_node, "description/@encoding").to_s != ""
      feed_description = "[Embedded data objects are not supported.]"
    else
      feed_description = CGI.unescapeHTML(feed_description)
    end
  end
  feed_description = mode_aware_text.call("tagline") if feed_description == ""
  # <info> is only consulted when there is no <tagline> element at all.
  if feed_description == "" && XPath.first(metadata_node, "tagline").nil?
    feed_description = mode_aware_text.call("info")
  end
  if feed_description == ""
    feed_description = CGI.unescapeHTML(XPath.first(metadata_node, "abstract/text()").to_s)
  end

  # get the image link
  feed_image_link = first_value.call([
    "image/url/text()",
    "image/@rdf:resource",
    "link[@type='image/jpeg']/@href",
    "link[@type='image/gif']/@href",
    "link[@type='image/png']/@href",
    "logo[@style='image']/@href",
    "logo/@href"
  ])

  # get the feed time to live (expressed in hours)
  feed_time_to_live = nil
  update_frequency = XPath.first(metadata_node, "syn:updateFrequency/text()").to_s
  if update_frequency != ""
    hours_per_period = {
      "daily" => 24,
      "weekly" => 24 * 7,
      "monthly" => 24 * 30,
      "yearly" => 24 * 365
    }
    update_period = XPath.first(metadata_node, "syn:updatePeriod/text()").to_s
    # any other period is treated as hourly
    feed_time_to_live = update_frequency.to_i * (hours_per_period[update_period] || 1)
  else
    # <ttl> is expressed in minutes
    ttl_minutes = XPath.first(metadata_node, "ttl/text()").to_s
    feed_time_to_live = (ttl_minutes.to_i / 60) if ttl_minutes != ""
  end

  # TODO: handle time_to_live for CDF files
  # =======================================

  # get the feed items
  item_nodes = XPath.match(root_node, "item")
  if item_nodes.nil? || item_nodes.empty?
    item_nodes = XPath.match(metadata_node, "item")
  end
  if item_nodes.nil? || item_nodes.empty?
    item_nodes = XPath.match(metadata_node, "entry")
  end

  # set all of the properties
  self.title = blank_to_nil.call(feed_title)
  self.link = blank_to_nil.call(feed_link)
  self.description = blank_to_nil.call(feed_description)
  self.image_link = blank_to_nil.call(feed_image_link)
  self.time_to_live = feed_time_to_live

  parse_feed_hook(feed_data)
  save if Feed.cache_enabled?

  # check and make sure we don't have any cached feed_items with a nil link
  # if we do, we need to start from scratch to avoid duplicates
  if feed_items.any? { |item| item.link.nil? }
    FeedItem.delete_all("feed_id = '#{self.id}'")
  end

  # parse the feed items
  @feed_items_unsorted = []
  unless item_nodes.nil?
    item_nodes.each do |item_node|
      @feed_items_unsorted << handle_feed_item(item_node.to_s)
    end
  end
  self
end
586
+
587
# Locates the feed item in the database based on the supplied item xml data.
#
# The item is looked up by this feed's id and the item's link. A match is
# only accepted when its title equals the incoming title once bracketed
# counters such as "[3]", "(3)" or "{3}" are removed from both.
#
# item_data:: the xml of a single item as a string
#
# Returns the matching FeedItem, or nil.
def find_feed_item_by_data(item_data)
  item_node = Document.new(item_data).root

  # get the link: the first of these paths that yields a value wins
  link_paths = [
    "link[@rel='alternate']/@href",
    "link/@href",
    "link/text()",
    "@rdf:about",
    "guid/text()"
  ]
  item_link = ""
  link_paths.each do |path|
    item_link = XPath.first(item_node, path).to_s
    break unless item_link == ""
  end
  item_title = XPath.first(item_node, "title/text()").to_s

  feed_item = FeedItem.find_by_feed_id_and_link(self.id, item_link)
  return feed_item if feed_item.nil?

  # Some blogging tools alter the title of an item when the number of comments change (for
  # example, TextPattern) and many email feed dumps use the same link for multiple
  # items (for example, GMail). We try to take both of these cases into account here.
  without_counters = lambda do |text|
    [/\[\d*\]/, /\(\d*\)/, /\{\d*\}/].inject(text) do |current, counter|
      current.gsub(counter, "").strip
    end
  end
  if without_counters.call(feed_item.title) == without_counters.call(item_title)
    feed_item
  else
    nil
  end
end
625
+
626
# Turns the xml of one item into a FeedItem belonging to this feed.
# An already stored item is reused when find_feed_item_by_data finds one;
# otherwise a new FeedItem is built. Either way the item is attached to
# this feed and asked to parse the data.
#
# Returns the FeedItem.
def handle_feed_item(item_data)
  feed_item = find_feed_item_by_data(item_data) || FeedItem.new
  feed_item.feed = self
  feed_item.parse_item(item_data)
  feed_item
end
635
+
636
# Called by build_feed while the feed-level elements are being written,
# with the feed type, version and xml builder in use.
# Does nothing and returns nil.
def build_feed_hook(feed_type, version, xml_builder)
  nil
end
639
+
640
# Serializes this feed through the given xml builder.
#
# feed_type::   "rss" or "atom"
# version::     format version; 0.0 selects the default (1.0 for rss, 0.3 for atom).
#               rss versions 0.9, 1.0 and 1.1 produce the RDF-based layout,
#               any other rss version the plain <rss><channel> layout.
# xml_builder:: the builder that receives the markup
#
# The RDF root element declares the namespaces for the prefixes written here
# (rdf, dc, syn), and a feed without a link still serializes. Raises a
# RuntimeError for the RDF layout when an item has no link.
#
# Returns whatever the builder returns for the root element, or nil for an
# unknown feed type.
def build_feed(feed_type, version=0.0, xml_builder=Builder::XmlMarkup.new(:indent => 2))
  if feed_type == "rss" && version == 0.0
    version = 1.0
  elsif feed_type == "atom" && version == 0.0
    version = 0.3
  end

  present = lambda { |value| !(value.nil? || value == "") }

  if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1)
    # RDF-based rss format
    default_namespace = if version == 0.9
      "http://my.netscape.com/rdf/simple/0.9/"
    else
      "http://purl.org/rss/1.0/"
    end
    return xml_builder.tag!("rdf:RDF",
        "xmlns" => default_namespace,
        "xmlns:rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
        "xmlns:dc" => "http://purl.org/dc/elements/1.1/",
        "xmlns:syn" => "http://purl.org/rss/1.0/modules/syndication/") do
      # link.to_s keeps a feed without a link from raising here
      xml_builder.channel("rdf:about" => CGI.escapeHTML(link.to_s)) do
        if present.call(title)
          xml_builder.title(title)
        else
          xml_builder.title
        end
        if present.call(link)
          xml_builder.link(link)
        else
          xml_builder.link
        end
        if present.call(image_link)
          xml_builder.image("rdf:resource" => CGI.escapeHTML(image_link))
        end
        if present.call(description)
          xml_builder.description(description)
        else
          xml_builder.description
        end
        if present.call(language)
          xml_builder.tag!("dc:language", language)
        end
        xml_builder.tag!("syn:updatePeriod", "hourly")
        xml_builder.tag!("syn:updateFrequency", (time_to_live / 1.hour).to_s)
        xml_builder.tag!("syn:updateBase", Time.mktime(1970).iso8601)
        xml_builder.items do
          xml_builder.tag!("rdf:Seq") do
            unless feed_items.nil?
              feed_items.each do |item|
                if item.link.nil?
                  raise "Cannot generate an rdf-based feed with a nil item link field."
                end
                xml_builder.tag!("rdf:li", "rdf:resource" => CGI.escapeHTML(item.link))
              end
            end
          end
        end
        build_feed_hook(feed_type, version, xml_builder)
      end
      if present.call(image_link)
        xml_builder.image("rdf:about" => CGI.escapeHTML(image_link)) do
          if present.call(title)
            xml_builder.title(title)
          else
            xml_builder.title
          end
          xml_builder.url(image_link)
          if present.call(link)
            xml_builder.link(link)
          else
            xml_builder.link
          end
        end
      end
      unless feed_items.nil?
        feed_items.each do |item|
          item.build_feed_item(feed_type, version, xml_builder)
        end
      end
    end
  elsif feed_type == "rss"
    # normal rss format: everything lives inside the single <channel> element
    return xml_builder.rss("version" => version.to_s) do
      xml_builder.channel do
        xml_builder.title(title) if present.call(title)
        xml_builder.link(link) if present.call(link)
        xml_builder.description(description) if present.call(description)
        xml_builder.ttl((time_to_live / 1.minute).to_s)
        xml_builder.generator("http://www.sporkmonger.com/projects/feedtools")
        build_feed_hook(feed_type, version, xml_builder)
        unless feed_items.nil?
          feed_items.each do |item|
            item.build_feed_item(feed_type, version, xml_builder)
          end
        end
      end
    end
  elsif feed_type == "atom"
    # normal atom format
    return xml_builder.feed("xmlns" => "http://purl.org/atom/ns#",
        "version" => version.to_s,
        "xml:lang" => language) do
      if present.call(title)
        xml_builder.title(title,
          "mode" => "escaped",
          "type" => "text/html")
      end
      if present.call(link)
        xml_builder.link("href" => link,
          "rel" => "alternate",
          "type" => "text/html",
          "title" => title)
      end
      if present.call(description)
        xml_builder.tagline(description,
          "mode" => "escaped",
          "type" => "text/html")
      end
      xml_builder.generator("FeedTools",
        "url" => "http://www.sporkmonger.com/projects/feedtools")
      build_feed_hook(feed_type, version, xml_builder)
      unless feed_items.nil?
        feed_items.each do |item|
          item.build_feed_item(feed_type, version, xml_builder)
        end
      end
    end
  end
end
764
+
765
# Saves the current state of the feed to the database, but only when the
# feed has a url; a feed with a nil or empty url is left unsaved and nil
# is returned.
def save
  super unless url.nil? || url == ""
end
771
+ end
772
+
773
+ class FeedItem < ActiveRecord::Base
774
+ include REXML
775
+
776
# Checks that the feed_items table has every required column by selecting
# them all; columns added by the user do not matter.
#
# Returns true when the probe query runs, false when it raises
# ActiveRecord::StatementInvalid.
def self.table_exists?
  probe_sql = "select id, feed_id, link, title, author, description, " +
    "time, tags from feed_items limit 1"
  connection.execute probe_sql
  true
rescue ActiveRecord::StatementInvalid
  false
end
786
+
787
# The feed this item belongs to.
#
# Returns the feed assigned through feed= when there is one. Otherwise the
# feed is looked up by the item's feed_id attribute and remembered for
# later calls. Returns nil when the item has no feed_id.
def feed
  return @feed unless @feed.nil?
  # feed_id is a record attribute, so it is read through its accessor
  # rather than from an instance variable.
  return nil if feed_id.nil?
  @feed = Feed.find_by_id(feed_id)
end
797
+
798
# Attaches this item to the given feed: its id is copied into feed_id and
# the feed object itself is kept for the feed reader.
def feed=(new_feed)
  owner_id = new_feed.id
  self.feed_id = owner_id
  @feed = new_feed
end
802
+
803
# The stored title, or "Untitled Entry" when none is stored.
def title
  self['title'] || "Untitled Entry"
end
806
+
807
# The tags as an array, or nil when no tags are stored.
# The stored form is "|a|b|"; the outer pipes are dropped before splitting.
def tag_list
  return nil if tags.nil?
  tags[1..-2].split("|")
end
810
+
811
# Stores the given tags in the pipe-delimited form "|a|b|", stripping
# surrounding whitespace from each one.
def tag_list=(new_tag_list)
  stripped = new_tag_list.map { |tag| tag.strip }
  self.tags = "|" + stripped.join("|") + "|"
end
814
+
815
# The tags as a single comma-separated string ("a, b"), or nil when no
# tags are stored, matching what tag_list returns in that case.
def tag_string
  return nil if tags.nil?
  tags[1..-2].split("|").join(", ")
end
818
+
819
# Parses a comma-separated tag string and stores it in the pipe-delimited
# form "|a|b|", stripping whitespace around each tag.
def tag_string=(new_tag_string)
  pieces = new_tag_string.split(",").map { |tag| tag.strip }
  self.tags = "|" + pieces.join("|") + "|"
end
822
+
823
# Hook that receives the raw item data. Does nothing and returns nil.
def parse_feed_item_hook(item_data)
  nil
end
826
+
827
+ def parse_item(item_data)
828
+ item_node = Document.new(item_data).root
829
+
830
+ # get the feed base, in case the feed items use relative paths
831
+ base = feed.link
832
+
833
+ # get the link
834
+ link = XPath.first(item_node, "link[@rel='alternate']/@href").to_s
835
+ if link == ""
836
+ link = XPath.first(item_node, "link/@href").to_s
837
+ end
838
+ if link == ""
839
+ link = XPath.first(item_node, "link/text()").to_s
840
+ end
841
+ if link == ""
842
+ link = XPath.first(item_node, "@rdf:about").to_s
843
+ end
844
+ if link == ""
845
+ link = XPath.first(item_node, "guid/text()").to_s
846
+ end
847
+ if link != ""
848
+ link = CGI.unescapeHTML(link)
849
+ end
850
+ if link != "" && (link =~ /http:\/\//) != 0 && (link =~ /https:\/\//) != 0
851
+ # ensure that we don't end up with 'http://www.foobar.com//path/to/entry'
852
+ # future-proofed this so that it doesn't break when Ruby 1.9/2.0 starts
853
+ # returning single character Strings instead of FixNums
854
+ if (base[-1] == 47 && link[0] == 47) || (base[-1] == "/" && link[0] == "/")
855
+ link = link[1..-1]
856
+ end
857
+ # prepend the base to the link since they seem to have used a relative path
858
+ link = base + link
859
+ end
860
+
861
+ title = XPath.first(item_node, "title/text()").to_s
862
+ if title != ""
863
+ # some blogging tools (notably TextPattern I believe) include the number of
864
+ # comments in a post in the title... this is ugly, so we're gonna strip them out
865
+ title = title.gsub(/\[\d*\]/,"").strip
866
+ end
867
+
868
+ # get the item author
869
+ author = CGI.unescapeHTML(XPath.first(item_node, "author/name/text()").to_s)
870
+ if author == ""
871
+ author = CGI.unescapeHTML(XPath.first(item_node, "dc:creator/text()").to_s)
872
+ end
873
+ if author == ""
874
+ author = CGI.unescapeHTML(XPath.first(item_node, "author/text()").to_s)
875
+ end
876
+
877
+ # get the item content
878
+ description = ""
879
+ body = XPath.first(item_node, "xhtml:body")
880
+ if body == nil
881
+ body = XPath.first(item_node, "body")
882
+ end
883
+ if body != nil
884
+ description = body.inner_xml
885
+ end
886
+ if description == ""
887
+ description = CGI.unescapeHTML(XPath.first(item_node, "content:encoded/text()").to_s)
888
+ end
889
+ if description == ""
890
+ description = XPath.first(item_node, "description/text()").to_s
891
+ if description != ""
892
+ if XPath.first(item_node, "description/@encoding").to_s != ""
893
+ description = "[Embedded data objects are not supported.]"
894
+ else
895
+ description = CGI.unescapeHTML(description)
896
+ end
897
+ end
898
+ end
899
+ if description == ""
900
+ description = XPath.first(item_node,"content/text()").to_s
901
+ if description != "" && XPath.first(item_node, "content/@mode").to_s == "escaped"
902
+ description = CGI.unescapeHTML(description)
903
+ end
904
+ end
905
+
906
+ # get the item time
907
+ time = XPath.first(item_node, "pubDate/text()").to_s
908
+ if time == ""
909
+ time = XPath.first(item_node, "dc:date/text()").to_s
910
+ end
911
+ if time == ""
912
+ time = XPath.first(item_node, "issued/text()").to_s
913
+ end
914
+
915
+ # get the item tags
916
+ tags_array = []
917
+ if tags_array == nil || tags_array.size == 0
918
+ tags_array = []
919
+ tag_list = XPath.match(item_node, "dc:subject/rdf:Bag/rdf:li/text()")
920
+ if tag_list.size > 1
921
+ for tag in tag_list
922
+ tags_array << tag.to_s.downcase.strip
923
+ end
924
+ end
925
+ end
926
+ if tags_array == nil || tags_array.size == 0
927
+ tags_array = []
928
+ tag_list = XPath.match(item_node, "category/text()")
929
+ if tag_list.size > 1
930
+ for tag in tag_list
931
+ tags_array << tag.to_s.downcase.strip
932
+ end
933
+ end
934
+ end
935
+ if tags_array == nil || tags_array.size == 0
936
+ tags_array = []
937
+ tag_list = XPath.match(item_node, "dc:subject/text()")
938
+ if tag_list.size > 1
939
+ for tag in tag_list
940
+ tags_array << tag.to_s.downcase.strip
941
+ end
942
+ end
943
+ end
944
+ if tags_array == nil || tags_array.size == 0
945
+ tags_array = XPath.first(item_node,
946
+ "category/text()").to_s.downcase.split(" ")
947
+ end
948
+ if tags_array == nil || tags_array.size == 0
949
+ begin
950
+ tags_array = XPath.first(item_node,
951
+ "dc:subject/text()").to_s.downcase.split(" ")
952
+ rescue
953
+ tags_array = []
954
+ end
955
+ end
956
+ if tags_array == nil || tags_array.size == 0
957
+ tags_array = []
958
+ rdf_bag = XPath.match(item_node,
959
+ "taxo:topics/rdf:Bag/rdf:li")
960
+ if rdf_bag != nil && rdf_bag.size > 0
961
+ for tag_node in rdf_bag
962
+ begin
963
+ tag_url = XPath.first(tag_node, "@resource").to_s
964
+ tag_match = tag_url.scan(/\/(tag|tags)\/(\w+)/)
965
+ if tag_match.size > 0
966
+ tags_array << tag_match.first.last.downcase.strip
967
+ end
968
+ rescue
969
+ end
970
+ end
971
+ end
972
+ end
973
+
974
+ # set all of the properties
975
+ if link != ""
976
+ self.link = link
977
+ else
978
+ self.link = nil
979
+ end
980
+ if title != ""
981
+ self.title = title
982
+ end
983
+ if description != ""
984
+ self.description = description.strip
985
+ end
986
+ if time != ""
987
+ self.time = Time.parse(time) rescue Time.now
988
+ elsif @time == nil
989
+ self.time = Time.now
990
+ end
991
+ if tags_array.size > 0
992
+ self.tag_list = tags_array
993
+ end
994
+ parse_feed_item_hook(item_data)
995
+ if Feed.cache_enabled?
996
+ save
997
+ end
998
+ return self
999
+ end
1000
+
1001
# No-op extension point that receives the same arguments as build_feed_item.
# Presumably meant to be overridden to emit extra elements for an item;
# NOTE(review): no caller is visible in this part of the file -- confirm
# where (or whether) it is invoked before relying on it.
#
# Always returns nil.
def build_feed_item_hook(feed_type, version, xml_builder)
  nil
end
1003
+
1004
# Writes this item into +xml_builder+ in the requested output format.
#
# feed_type   - "rss" or "atom"; any other value emits nothing.
# version     - numeric format version; for "rss", 0.9, 1.0 and 1.1 select
#               the RDF-based layout, everything else the plain layout.
# xml_builder - builder that receives the elements (a fresh
#               Builder::XmlMarkup with two-space indent by default).
#
# Returns whatever the outermost builder call returns, or nil when
# +feed_type+ is not recognised.
# Raises RuntimeError when an RDF-based item is requested and link is nil.
def build_feed_item(feed_type, version, xml_builder=Builder::XmlMarkup.new(:indent => 2))
  # a field counts as filled when it is neither nil nor the empty string
  filled = lambda { |value| !(value.nil? || value == "") }

  if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1)
    # RDF-based rss format
    raise "Cannot generate an rdf-based feed item with a nil link field." if link.nil?

    xml_builder.item("rdf:about" => CGI.escapeHTML(link)) do
      # title, link and description are always written here, as empty
      # elements when the field has no value
      if filled.call(title)
        xml_builder.title(title)
      else
        xml_builder.title
      end
      if filled.call(link)
        xml_builder.link(link)
      else
        xml_builder.link
      end
      if filled.call(description)
        xml_builder.description(description)
      else
        xml_builder.description
      end
      xml_builder.tag!("dc:date", time.iso8601) unless time.nil?
      unless tags.nil?
        xml_builder.tag!("dc:subject") do
          xml_builder.tag!("rdf:Bag") do
            tag_list.each { |tag| xml_builder.tag!("rdf:li", tag) }
          end
        end
      end
    end
  elsif feed_type == "rss"
    # normal rss format: fields without a value are left out entirely
    xml_builder.item do
      xml_builder.title(title) if filled.call(title)
      xml_builder.link(link) if filled.call(link)
      xml_builder.description(description) if filled.call(description)
      xml_builder.pubDate(time.rfc822) unless time.nil?
      unless tags.nil?
        xml_builder.tag!("dc:subject") do
          xml_builder.tag!("rdf:Bag") do
            tag_list.each { |tag| xml_builder.tag!("rdf:li", tag) }
          end
        end
      end
    end
  elsif feed_type == "atom"
    # normal atom format
    xml_builder.entry("xmlns" => "http://purl.org/atom/ns#") do
      if filled.call(title)
        xml_builder.title(title,
          "mode" => "escaped",
          "type" => "text/html")
      end
      if filled.call(link)
        xml_builder.link("href" => link,
          "rel" => "alternate",
          "type" => "text/html",
          "title" => title)
      end
      if filled.call(description)
        xml_builder.content(description,
          "mode" => "escaped",
          "type" => "text/html")
      end
      xml_builder.issued(time.iso8601) unless time.nil?
      unless tags.nil?
        tag_list.each { |tag| xml_builder.category(tag) }
      end
    end
  end
end
1094
+
1095
+ # Saves the current state of the feed item to the database unless the feed lacks
1096
+ # a remote location
1097
def save
  # Items without a feed, or whose feed has no url, are never handed to
  # the superclass save; nil is returned instead.
  return nil if feed.nil?
  return nil if feed.url.nil? || feed.url == ""

  super
end
1102
+ end
1103
+ end
1104
+
1105
module REXML
  class Element
    # Small extension to REXML to simplify parsing of xhtml feed items.
    #
    # Returns the concatenated to_s of every child node of this element,
    # in document order; an element without children yields "".
    def inner_xml
      inject("") { |xml, node| xml << node.to_s }
    end
  end
end
1117
+
1118
# Load-time bootstrap: prepare the connection and create the cache if it
# does not exist yet. Any StandardError raised along the way is swallowed.
begin
  FeedTools::Feed.prepare_connection
  FeedTools::Feed.create_cache unless FeedTools::Feed.cache_exists?
rescue
  # Nothing can be done until someone sets up the database connection.
  # We'll just assume for now that the user will take care of that.
end