feedtools 0.1.0 → 0.2.0

Files changed (37)
  1. data/CHANGELOG +11 -0
  2. data/lib/feed_tools.rb +2496 -810
  3. data/lib/feed_tools/vendor/builder.rb +2 -0
  4. data/lib/feed_tools/vendor/builder/blankslate.rb +2 -0
  5. data/lib/feed_tools/vendor/builder/xmlbase.rb +2 -1
  6. data/lib/feed_tools/vendor/builder/xmlevents.rb +2 -0
  7. data/lib/feed_tools/vendor/builder/xmlmarkup.rb +4 -2
  8. data/lib/feed_tools/vendor/htree.rb +97 -0
  9. data/lib/feed_tools/vendor/htree/container.rb +10 -0
  10. data/lib/feed_tools/vendor/htree/context.rb +67 -0
  11. data/lib/feed_tools/vendor/htree/display.rb +27 -0
  12. data/lib/feed_tools/vendor/htree/doc.rb +149 -0
  13. data/lib/feed_tools/vendor/htree/elem.rb +262 -0
  14. data/lib/feed_tools/vendor/htree/encoder.rb +163 -0
  15. data/lib/feed_tools/vendor/htree/equality.rb +218 -0
  16. data/lib/feed_tools/vendor/htree/extract_text.rb +37 -0
  17. data/lib/feed_tools/vendor/htree/fstr.rb +33 -0
  18. data/lib/feed_tools/vendor/htree/gencode.rb +97 -0
  19. data/lib/feed_tools/vendor/htree/htmlinfo.rb +672 -0
  20. data/lib/feed_tools/vendor/htree/inspect.rb +108 -0
  21. data/lib/feed_tools/vendor/htree/leaf.rb +94 -0
  22. data/lib/feed_tools/vendor/htree/loc.rb +367 -0
  23. data/lib/feed_tools/vendor/htree/modules.rb +48 -0
  24. data/lib/feed_tools/vendor/htree/name.rb +124 -0
  25. data/lib/feed_tools/vendor/htree/output.rb +207 -0
  26. data/lib/feed_tools/vendor/htree/parse.rb +407 -0
  27. data/lib/feed_tools/vendor/htree/raw_string.rb +124 -0
  28. data/lib/feed_tools/vendor/htree/regexp-util.rb +15 -0
  29. data/lib/feed_tools/vendor/htree/rexml.rb +130 -0
  30. data/lib/feed_tools/vendor/htree/scan.rb +166 -0
  31. data/lib/feed_tools/vendor/htree/tag.rb +111 -0
  32. data/lib/feed_tools/vendor/htree/template.rb +909 -0
  33. data/lib/feed_tools/vendor/htree/text.rb +115 -0
  34. data/lib/feed_tools/vendor/htree/traverse.rb +465 -0
  35. data/rakefile +1 -1
  36. data/test/rss_test.rb +97 -0
  37. metadata +30 -1
data/CHANGELOG CHANGED
@@ -1,3 +1,14 @@
+ == FeedTools 0.2.0
+ * more complete support for rss, atom, cdf
+ * modular caching mechanism
+ * lazy parsing
+ * HTML sanitization of possibly dangerous fields
+ * HTML tidy support
+ * support for podcasts and vidlogs
+ * corrected handling of http redirection
+ * made http header information available
+ * file: protocol support
+ * custom parsing can be done using the find_node and find_all_nodes methods
  == FeedTools 0.1.0
  * basic support for rss, atom, cdf
  * basic caching using active record
data/lib/feed_tools.rb CHANGED
@@ -21,987 +21,2669 @@
21
21
  # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
22
  #++
23
23
 
24
- FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] || ENV['RAILS_ENV'] || 'production'
24
+ FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] ||
25
+ ENV['RAILS_ENV'] ||
26
+ 'production' # :nodoc:
27
+
28
+ FEED_TOOLS_VERSION = "0.2.0"
25
29
 
26
30
  $:.unshift(File.dirname(__FILE__))
27
31
  $:.unshift(File.dirname(__FILE__) + "/../../activerecord/lib")
32
+ $:.unshift(File.dirname(__FILE__) + "/feed_tools/vendor")
33
+
34
+ require 'rubygems'
35
+ require 'active_record'
28
36
 
29
37
  begin
30
- require 'active_record'
38
+ require 'builder'
31
39
  rescue LoadError
32
- require 'rubygems'
33
- require_gem 'activerecord'
40
+ # RubyGems version is not available, use included Builder
41
+ require 'feed_tools/vendor/builder'
34
42
  end
35
43
 
36
44
  begin
37
- require 'rubygems'
38
- require 'builder'
45
+ require 'tidy'
39
46
  rescue LoadError
40
- # RubyGems is not available, use included Builder
41
- $:.unshift(File.dirname(__FILE__) + "/feed_tools/vendor")
42
- require 'feed_tools/vendor/builder'
47
+ # Ignore the error for now.
43
48
  end
44
49
 
45
- require 'open-uri'
46
- require 'time'
50
+ require 'feed_tools/vendor/htree'
51
+
52
+ require 'net/http'
53
+ require 'net/https'
54
+ require 'net/ftp'
55
+
47
56
  require 'rexml/document'
48
- require 'yaml'
57
+
58
+ require 'iconv'
59
+ require 'uri'
60
+ require 'time'
49
61
  require 'cgi'
62
+ require 'pp'
63
+ require 'yaml'
50
64
 
65
+ #= feed_tools.rb
66
+ #
67
+ # FeedTools was designed to be a simple XML feed parser, generator, and translator with a built-in
68
+ # caching system.
69
+ #
70
+ #== Example
71
+ # slashdot_feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
72
+ # slashdot_feed.title
73
+ # => "Slashdot"
74
+ # slashdot_feed.description
75
+ # => "News for nerds, stuff that matters"
76
+ # slashdot_feed.link
77
+ # => "http://slashdot.org/"
78
+ # slashdot_feed.items.first.find_node("slash:hitparade/text()").to_s
79
+ # => "43,37,28,23,11,3,1"
51
80
  module FeedTools
52
- class Feed < ActiveRecord::Base
53
- include REXML
54
81
 
55
- has_many :feed_items_unsorted, :class_name => "FeedItem"
56
-
57
- def initialize
58
- @live = false
59
- @feed_items_unsorted = nil
60
- super
61
- end
62
-
63
- # Loads the feed specified by the url, pulling the data from the cache if it hasn't expired
64
- # Be aware that this method translates from the feed: and rss: pseudo-protocols to the
65
- # http: protocol as needed. This means that if you pass in a feed url that looks like
66
- # 'feed://www.anywhere.com/feed.xml' it will end up being stored in the cache as
67
- # 'http://www.anywhere.com/feed.xml' instead. This does affect the usage of methods like
68
- # find_by_url, but otherwise should be fairly transparent.
69
- def Feed.open(url)
70
- # deal with all of the ugly possibilities involved in the rss: and feed: pseudo-protocols
71
- if (url =~ /feed:/) == 0
72
- url = url.gsub(/feed:\/\/http:\/\/\//, "http://")
73
- url = url.gsub(/feed:\/\/http:\/\//, "http://")
74
- url = url.gsub(/feed:http:\/\/\//, "http://")
75
- url = url.gsub(/feed:http:\/\//, "http://")
76
- url = url.gsub(/feed:\/\/\//, "http://")
77
- url = url.gsub(/feed:\/\//, "http://")
78
- url = url.gsub(/feed:\//, "http://")
79
- url = url.gsub(/feed:/, "http://")
80
- end
81
- if (url =~ /rss:/) == 0
82
- url = url.gsub(/rss:\/\/http:\/\/\//, "http://")
83
- url = url.gsub(/rss:\/\/http:\/\//, "http://")
84
- url = url.gsub(/rss:http:\/\/\//, "http://")
85
- url = url.gsub(/rss:http:\/\//, "http://")
86
- url = url.gsub(/rss:\/\/\//, "http://")
87
- url = url.gsub(/rss:\/\//, "http://")
88
- url = url.gsub(/rss:\//, "http://")
89
- url = url.gsub(/rss:/, "http://")
90
- end
91
-
92
- feed = nil
82
+ # The default caching mechanism for the FeedTools module
83
+ class DatabaseFeedCache < ActiveRecord::Base
84
+ # Overrides the default table name to use the "feeds" table.
85
+ def self.table_name() "feeds" end
86
+
87
+ # If ActiveRecord is not already connected, attempts to find a configuration file and use
88
+ # it to open a connection for ActiveRecord.
89
+ # This method is probably unnecessary for anything but testing and debugging purposes.
90
+ # In a Rails environment, the connection will already have been established
91
+ # and this method will simply do nothing.
92
+ #
93
+ # This method should not raise any exceptions because it's designed to be run only when
94
+ # the module is first loaded. If it fails, the user should get an exception when they
95
+ # try to perform some action that makes use of the caching functionality, and not before.
96
+ def DatabaseFeedCache.initialize_cache
97
+ # Establish a connection if we don't already have one
93
98
  begin
94
- feed = Feed.find_by_url(url)
95
- rescue ActiveRecord::StatementInvalid
96
- # make sure that the necessary tables are present and recover if possible
97
- FeedTools::Feed.prepare_connection
98
- unless FeedTools::Feed.cache_exists?
99
- FeedTools::Feed.create_cache
99
+ ActiveRecord::Base.connection
100
+ rescue
101
+ begin
102
+ possible_config_files = [
103
+ "./config/database.yml",
104
+ "../database.yml",
105
+ "./database.yml"
106
+ ]
107
+ database_config_file = nil
108
+ for file in possible_config_files
109
+ if File.exists? file
110
+ database_config_file = file
111
+ break
112
+ end
113
+ end
114
+ database_config_hash = File.open(database_config_file) do |file|
115
+ config_hash = YAML::load(file)
116
+ unless config_hash[FEED_TOOLS_ENV].nil?
117
+ config_hash = config_hash[FEED_TOOLS_ENV]
118
+ end
119
+ config_hash
120
+ end
121
+ ActiveRecord::Base.configurations = database_config_hash
122
+ ActiveRecord::Base.establish_connection(database_config_hash)
123
+ ActiveRecord::Base.connection
124
+ rescue
100
125
  end
101
- feed = Feed.find_by_url(url)
102
126
  end
103
- unless feed.nil?
104
- feed.update_if_needed
105
- else
106
- feed = Feed.new
107
- feed.url = url
108
- feed.load_remote_feed
127
+ # Verify that the necessary database tables are in place
128
+ # and if they're missing, create them
129
+ unless DatabaseFeedCache.table_exists?
130
+ DatabaseFeedCache.create_table
109
131
  end
110
- return feed
111
- end
112
-
113
- # Checks if the feed has expired and updates if it has
114
- def update_if_needed
115
- if expired?
116
- load_remote_feed
117
- end
118
- end
119
-
120
- # Verifies that the table structure exists
121
- def Feed.cache_exists?
122
- return Feed.table_exists? && FeedItem.table_exists?
132
+ return nil
123
133
  end
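
When running outside Rails, an alternative to dropping a database.yml into one of the paths above is to hand ActiveRecord a connection yourself before touching the cache. This is only a sketch; the adapter and credentials are placeholders, not values shipped with the gem:

  require 'rubygems'
  require 'active_record'
  require 'feed_tools'

  ActiveRecord::Base.establish_connection(
    :adapter  => "mysql",
    :host     => "localhost",
    :username => "feedtools",
    :password => "secret",
    :database => "feedtools_cache"
  )
  # With a live connection already in place, this only has to verify
  # (or create) the feeds table.
  FeedTools::DatabaseFeedCache.initialize_cache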
124
-
125
- # Verifies that the required fields exist; additional ones added by the user are fine
126
- def Feed.table_exists?
134
+
135
+ # True if the appropriate database table already exists
136
+ def DatabaseFeedCache.table_exists?
127
137
  begin
128
- connection.execute "select id, url, link, image_link, title, description, " +
129
- "tags, last_updated, etag, time_to_live from feeds limit 1"
138
+ ActiveRecord::Base.connection.execute "select id, url, title, " +
139
+ "link, xml_data, http_headers, last_retrieved " +
140
+ "from feeds limit 1"
130
141
  rescue ActiveRecord::StatementInvalid
131
142
  return false
143
+ rescue
144
+ return false
132
145
  end
133
146
  return true
134
147
  end
135
-
136
- # Generates the table structure if necessary
137
- def Feed.create_cache
138
- unless Feed.cache_exists?
139
- feed_items_mysql = <<-SQL_END
140
- CREATE TABLE `feed_items` (
141
- `id` int(6) unsigned NOT NULL auto_increment,
142
- `feed_id` int(6) unsigned NOT NULL default '0',
143
- `link` varchar(255) default NULL,
144
- `title` varchar(255) default NULL,
145
- `author` varchar(255) default NULL,
146
- `description` text default NULL,
147
- `time` datetime NOT NULL default '0000-00-00 00:00:00',
148
- `tags` varchar(255) default NULL,
149
- PRIMARY KEY (`id`)
150
- ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
151
- SQL_END
152
- feed_items_sqlite = <<-SQL_END
153
- CREATE TABLE 'feed_items' (
154
- 'id' INTEGER PRIMARY KEY NOT NULL,
155
- 'feed_id' INTEGER NOT NULL,
156
- 'link' VARCHAR(255) DEFAULT NULL,
157
- 'title' VARCHAR(255) DEFAULT NULL,
158
- 'author' VARCHAR(255) DEFAULT NULL,
159
- 'description' TEXT DEFAULT NULL,
160
- 'time' DATETIME DEFAULT NULL,
161
- 'tags' VARCHAR(255) DEFAULT NULL
162
- );
163
- SQL_END
164
- feed_items_psql = <<-SQL_END
165
- CREATE TABLE feed_items (
166
- id SERIAL PRIMARY KEY NOT NULL,
167
- feed_id int REFERENCES feeds,
168
- link varchar(255) default NULL,
169
- title varchar(255) default NULL,
170
- author varchar(255) default NULL,
171
- description text default NULL,
172
- time datetime default NULL,
173
- tags varchar(255) default NULL
174
- );
175
- SQL_END
176
- unless FeedItem.table_exists?
177
- table_creation_sql = nil
178
- if configurations["adapter"] == "mysql"
179
- table_creation_sql = feed_items_mysql
180
- elsif configurations["adapter"] == "sqlite"
181
- table_creation_sql = feed_items_sqlite
182
- elsif configurations["adapter"] == "postgresql"
183
- table_creation_sql = feeds_psql
184
- end
185
- if table_creation_sql.nil?
186
- raise "Could not build feed_items table."
187
- else
188
- connection.execute table_creation_sql
189
- end
190
- end
148
+
149
+ # Creates the appropriate database table
150
+ def DatabaseFeedCache.create_table
151
+ unless DatabaseFeedCache.table_exists?
191
152
  feeds_mysql = <<-SQL_END
192
153
  CREATE TABLE `feeds` (
193
- `id` int(6) unsigned NOT NULL auto_increment,
194
- `url` varchar(255) NOT NULL default '',
195
- `link` varchar(255) NOT NULL default '',
196
- `image_link` varchar(255) default NULL,
197
- `title` varchar(255) default NULL,
198
- `description` text default NULL,
199
- `tags` varchar(255) default NULL,
200
- `last_updated` datetime default NULL,
201
- `etag` varchar(255) default NULL,
202
- `time_to_live` int(4) default NULL,
154
+ `id` int(10) unsigned NOT NULL auto_increment,
155
+ `url` varchar(255) default NULL,
156
+ `title` varchar(255) default NULL,
157
+ `link` varchar(255) default NULL,
158
+ `xml_data` longtext default NULL,
159
+ `http_headers` text default NULL,
160
+ `last_retrieved` datetime default NULL,
203
161
  PRIMARY KEY (`id`)
204
162
  ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
205
163
  SQL_END
206
164
  feeds_sqlite = <<-SQL_END
207
165
  CREATE TABLE 'feeds' (
208
- 'id' INTEGER PRIMARY KEY NOT NULL,
209
- 'url' VARCHAR(255) DEFAULT NULL,
210
- 'link' VARCHAR(255) DEFAULT NULL,
211
- 'image_link' VARCHAR(255) DEFAULT NULL,
212
- 'title' VARCHAR(255) DEFAULT NULL,
213
- 'description' TEXT DEFAULT NULL,
214
- 'tags' VARCHAR(255) DEFAULT NULL,
215
- 'last_updated' DATETIME DEFAULT NULL,
216
- 'etag' VARCHAR(255) DEFAULT NULL,
217
- 'time_to_live' INTEGER DEFAULT NULL
166
+ 'id' INTEGER PRIMARY KEY NOT NULL,
167
+ 'url' VARCHAR(255) DEFAULT NULL,
168
+ 'title' VARCHAR(255) DEFAULT NULL,
169
+ 'link' VARCHAR(255) DEFAULT NULL,
170
+ 'image_link' VARCHAR(255) DEFAULT NULL,
171
+ 'xml_data' TEXT DEFAULT NULL,
172
+ 'http_headers' TEXT DEFAULT NULL,
173
+ 'last_retrieved' DATETIME DEFAULT NULL,
218
174
  );
219
175
  SQL_END
220
176
  feeds_psql = <<-SQL_END
221
177
  CREATE TABLE feeds (
222
- id SERIAL PRIMARY KEY NOT NULL,
223
- url varchar(255) default NULL,
224
- link varchar(255) default NULL,
225
- image_link varchar(255) default NULL,
226
- title varchar(255) default NULL,
227
- description text default NULL,
228
- tags varchar(255) default NULL,
229
- last_updated datetime default NULL,
230
- etag varchar(255) default NULL,
231
- time_to_live int default NULL
178
+ id SERIAL PRIMARY KEY NOT NULL,
179
+ url varchar(255) default NULL,
180
+ title varchar(255) default NULL,
181
+ link varchar(255) default NULL,
182
+ xml_data text default NULL,
183
+ http_headers text default NULL,
184
+ last_retrieved datetime default NULL,
232
185
  );
233
186
  SQL_END
234
- unless Feed.table_exists?
235
- table_creation_sql = nil
236
- if configurations["adapter"] == "mysql"
237
- table_creation_sql = feeds_mysql
238
- elsif configurations["adapter"] == "sqlite"
239
- table_creation_sql = feeds_sqlite
240
- elsif configurations["adapter"] == "postgresql"
241
- table_creation_sql = feeds_psql
242
- end
243
- if table_creation_sql.nil?
244
- raise "Could not build feed_items table."
245
- else
246
- connection.execute table_creation_sql
247
- end
187
+ table_creation_sql = nil
188
+ if configurations["adapter"] == "mysql"
189
+ table_creation_sql = feeds_mysql
190
+ elsif configurations["adapter"] == "sqlite"
191
+ table_creation_sql = feeds_sqlite
192
+ elsif configurations["adapter"] == "postgresql"
193
+ table_creation_sql = feeds_psql
194
+ end
195
+ if table_creation_sql.nil?
196
+ raise "Could not build feed_items table."
197
+ else
198
+ connection.execute table_creation_sql
248
199
  end
249
200
  end
250
201
  end
251
-
252
- # Removes all feed entries from the cache
253
- # This could obviously be a very dangerous operation if you use the cache for more than simply
254
- # caching the feeds.
255
- def Feed.clear_cache
256
- FeedItem.delete_all
257
- Feed.delete_all
202
+ end
203
+
204
+ # Error raised when a feed cannot be retrieved
205
+ class FeedAccessError < StandardError
206
+ end
207
+
208
+ # Quick method of enabling small classes to have their attributes
209
+ # accessible as a dictionary.
210
+ module AttributeDictionary # :nodoc:
211
+ # Access the attributes as a dictionary
212
+ def [](key)
213
+ # Assignment, and destructive methods should not be
214
+ # accessed like this.
215
+ return nil if key[-1..-1] == "=" || key[-1..-1] == "!"
216
+ return nil unless self.method(key).arity == 0
217
+ return self.send(key)
258
218
  end
259
-
260
- # Removes all feed items from the cache and resets the last updated time for all feeds
261
- # This is probably much safer than the clear_cache method
262
- def Feed.expire_cache
263
- FeedItem.delete_all
264
- Feed.update_all("last_updated = NULL")
219
+
220
+ # Access the attributes as a dictionary
221
+ def []=(key, value)
222
+ # Assignment, and destructive methods should not be
223
+ # accessed like this.
224
+ return nil if key[-1..-1] == "=" || key[-1..-1] == "!"
225
+ return nil unless self.method(key + "=").arity == 1
226
+ return self.send(key + "=", value)
265
227
  end
266
-
267
- # Removes all feed items older than the specified number of seconds
268
- def Feed.purge_cache(purge_time=1.week)
269
- purge_date = (Time.now - purge_time).strftime("%Y-%m-%d %H:%M:%S")
270
- FeedItem.delete_all("time < '#{purge_date}'")
228
+ end
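
Roughly what the mixin buys you on a feed object. A sketch only: it assumes Feed.new takes no arguments and the values are invented.

  feed = FeedTools::Feed.new
  feed.title = "Example Feed"
  feed["title"]         # same as feed.title    => "Example Feed"
  feed["title="]        # blocked: returns nil instead of assigning
  feed["url"] = "http://example.com/feed.xml"   # same as feed.url=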
229
+
230
+ @feed_cache = DatabaseFeedCache
231
+ @user_agent = "FeedTools/#{FEED_TOOLS_VERSION} " +
232
+ "+http://www.sporkmonger.com/projects/feedtools/"
233
+
234
+ # Returns the current caching mechanism.
235
+ def FeedTools.feed_cache
236
+ return @feed_cache
237
+ end
238
+
239
+ # Sets the current caching mechanism. If set to nil, disables caching.
240
+ # Default is the DatabaseFeedCache class.
241
+ #
242
+ # Objects of this class must accept the following messages:
243
+ # url
244
+ # url=
245
+ # title
246
+ # title=
247
+ # link
248
+ # link=
249
+ # xml_data
250
+ # xml_data=
251
+ # etag
252
+ # etag=
253
+ # last_modified
254
+ # last_modified=
255
+ # save
256
+ #
257
+ # Additionally, the class itself must accept the following messages:
258
+ # find_by_id
259
+ # find_by_url
260
+ # initialize_cache
261
+ def FeedTools.feed_cache=(new_feed_cache)
262
+ # TODO: ensure that the feed cache class actually does those things.
263
+ # ==================================================================
264
+ @feed_cache = new_feed_cache
265
+ end
266
+
267
+ # Returns the currently used user agent string.
268
+ def FeedTools.user_agent
269
+ return @user_agent
270
+ end
271
+
272
+ # Sets the user agent string to send in the http headers.
273
+ def FeedTools.user_agent=(new_user_agent)
274
+ @user_agent = new_user_agent
275
+ end
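
For example, an aggregator could identify itself instead of using the default string (the name below is made up):

  FeedTools.user_agent = "MyAggregator/1.0 +http://aggregator.example.com/"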
276
+
277
+ # Returns true if the html tidy module can be used.
278
+ #
279
+ # Obviously, you need the tidy gem installed in order to run with html
280
+ # tidy features turned on.
281
+ #
282
+ # This method does a fairly complicated, and probably unnecessarily
283
+ # desperate search for the libtidy library. If you want this thing to
284
+ # execute fast, the best thing to do is to set Tidy.path ahead of time.
285
+ # If Tidy.path is set, this method doesn't do much. If it's not set,
286
+ # it will do its darnedest to find the libtidy library. If you set
287
+ # the LIBTIDYPATH environment variable to the libtidy library, it should
288
+ # be able to find it.
289
+ #
290
+ # Once the library is located, this method will run much faster.
291
+ def FeedTools.tidy_enabled?
292
+ # This is an override variable to keep tidy from being used even if it
293
+ # is available.
294
+ if @force_tidy_enabled == false
295
+ return false
271
296
  end
272
-
273
- # If ActiveRecord is not already connected, attempts to find a configuration file and use
274
- # it to open a connection for ActiveRecord.
275
- # This method is probably unnecessary for anything but testing and debugging purposes.
276
- def Feed.prepare_connection
297
+ if @tidy_enabled.nil? || @tidy_enabled == false
298
+ @tidy_enabled = false
277
299
  begin
278
- ActiveRecord::Base.connection
279
- rescue
280
- possible_config_files = [
281
- "./config/database.yml",
282
- "./database.yml"
283
- ]
284
- database_config_file = nil
285
- for file in possible_config_files
286
- if File.exists? file
287
- database_config_file = file
288
- break
289
- end
290
- end
291
- database_config_hash = File.open(database_config_file) do |file|
292
- config_hash = YAML::load(file)
293
- unless config_hash[FEED_TOOLS_ENV].nil?
294
- config_hash = config_hash[FEED_TOOLS_ENV]
295
- end
296
- config_hash
297
- end
298
- ActiveRecord::Base.configurations = database_config_hash
299
- ActiveRecord::Base.establish_connection(database_config_hash)
300
- ActiveRecord::Base.connection
300
+ require 'tidy'
301
+ if Tidy.path.nil?
302
+ # *Shrug*, just brute force it, I guess. There's a lot of places
303
+ # this thing might be hiding in, depending on platform and general
304
+ # sanity of the person who installed the thing. Most of these are
305
+ # probably unlikely, but it's not like checking unlikely locations
306
+ # hurts. Much. Especially if you actually find it.
307
+ libtidy_locations = [
308
+ '/usr/local/lib/libtidy.dylib',
309
+ '/opt/local/lib/libtidy.dylib',
310
+ '/usr/lib/libtidy.dylib',
311
+ '/usr/local/lib/tidylib.dylib',
312
+ '/opt/local/lib/tidylib.dylib',
313
+ '/usr/lib/tidylib.dylib',
314
+ '/usr/local/lib/tidy.dylib',
315
+ '/opt/local/lib/tidy.dylib',
316
+ '/usr/lib/tidy.dylib',
317
+ '/usr/local/lib/libtidy.so',
318
+ '/opt/local/lib/libtidy.so',
319
+ '/usr/lib/libtidy.so',
320
+ '/usr/local/lib/tidylib.so',
321
+ '/opt/local/lib/tidylib.so',
322
+ '/usr/lib/tidylib.so',
323
+ '/usr/local/lib/tidy.so',
324
+ '/opt/local/lib/tidy.so',
325
+ '/usr/lib/tidy.so',
326
+ 'C:\Program Files\Tidy\tidy.dll',
327
+ 'C:\Tidy\tidy.dll',
328
+ '/usr/local/lib',
329
+ '/opt/local/lib',
330
+ '/usr/lib'
331
+ ]
332
+ # We just made this thing up, but if someone sets it, we'll
333
+ # go ahead and check it
334
+ unless ENV['LIBTIDYPATH'].nil?
335
+ libtidy_locations =
336
+ libtidy_locations.reverse.push(ENV['LIBTIDYPATH'])
337
+ end
338
+ for path in libtidy_locations
339
+ if File.exists? path
340
+ if File.ftype(path) == "file"
341
+ Tidy.path = path
342
+ @tidy_enabled = true
343
+ break
344
+ elsif File.ftype(path) == "directory"
345
+ # Ok, now perhaps we're getting a bit more desperate
346
+ lib_paths =
347
+ `find #{path} -name '*tidy*' | grep '\\.\\(so\\|dylib\\)$'`
348
+ # If there's more than one, grab the first one and
349
+ # hope for the best, and if it doesn't work, then blame the
350
+ # user for not specifying more accurately.
351
+ tidy_path = lib_paths.split("\n").first
352
+ unless tidy_path.nil?
353
+ Tidy.path = tidy_path
354
+ @tidy_enabled = true
355
+ break
356
+ end
357
+ end
358
+ end
359
+ end
360
+ # Still couldn't find it.
361
+ unless @tidy_enabled
362
+ @tidy_enabled = false
363
+ end
364
+ else
365
+ @tidy_enabled = true
366
+ end
367
+ rescue LoadError
368
+ # Tidy not installed, disable features that rely on tidy.
369
+ @tidy_enabled = false
301
370
  end
302
371
  end
303
-
304
- def Feed.cache_enabled?
305
- return true
372
+ return @tidy_enabled
373
+ end
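
If you already know where libtidy lives, pointing Tidy at it up front lets this check skip the filesystem search entirely; the path below is only an example:

  require 'tidy'
  Tidy.path = "/usr/lib/libtidy.so"
  FeedTools.tidy_enabled?   # => true, without any brute-force searching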
374
+
375
+ # Turns html tidy support on or off. Be aware that setting this to true
376
+ # does not mean tidy will be enabled. It simply means that tidy will be
377
+ # enabled if it is available to be enabled.
378
+ def FeedTools.tidy_enabled=(new_tidy_enabled)
379
+ @force_tidy_enabled = new_tidy_enabled
380
+ end
381
+
382
+ # Attempts to ensure that the passed url is valid and sane. Accepts very, very ugly urls
383
+ # and makes every effort to figure out what it was supposed to be. Also translates from
384
+ # the feed: and rss: pseudo-protocols to the http: protocol.
385
+ def FeedTools.normalize_url(url)
386
+ if url.nil? || url == ""
387
+ return nil
306
388
  end
389
+ normalized_url = url
307
390
 
308
- def title
309
- return (self["title"] or "Untitled Feed")
391
+ # if a url begins with the '/' character, it only makes sense that they
392
+ # meant to be using a file:// url. Fix it for them.
393
+ if normalized_url.length > 0 && normalized_url[0..0] == "/"
394
+ normalized_url = "file://" + normalized_url
310
395
  end
311
396
 
312
- # Optional feed attribute.
313
- # If you want to use it, the database table needs to have a language field added, otherwise
314
- # it will just default to "en-US".
315
- def language
316
- begin
317
- return (self["language"] or "en-US")
318
- rescue
319
- return "en-US"
320
- end
397
+ # if a url begins with javascript:, it's quite possibly an attempt at
398
+ # doing something malicious. Let's keep that from getting anywhere,
399
+ # shall we?
400
+ if (normalized_url.downcase =~ /javascript:/) != nil
401
+ return "#"
321
402
  end
322
403
 
323
- def live?
324
- if @live
325
- return true
326
- else
327
- return false
328
- end
329
- end
404
+ # deal with all of the many ugly possibilities involved in the rss:
405
+ # and feed: pseudo-protocols (incidentally, whose crazy idea was this
406
+ # mess?)
407
+ normalized_url.gsub!(/^http:\/*(feed:\/*)?/, "http://")
408
+ normalized_url.gsub!(/^http:\/*(rss:\/*)?/, "http://")
409
+ normalized_url.gsub!(/^feed:\/*(http:\/*)?/, "http://")
410
+ normalized_url.gsub!(/^rss:\/*(http:\/*)?/, "http://")
411
+ normalized_url.gsub!(/^file:\/*/, "file:///")
412
+ normalized_url.gsub!(/^https:\/*/, "https://")
413
+ # fix (very) bad urls (usually of the user-entered sort)
414
+ normalized_url.gsub!(/^http:\/*(http:\/*)*/, "http://")
415
+ if (normalized_url =~ /^file:/) == 0
416
+ # fix bad Windows-based entries
417
+ normalized_url.gsub!(/file:\/\/\/([a-zA-Z]):/, 'file:///\1|')
330
418
 
331
- def expired?
332
- return last_updated == nil || (last_updated + time_to_live) < Time.now
419
+ # maybe this is too aggressive?
420
+ normalized_url.gsub!(/\\/, '/')
421
+ return normalized_url
422
+ else
423
+ if (normalized_url =~ /https?:\/\//) == nil
424
+ normalized_url = "http://" + normalized_url
425
+ end
426
+ if normalized_url == "http://"
427
+ return nil
428
+ end
429
+ begin
430
+ feed_uri = URI.parse(normalized_url)
431
+ if feed_uri.scheme == nil
432
+ feed_uri.scheme = "http"
433
+ end
434
+ if feed_uri.path == nil || feed_uri.path == ""
435
+ feed_uri.path = "/"
436
+ end
437
+ if (feed_uri.path =~ /^[\/]+/) == 0
438
+ feed_uri.path.gsub!(/^[\/]+/, "/")
439
+ end
440
+ return feed_uri.to_s
441
+ rescue URI::InvalidURIError
442
+ return normalized_url
443
+ end
333
444
  end
445
+ end
334
446
 
335
- # Forces this feed to expire.
336
- def expire
337
- FeedItem.delete_all("feed_id = '#{self.id}'")
338
- @feed_items_unsorted = nil
339
- self.last_updated = Time.mktime(1980)
340
- self.save
447
+ # Returns true if the parameter appears to be a valid url
448
+ def FeedTools.is_url?(url)
449
+ return false if url.nil?
450
+ begin
451
+ uri = URI.parse(url)
452
+ rescue URI::InvalidURIError
453
+ return false
341
454
  end
455
+ return true
456
+ end
342
457
 
343
- # The ammount of time in seconds between the last time the feed was updated and the next
344
- # valid time to retrieve a remote feed.
345
- def time_to_live
346
- return self['time_to_live'].nil? ? 1.hour : self['time_to_live'].hour
347
- end
458
+ # Removes all html tags from the html formatted text.
459
+ def FeedTools.strip_html(html)
460
+ # TODO: do this properly
461
+ # ======================
462
+ stripped_html = html.gsub(/<\/?[^>]+>/, "")
463
+ return stripped_html
464
+ end
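
For instance, given the regexp above:

  FeedTools.strip_html("<p>News for <b>nerds</b></p>")
  # => "News for nerds"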
348
465
 
349
- def tag_list
350
- return tags.nil? ? nil : tags[1..-2].split("|")
466
+ # Tidies up the html
467
+ def FeedTools.tidy_html(html)
468
+ if FeedTools.tidy_enabled?
469
+ is_fragment = true
470
+ if (html.strip =~ /<html>(.|\n)*<body>/) != nil ||
471
+ (html.strip =~ /<\/body>(.|\n)*<\/html>$/) != nil
472
+ is_fragment = false
473
+ end
474
+ if (html.strip =~ /<\?xml(.|\n)*\?>/) != nil
475
+ is_fragment = false
476
+ end
477
+ tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
478
+ tidy.options.output_xml = true
479
+ tidy.options.indent = false
480
+ tidy.options.wrap_attributes = true
481
+ tidy.options.logical_emphasis = true
482
+ tidy.options.doctype = "omit"
483
+ xml = tidy.clean(html)
484
+ xml
485
+ end
486
+ if is_fragment
487
+ # Tidy puts <html>...<body>[our html]</body>...</html> in.
488
+ # We don't want this.
489
+ tidy_html.strip!
490
+ tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
491
+ tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
492
+ tidy_html.strip!
493
+ end
494
+ else
495
+ tidy_html = html
351
496
  end
497
+ return tidy_html
498
+ end
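
Roughly what to expect: exact output depends on the installed tidy (with logical_emphasis on, b and i become strong and em), and the input comes back untouched when tidy is unavailable.

  FeedTools.tidy_html("some <b>bold text")
  # => "some <strong>bold text</strong>"   when tidy is available
  # => "some <b>bold text"                 when it is not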
352
499
 
353
- def tag_list=(new_tag_list)
354
- self.tags = "|" + (new_tag_list.map { |x| x.strip }).join("|") + "|"
355
- end
500
+ # Removes all dangerous html tags from the html formatted text.
501
+ # If mode is set to :escape, dangerous and unknown elements will
502
+ # be escaped. If mode is set to :strip, dangerous and unknown
503
+ # elements and all children will be removed entirely.
504
+ # Dangerous or unknown attributes are always removed.
505
+ def FeedTools.sanitize_html(html, mode=:escape)
506
+
507
+ # Lists borrowed from Mark Pilgrim's feedparser
508
+ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
509
+ 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
510
+ 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
511
+ 'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
512
+ 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
513
+ 'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
514
+ 'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
515
+ 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
516
+ 'u', 'ul', 'var']
356
517
 
357
- def tag_string
358
- return (tags.nil? ? nil : tags[1..-2]).split("|").join(", ")
359
- end
360
-
361
- def tag_string=(new_tag_string)
362
- self.tags = "|" + (new_tag_string.split(",").map { |x| x.strip }).join("|") + "|"
363
- end
518
+ acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
519
+ 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
520
+ 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
521
+ 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
522
+ 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
523
+ 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
524
+ 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
525
+ 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
526
+ 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
527
+ 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
528
+ 'type', 'usemap', 'valign', 'value', 'vspace', 'width']
364
529
 
365
- # Returns a list of the feed_items, sorted by date
366
- def feed_items
367
- begin
368
- if @feed_items_unsorted.nil?
369
- @feed_items_unsorted = feed_items_unsorted
370
- end
371
- return @feed_items_unsorted.sort do |a,b|
372
- b.time <=> a.time
373
- end
374
- rescue
375
- unless @feed_items_unsorted.nil?
376
- return @feed_items_unsorted
377
- else
378
- return feed_items_unsorted
530
+ # Stupid hack to pass this unit test:
531
+ # http://feedparser.org/tests/wellformed/rss/
532
+ # item_description_not_a_doctype.xml
533
+ html.gsub!(/<!'/, "&lt;!'")
534
+
535
+ # The closer we are to proper xhtml, the more accurate the
536
+ # sanitization will be.
537
+ html = FeedTools.tidy_html(html)
538
+
539
+ # Hackity hack. But it works, and it seems plenty fast enough.
540
+ html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
541
+
542
+ sanitize_node = lambda do |html_node|
543
+ if html_node.respond_to? :children
544
+ for child in html_node.children
545
+ if child.kind_of? REXML::Element
546
+ unless acceptable_elements.include? child.name
547
+ if mode == :strip
548
+ html_node.delete_element(child)
549
+ else
550
+ new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
551
+ html_node.insert_after(child, new_child)
552
+ html_node.delete_element(child)
553
+ end
554
+ end
555
+ for attribute in child.attributes.keys
556
+ unless acceptable_attributes.include? attribute
557
+ child.delete_attribute(attribute)
558
+ end
559
+ end
560
+ end
561
+ sanitize_node.call(child)
379
562
  end
380
563
  end
564
+ html_node
381
565
  end
566
+ sanitize_node.call(html_doc.root)
567
+ return html_doc.root.inner_xml
568
+ end
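
A hand-worked illustration of the two modes; the exact markup that comes back can vary slightly with the tidy pass above:

  dirty = "<p onclick='steal()'>Hi <script>alert(1)</script></p>"
  FeedTools.sanitize_html(dirty, :escape)
  # => "<p>Hi &lt;script&gt;alert(1)&lt;/script&gt;</p>"
  FeedTools.sanitize_html(dirty, :strip)
  # => "<p>Hi </p>"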
569
+
570
+ class Feed
571
+ include REXML
572
+ include AttributeDictionary
382
573
 
383
- # Attempts to load the feed from the remote location. Requires the url to be set.
384
- # If an etag has been set, attempts to use it to prevent unnecessary reloading of identical
385
- # content.
386
- def load_remote_feed
387
- @live = true
388
- self.last_updated = Time.now
389
- if (etag != nil)
390
- # TODO: verify that the etag code works as intended
391
- # -> may need to check what gets returned when the
392
- # etag is matched
393
- # =================================================
394
- open(url, "If-None-Match" => @etag ) do |http|
395
- etag = http.meta['etag']
396
- parse_feed(http.read)
397
- end
574
+ # Loads the feed specified by the url, pulling the data from the cache if it hasn't expired.
575
+ def Feed.open(url)
576
+ # clean up the url
577
+ url = FeedTools.normalize_url(url)
578
+
579
+ # create and load the new feed
580
+ feed = Feed.new
581
+ feed.url = url
582
+ feed.update
583
+ return feed
584
+ end
585
+
586
+ # Loads the feed from the remote url if the feed has expired from the cache or cannot be
587
+ # retrieved from the cache for some reason.
588
+ def update
589
+ if self.http_headers.nil? && !(self.cache_object.nil?) &&
590
+ !(self.cache_object.http_headers.nil?)
591
+ @http_headers = YAML.load(self.cache_object.http_headers)
592
+ end
593
+ if expired?
594
+ load_remote_feed
398
595
  else
399
- open(url) do |http|
400
- etag = http.meta['etag']
401
- parse_feed(http.read)
402
- end
596
+ @live = false
403
597
  end
404
598
  end
405
-
406
- def parse_feed_hook(feed_data)
407
- return nil
408
- end
409
-
410
- def parse_feed(feed_data)
411
- root_node = Document.new(feed_data).root
412
- metadata_node = XPath.first(root_node, "channel")
413
- if metadata_node == nil
414
- metadata_node = root_node
415
- end
416
599
 
417
- # get the feed title
418
- title = XPath.first(metadata_node, "title/text()").to_s
600
+ # Attempts to load the feed from the remote location. Requires the url
601
+ # field to be set. If an etag or the last_modified date has been set,
602
+ # attempts to use them to prevent unnecessary reloading of identical
603
+ # content.
604
+ def load_remote_feed
605
+ @live = true
606
+ if self.http_headers.nil? && !(self.cache_object.nil?) &&
607
+ !(self.cache_object.http_headers.nil?)
608
+ @http_headers = YAML.load(self.cache_object.http_headers)
609
+ end
419
610
 
420
- # is the title escaped?
421
- if XPath.first(metadata_node, "title/@mode").to_s == "escaped"
422
- title = CGI.unescapeHTML(title)
611
+ if (self.url =~ /^feed:/) == 0
612
+ # Woah, Nelly, how'd that happen? You should've already been
613
+ # corrected. So let's fix that url. And please,
614
+ # just use less crappy browsers instead of badly defined
615
+ # pseudo-protocol hacks.
616
+ self.url = FeedTools.normalize_url(self.url)
423
617
  end
424
-
425
- # get the feed link
426
- link = XPath.first(metadata_node, "link[@rel='alternate' @type='text/html']/@href").to_s
427
- if link == ""
428
- link = XPath.first(metadata_node, "link[@rel='alternate']/@href").to_s
429
- end
430
- if link == ""
431
- link = XPath.first(metadata_node, "link/@href").to_s
432
- end
433
- if link == ""
434
- link = XPath.first(metadata_node, "link/text()").to_s
435
- end
436
- if link == ""
437
- # The ordering here is somewhat incorrect, but the more correct ordering would
438
- # introduce much more serious problems, so I've chosen to go with the lesser of two
439
- # evils. (The completely correct implementation would require a vestigial 'base' method
440
- # on the Feed class to fully support CDF files. This method will support almost all CDF
441
- # files without any unnecessary methods.) But given that this only exists to support
442
- # CDF files, it's not a big deal. It's not like CDF files really exist in the wild.
443
- # (The assumption this ordering makes is that the 'base' attribute points to a valid
444
- # location, hopefully the same as the 'href' location. Chances are pretty good that this
445
- # is true.)
446
- link = XPath.first(metadata_node, "@base").to_s
447
- end
448
- if link == ""
449
- link = XPath.first(metadata_node, "@href").to_s
618
+
619
+ # Find out what method we're going to be using to obtain this feed.
620
+ uri = URI.parse(self.url)
621
+ retrieval_method = "http"
622
+ case uri.scheme
623
+ when "http"
624
+ retrieval_method = "http"
625
+ when "ftp"
626
+ retrieval_method = "ftp"
627
+ when "file"
628
+ retrieval_method = "file"
629
+ when nil
630
+ raise FeedAccessError,
631
+ "No protocol was specified in the url."
632
+ else
633
+ raise FeedAccessError,
634
+ "Cannot retrieve feed using unrecognized protocol: " + uri.scheme
450
635
  end
451
636
 
452
- # get the feed description
453
- description = XPath.first(metadata_node, "description/text()").to_s
454
- if description != ""
455
- if XPath.first(metadata_node, "description/@encoding").to_s != ""
456
- description = "[Embedded data objects are not supported.]"
457
- else
458
- description = CGI.unescapeHTML(description)
637
+ # No need for http headers unless we're actually doing http
638
+ if retrieval_method == "http"
639
+ # Set up the appropriate http headers
640
+ headers = {}
641
+ unless self.http_headers.nil?
642
+ headers["If-None-Match"] =
643
+ self.http_headers['etag'] unless self.http_headers['etag'].nil?
644
+ headers["If-Modified-Since"] =
645
+ self.http_headers['last-modified'] unless
646
+ self.http_headers['last-modified'].nil?
647
+ end
648
+ headers["User-Agent"] =
649
+ FeedTools.user_agent unless FeedTools.user_agent.nil?
650
+
651
+ # The http feed access method
652
+ def http_fetch(feed_url, http_headers, redirect_limit = 10,
653
+ response_chain = []) # :nodoc:
654
+ raise FeedAccessError, 'Redirect too deep' if redirect_limit == 0
655
+ feed_uri = nil
656
+ begin
657
+ feed_uri = URI.parse(feed_url)
658
+ rescue URI::InvalidURIError
659
+ # Uh, maybe try to fix it?
660
+ feed_uri = URI.parse(FeedTools.normalize_url(feed_url))
661
+ end
662
+
663
+ # Borrowed from open-uri:
664
+ # According to RFC2616 14.23, Host: request-header field should be
665
+ # set to an origin server.
666
+ # But net/http wrongly set a proxy server if an absolute URI is
667
+ # specified as a request URI.
668
+ # So override it here explicitly.
669
+ http_headers['Host'] = feed_uri.host
670
+ http_headers['Host'] += ":#{feed_uri.port}" if feed_uri.port
671
+
672
+ Net::HTTP.start(feed_uri.host, (feed_uri.port or 80)) do |http|
673
+ response = http.request_get(feed_uri.path, http_headers)
674
+
675
+ case response
676
+ when Net::HTTPSuccess
677
+ # We've reached the final destination, process all previous
678
+ # redirections, and see if we need to update the url.
679
+ for redirected_response in response_chain
680
+ if redirected_response.last.code.to_i == 301
681
+ self.url = redirected_response.first
682
+ else
683
+ # Jump out as soon as we hit anything that isn't a
684
+ # permanently moved redirection.
685
+ break
686
+ end
687
+ end
688
+ return response
689
+ when Net::HTTPRedirection
690
+ if response.code.to_i == 304
691
+ response.error!
692
+ else
693
+ if response['Location'].nil?
694
+ raise FeedAccessError,
695
+ "No location to redirect to supplied: " + response.code
696
+ end
697
+ response_chain << [feed_url, response]
698
+ new_location = response['location']
699
+ if response_chain.assoc(new_location) != nil
700
+ raise FeedAccessError, "Redirection loop detected."
701
+ end
702
+ # TODO: deal with stupid people using relative urls
703
+ # in Location header
704
+ # =================================================
705
+ http_fetch(new_location, http_headers,
706
+ redirect_limit - 1, response_chain)
707
+ end
708
+ else
709
+ response.error!
710
+ end
711
+ end
712
+ end
713
+
714
+ begin
715
+ @http_response = http_fetch(self.url, headers)
716
+ @http_headers = {}
717
+ self.http_response.each_header do |header|
718
+ self.http_headers[header.first.downcase] = header.last
719
+ end
720
+ self.last_retrieved = Time.now
721
+ self.xml_data = self.http_response.body
722
+ rescue FeedAccessError
723
+ @live = false
724
+ if self.xml_data.nil?
725
+ raise
726
+ end
727
+ rescue Timeout::Error
728
+ # if we time out, do nothing, it should fall back to the xml_data
729
+ # stored in the cache.
730
+ @live = false
731
+ if self.xml_data.nil?
732
+ raise
733
+ end
734
+ rescue Errno::ECONNRESET
735
+ # if the connection gets reset by peer, oh well, fall back to the
736
+ # xml_data stored in the cache
737
+ @live = false
738
+ if self.xml_data.nil?
739
+ raise
740
+ end
741
+ rescue => error
742
+ # heck, if anything at all bad happens, fall back to the xml_data
743
+ # stored in the cache.
744
+
745
+ # If we can, get the HTTPResponse...
746
+ @http_response = nil
747
+ if error.respond_to?(:each_header)
748
+ @http_response = error
749
+ end
750
+ if error.respond_to?(:response) &&
751
+ error.response.respond_to?(:each_header)
752
+ @http_response = error.response
753
+ end
754
+ if @http_response != nil
755
+ @http_headers = {}
756
+ self.http_response.each_header do |header|
757
+ self.http_headers[header.first] = header.last
758
+ end
759
+ if self.http_response.code.to_i == 304
760
+ self.last_retrieved = Time.now
761
+ end
762
+ end
763
+ @live = false
764
+ if self.xml_data.nil?
765
+ raise
766
+ end
767
+ end
768
+ elsif retrieval_method == "https"
769
+ # Not supported... yet
770
+ elsif retrieval_method == "ftp"
771
+ # Not supported... yet
772
+ # Technically, CDF feeds are supposed to be able to be accessed directly
773
+ # from an ftp server. This is silly, but we'll humor Microsoft.
774
+ #
775
+ # Eventually.
776
+ elsif retrieval_method == "file"
777
+ # Now that we've gone to all that trouble to ensure the url begins
778
+ # with 'file://', strip the 'file://' off the front of the url.
779
+ file_name = self.url.gsub(/^file:\/\//, "")
780
+ begin
781
+ open(file_name) do |file|
782
+ @http_response = nil
783
+ @http_headers = {}
784
+ self.last_retrieved = Time.now
785
+ self.xml_data = file.read
786
+ end
787
+ rescue
788
+ @live = false
789
+ # In this case, pulling from the cache is probably not going
790
+ # to help at all, and the user should probably be immediately
791
+ # apprised of the problem. Raise the exception.
792
+ raise
459
793
  end
460
794
  end
461
- if description == ""
462
- description = XPath.first(metadata_node, "tagline/text()").to_s
463
- if description != "" && XPath.first(metadata_node, "tagline/@mode").to_s == "escaped"
464
- description = CGI.unescapeHTML(description)
795
+ unless self.cache_object.nil?
796
+ begin
797
+ self.save
798
+ rescue
465
799
  end
466
800
  end
467
- if description == "" && XPath.first(metadata_node, "tagline") == nil
468
- description = XPath.first(metadata_node, "info/text()").to_s
469
- if description != "" && XPath.first(metadata_node, "info/@mode").to_s == "escaped"
470
- description = CGI.unescapeHTML(description)
801
+ end
802
+
803
+ # Returns the relevant information from an http request.
804
+ def http_response
805
+ return @http_response
806
+ end
807
+
808
+ # Returns a hash of the http headers from the response.
809
+ def http_headers
810
+ return @http_headers
811
+ end
812
+
813
+ # Returns the feed's raw xml data.
814
+ def xml_data
815
+ if @xml_data.nil?
816
+ unless self.cache_object.nil?
817
+ @xml_data = self.cache_object.xml_data
471
818
  end
472
819
  end
473
- if description == ""
474
- description = CGI.unescapeHTML(XPath.first(metadata_node, "abstract/text()").to_s)
820
+ return @xml_data
821
+ end
822
+
823
+ # Sets the feed's xml data.
824
+ def xml_data=(new_xml_data)
825
+ @xml_data = new_xml_data
826
+ unless self.cache_object.nil?
827
+ self.cache_object.xml_data = new_xml_data
475
828
  end
829
+ end
476
830
 
477
- # get the image link
478
- image_link = XPath.first(metadata_node, "image/url/text()").to_s
479
- if image_link == ""
480
- image_link = XPath.first(metadata_node, "image/@rdf:resource").to_s
831
+ # Returns a REXML Document of the xml_data
832
+ def xml
833
+ if @xml_doc.nil?
834
+ begin
835
+ @xml_doc = Document.new(xml_data)
836
+ rescue
837
+ # Something failed, attempt to repair the xml with htree.
838
+ @xml_doc = HTree.parse(xml_data).to_rexml
839
+ end
481
840
  end
482
- if image_link == ""
483
- image_link = XPath.first(metadata_node, "link[@type='image/jpeg']/@href").to_s
841
+ return @xml_doc
842
+ end
843
+
844
+ # Returns the first node within the channel_node that matches the xpath query.
845
+ def find_node(xpath)
846
+ return XPath.first(channel_node, xpath)
847
+ end
848
+
849
+ # Returns all nodes within the channel_node that match the xpath query.
850
+ def find_all_nodes(xpath)
851
+ return XPath.match(channel_node, xpath)
852
+ end
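
These are the hooks the CHANGELOG advertises for custom parsing. The feed url and results below are only illustrative (an RSS 2.0 feed, where language and item live inside channel):

  feed = FeedTools::Feed.open("http://example.com/rss20.xml")
  feed.find_node("language/text()").to_s
  # => "en-us"
  feed.find_all_nodes("item").size
  # => 10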
853
+
854
+ # Returns the root node of the feed.
855
+ def root_node
856
+ if @root_node.nil?
857
+ @root_node = xml.root
484
858
  end
485
- if image_link == ""
486
- image_link = XPath.first(metadata_node, "link[@type='image/gif']/@href").to_s
859
+ return @root_node
860
+ end
861
+
862
+ # Returns the channel node of the feed.
863
+ def channel_node
864
+ if @channel_node.nil?
865
+ @channel_node = XPath.first(root_node, "channel")
866
+ if @channel_node == nil
867
+ @channel_node = XPath.first(root_node, "feedinfo")
868
+ end
869
+ if @channel_node == nil
870
+ @channel_node = root_node
871
+ end
487
872
  end
488
- if image_link == ""
489
- image_link = XPath.first(metadata_node, "link[@type='image/png']/@href").to_s
873
+ return @channel_node
874
+ end
875
+
876
+ # The cache object that handles the feed persistence.
877
+ def cache_object
878
+ unless FeedTools.feed_cache.nil?
879
+ if @cache_object.nil?
880
+ begin
881
+ if @id != nil
882
+ @cache_object = FeedTools.feed_cache.find_by_id(@id)
883
+ elsif @url != nil
884
+ @cache_object = FeedTools.feed_cache.find_by_url(@url)
885
+ end
886
+ if @cache_object.nil?
887
+ @cache_object = FeedTools.feed_cache.new
888
+ end
889
+ rescue
890
+ end
891
+ end
490
892
  end
491
- if image_link == ""
492
- image_link = XPath.first(metadata_node, "logo[@style='image']/@href").to_s
893
+ return @cache_object
894
+ end
895
+
896
+ # Sets the cache object for this feed.
897
+ #
898
+ # This can be any object, but it must accept the following messages:
899
+ # url
900
+ # url=
901
+ # title
902
+ # title=
903
+ # link
904
+ # link=
905
+ # xml_data
906
+ # xml_data=
907
+ # etag
908
+ # etag=
909
+ # last_modified
910
+ # last_modified=
911
+ # save
912
+ def cache_object=(new_cache_object)
913
+ @cache_object = new_cache_object
914
+ end
915
+
916
+ # Returns the feed's unique id
917
+ def id
918
+ if @id.nil?
919
+ @id = XPath.first(root_node, "id/text()").to_s
920
+ if @id == ""
921
+ @id = XPath.first(root_node, "guid/text()").to_s
922
+ end
923
+ @id = nil if @id == ""
493
924
  end
494
- if image_link == ""
495
- image_link = XPath.first(metadata_node, "logo/@href").to_s
925
+ return @id
926
+ end
927
+
928
+ # Sets the feed's unique id
929
+ def id=(new_id)
930
+ @id = new_id
931
+ end
932
+
933
+ # Returns the feed url.
934
+ def url
935
+ if @url.nil? && self.xml_data != nil
936
+ @url = XPath.first(channel_node, "link[@rel='self']/@href").to_s
937
+ @url = nil if @url == ""
496
938
  end
939
+ return @url
940
+ end
497
941
 
498
- # get the feed time to live (expressed in hours)
499
- feed_time_to_live = nil
500
- update_frequency = XPath.first(metadata_node, "syn:updateFrequency/text()").to_s
501
- if update_frequency != ""
502
- update_period = XPath.first(metadata_node, "syn:updatePeriod/text()").to_s
503
- if update_period == "daily"
504
- feed_time_to_live = update_frequency.to_i * 24
505
- elsif update_period == "weekly"
506
- feed_time_to_live = update_frequency.to_i * 24 * 7
507
- elsif update_period == "monthly"
508
- feed_time_to_live = update_frequency.to_i * 24 * 30
509
- elsif update_period == "yearly"
510
- feed_time_to_live = update_frequency.to_i * 24 * 365
942
+ # Sets the feed url and prepares the cache_object if necessary.
943
+ def url=(new_url)
944
+ @url = FeedTools.normalize_url(new_url)
945
+ self.cache_object.url = new_url unless self.cache_object.nil?
946
+ end
947
+
948
+ # Returns the feed title
949
+ def title
950
+ if @title.nil?
951
+ if XPath.first(channel_node, "title/@type").to_s == "xhtml" ||
952
+ XPath.first(channel_node, "title/@mode").to_s == "xhtml"
953
+ @title = XPath.first(channel_node, "title").inner_xml
954
+ elsif XPath.first(channel_node, "title/@type").to_s == "escaped" ||
955
+ XPath.first(channel_node, "title/@mode").to_s == "escaped"
956
+ @title = CGI.unescapeHTML(
957
+ XPath.first(channel_node, "title/text()").to_s)
511
958
  else
512
- # hourly
513
- feed_time_to_live = update_frequency.to_i
959
+ @title = CGI.unescapeHTML(
960
+ XPath.first(channel_node, "title/text()").to_s)
514
961
  end
515
- end
516
- if feed_time_to_live == nil
517
- # expressed in minutes
518
- update_frequency = XPath.first(metadata_node, "ttl/text()").to_s
519
- if update_frequency != ""
520
- feed_time_to_live = (update_frequency.to_i / 60)
962
+ unless @title.nil?
963
+ @title = CGI.unescapeHTML(FeedTools.sanitize_html(@title, :strip))
521
964
  end
965
+ if @title != "" && !(@title.nil?)
966
+ @title = FeedTools.strip_html(@title).strip
967
+ end
968
+ @title.gsub!(/\n/, " ")
969
+ @title = nil if @title == ""
970
+ self.cache_object.title = @title unless self.cache_object.nil?
522
971
  end
523
-
524
- # TODO: handle time_to_live for CDF files
525
- # =======================================
526
-
527
- # get the feed items
528
- items = XPath.match(root_node, "item")
529
- if items == nil || items == []
530
- items = XPath.match(metadata_node, "item")
972
+ return @title
973
+ end
974
+
975
+ # Sets the feed title
976
+ def title=(new_title)
977
+ @title = new_title
978
+ self.cache_object.title = new_title unless self.cache_object.nil?
979
+ end
980
+
981
+ # Returns the feed description
982
+ def description
983
+ if @description.nil?
984
+ # get the feed description from the xml document
985
+ @description = XPath.first(channel_node, "description/text()").to_s
986
+ if @description != ""
987
+ if XPath.first(channel_node, "description/@encoding").to_s != ""
988
+ @description = "[Embedded data objects are not supported.]"
989
+ else
990
+ @description = CGI.unescapeHTML(description)
991
+ end
992
+ end
993
+ if @description == ""
994
+ @description = XPath.first(channel_node, "subtitle/text()").to_s
995
+ if @description != "" &&
996
+ XPath.first(channel_node, "subtitle/@mode").to_s == "escaped"
997
+ @description = CGI.unescapeHTML(description)
998
+ end
999
+ end
1000
+ if @description == ""
1001
+ @description = XPath.first(channel_node, "tagline/text()").to_s
1002
+ if @description != "" &&
1003
+ XPath.first(channel_node, "tagline/@mode").to_s == "escaped"
1004
+ @description = CGI.unescapeHTML(description)
1005
+ end
1006
+ end
1007
+ if @description == "" && XPath.first(channel_node, "tagline") == nil
1008
+ @description = XPath.first(channel_node, "info/text()").to_s
1009
+ if @description != "" &&
1010
+ XPath.first(channel_node, "info/@mode").to_s == "escaped"
1011
+ @description = CGI.unescapeHTML(description)
1012
+ end
1013
+ end
1014
+ if @description == ""
1015
+ @description = CGI.unescapeHTML(
1016
+ XPath.first(channel_node, "abstract/text()").to_s)
1017
+ end
1018
+ if @description == ""
1019
+ @description = CGI.unescapeHTML(
1020
+ XPath.first(channel_node, "summary/text()").to_s)
1021
+ end
1022
+ if @description == ""
1023
+ # I don't think this is valid for anyone to do, but this is probably
1024
+ # what they meant if they do it.
1025
+ @description = CGI.unescapeHTML(
1026
+ XPath.first(channel_node, "content:encoded/text()").to_s)
1027
+ if @description != ""
1028
+ @bozo = true
1029
+ end
1030
+ end
1031
+ if @description == ""
1032
+ begin
1033
+ @description = XPath.first(channel_node, "description").inner_xml
1034
+ rescue
1035
+ end
1036
+ end
1037
+ if @description == ""
1038
+ @description = self.itunes_summary
1039
+ @description = "" if @description.nil?
1040
+ end
1041
+ if @description == ""
1042
+ @description = self.itunes_subtitle
1043
+ @description = "" if @description.nil?
1044
+ end
1045
+
1046
+ @description =
1047
+ FeedTools.sanitize_html(@description) unless @description.nil?
1048
+ # If it started with a bunch of divs, hack them right off. We can put
1049
+ # them back later if they're needed.
1050
+ @description.gsub!(/^(<div[^>]*>)*/, "")
1051
+ @description.gsub!(/(<\/div>)*$/, "")
1052
+
1053
+ @description.gsub!(/\n/, " ") if @description.size < 80
1054
+ @description = @description.strip unless @description.nil?
1055
+ @description = nil if @description == ""
1056
+ end
1057
+ return @description
1058
+ end
1059
+
1060
+ # Sets the feed description
1061
+ def description=(new_description)
1062
+ @description = new_description
1063
+ end
1064
+
1065
+ # Returns the contents of the itunes:summary element
1066
+ def itunes_summary
1067
+ if @itunes_summary.nil?
1068
+ @itunes_summary = CGI.unescapeHTML(XPath.first(root_node,
1069
+ "itunes:summary/text()").to_s)
1070
+ if @itunes_summary == ""
1071
+ @itunes_summary = nil
1072
+ end
1073
+ @itunes_summary =
1074
+ FeedTools.sanitize_html(@itunes_summary) unless @itunes_summary.nil?
1075
+ end
1076
+ return @itunes_summary
1077
+ end
1078
+
1079
+ # Sets the contents of the itunes:summary element
1080
+ def itunes_summary=(new_itunes_summary)
1081
+ @itunes_summary = new_itunes_summary
1082
+ end
1083
+
1084
+ # Returns the contents of the itunes:subtitle element
1085
+ def itunes_subtitle
1086
+ if @itunes_subtitle.nil?
1087
+ @itunes_subtitle = CGI.unescapeHTML(XPath.first(root_node,
1088
+ "itunes:subtitle/text()").to_s)
1089
+ if @itunes_subtitle == ""
1090
+ @itunes_subtitle = nil
1091
+ end
1092
+ unless @itunes_subtitle.nil?
1093
+ @itunes_subtitle = FeedTools.sanitize_html(@itunes_subtitle)
1094
+ end
1095
+ end
1096
+ return @itunes_subtitle
1097
+ end
1098
+
1099
+ # Sets the contents of the itunes:subtitle element
1100
+ def itunes_subtitle=(new_itunes_subtitle)
1101
+ @itunes_subtitle = new_itunes_subtitle
1102
+ end
1103
+
1104
+ # Returns the feed link
1105
+ def link
1106
+ if @link.nil?
1107
+ # get the feed link from the xml document
1108
+ @link = XPath.first(channel_node, "link[@rel='alternate' @type='text/html']/@href").to_s
1109
+ if @link == ""
1110
+ @link = XPath.first(channel_node, "link[@rel='alternate']/@href").to_s
1111
+ end
1112
+ if @link == ""
1113
+ @link = XPath.first(channel_node, "link/@href").to_s
1114
+ end
1115
+ if @link == ""
1116
+ @link = XPath.first(channel_node, "link/text()").to_s
1117
+ end
1118
+ if @link == ""
1119
+ @link = XPath.first(channel_node, "@href").to_s
1120
+ end
1121
+ if @link == ""
1122
+ if FeedTools.is_url? self.guid
1123
+ @link = self.guid
1124
+ end
1125
+ end
1126
+ if @link == ""
1127
+ # Technically, we shouldn't use the base attribute for this, but if the href attribute
1128
+ # is missing, it's already a given that we're looking at a messed up CDF file. We can
1129
+ # always pray it's correct.
1130
+ @link = XPath.first(channel_node, "@base").to_s
1131
+ end
1132
+ @link = FeedTools.normalize_url(@link)
1133
+ unless self.cache_object.nil?
1134
+ self.cache_object.link = @link
1135
+ end
1136
+ end
1137
+ return @link
1138
+ end
1139
+
1140
+ # Sets the feed link
1141
+ def link=(new_link)
1142
+ @link = new_link
1143
+ unless self.cache_object.nil?
1144
+ self.cache_object.link = new_link
1145
+ end
1146
+ end
1147
+
1148
+ # Returns the feed image link
1149
+ def image_link
1150
+ if @image_link.nil?
1151
+ # get the feed image link from the xml document
1152
+ @image_link = XPath.first(channel_node, "image/url/text()").to_s
1153
+ if @image_link == ""
1154
+ @image_link = XPath.first(channel_node, "image/@rdf:resource").to_s
1155
+ end
1156
+ if @image_link == ""
1157
+ @image_link = XPath.first(channel_node, "link[@type='image/jpeg']/@href").to_s
1158
+ end
1159
+ if @image_link == ""
1160
+ @image_link = XPath.first(channel_node, "link[@type='image/gif']/@href").to_s
1161
+ end
1162
+ if @image_link == ""
1163
+ @image_link = XPath.first(channel_node, "link[@type='image/png']/@href").to_s
1164
+ end
1165
+ if @image_link == ""
1166
+ @image_link = XPath.first(channel_node, "logo[@style='image']/@href").to_s
1167
+ end
1168
+ if @image_link == ""
1169
+ @image_link = XPath.first(channel_node, "logo/@href").to_s
1170
+ end
1171
+ @image_link = FeedTools.normalize_url(@image_link)
1172
+ end
1173
+ return @image_link
1174
+ end
1175
+
1176
+ # Sets the feed image link
1177
+ def image_link=(new_image_link)
1178
+ @image_link = new_image_link
1179
+ end
1180
+
1181
+ # Returns the url to the icon file for this feed.
1182
+ #
1183
+ # This method uses the url from the link field in order to avoid grabbing
1184
+ # the favicon for services like feedburner.
1185
+ def icon_link
1186
+ if @icon_link.nil?
1187
+ @icon_link = XPath.first(channel_node,
1188
+ "link[@rel='icon']/@href").to_s
1189
+ if @icon_link == ""
1190
+ @icon_link = XPath.first(channel_node,
1191
+ "link[@rel='shortcut icon']/@href").to_s
1192
+ end
1193
+ if @icon_link == ""
1194
+ @icon_link = XPath.first(channel_node,
1195
+ "link[@type='image/x-icon']/@href").to_s
1196
+ end
1197
+ if @icon_link == ""
1198
+ @icon_link = XPath.first(channel_node,
1199
+ "icon/@href").to_s
1200
+ end
1201
+ if @icon_link == ""
1202
+ @icon_link = XPath.first(channel_node,
1203
+ "icon/text()").to_s
1204
+ end
1205
+ if @icon_link == ""
1206
+ link_uri = URI.parse(FeedTools.normalize_url(self.link))
1207
+ @icon_link =
1208
+ link_uri.scheme + "://" + link_uri.host + "/favicon.ico"
1209
+ end
1210
+ end
1211
+ return @icon_link
1212
+ end
1213
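# Editor's note -- usage sketch, not part of the gem's diff. icon_link prefers an
# icon declared inside the feed itself and only then falls back to the favicon of
# the site referenced by the link field (so feedburner-hosted feeds keep their own
# site's icon). Assuming `feed` is a parsed Feed object:
#
#   feed.icon_link   # e.g. "http://example.com/favicon.ico" when no icon element exists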
+
1214
+ # Returns the number of seconds before the feed should expire
1215
+ def time_to_live
1216
+ if @time_to_live.nil?
1217
+ # get the feed time to live from the xml document
1218
+ update_frequency = XPath.first(channel_node, "syn:updateFrequency/text()").to_s
1219
+ if update_frequency != ""
1220
+ update_period = XPath.first(channel_node, "syn:updatePeriod/text()").to_s
1221
+ if update_period == "daily"
1222
+ @time_to_live = update_frequency.to_i * 24
1223
+ elsif update_period == "weekly"
1224
+ @time_to_live = update_frequency.to_i * 24 * 7
1225
+ elsif update_period == "monthly"
1226
+ @time_to_live = update_frequency.to_i * 24 * 30
1227
+ elsif update_period == "yearly"
1228
+ @time_to_live = update_frequency.to_i * 24 * 365
1229
+ else
1230
+ # hourly
1231
+ @time_to_live = update_frequency.to_i
1232
+ end
1233
+ end
1234
+ end
1235
+ if @time_to_live.nil?
1236
+ # expressed in minutes
1237
+ update_frequency = XPath.first(channel_node, "ttl/text()").to_s
1238
+ if update_frequency != ""
1239
+ @time_to_live = (update_frequency.to_i / 60)
1240
+ end
1241
+ end
1242
+ if @time_to_live.nil?
1243
+ @time_to_live = 0
1244
+ update_frequency_days = XPath.first(channel_node, "schedule/intervaltime/@days").to_s
1245
+ update_frequency_hours = XPath.first(channel_node, "schedule/intervaltime/@hour").to_s
1246
+ update_frequency_minutes = XPath.first(channel_node, "schedule/intervaltime/@min").to_s
1247
+ update_frequency_seconds = XPath.first(channel_node, "schedule/intervaltime/@sec").to_s
1248
+ if update_frequency_days != ""
1249
+ @time_to_live = @time_to_live + update_frequency_days.to_i * 24
1250
+ end
1251
+ if update_frequency_hours != ""
1252
+ @time_to_live = @time_to_live + update_frequency_hours.to_i * 1
1253
+ end
1254
+ if update_frequency_minutes != ""
1255
+ @time_to_live = @time_to_live + update_frequency_minutes.to_i / 60
1256
+ end
1257
+ if update_frequency_seconds != ""
1258
+ @time_to_live = @time_to_live + update_frequency_seconds.to_i / 3600
1259
+ end
1260
+ if @time_to_live == 0
1261
+ @time_to_live = nil
1262
+ end
1263
+ end
1264
+ if @time_to_live.nil? || @time_to_live == 0
1265
+ # Default to one hour
1266
+ @time_to_live = 1
1267
+ end
1268
+ @time_to_live = @time_to_live.round
1269
+ return @time_to_live.hour
1270
+ end
1271
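# Editor's note -- illustrative sketch, not part of the gem's diff. The getter above
# normalizes three TTL conventions (syn:updatePeriod/updateFrequency, the RSS <ttl>
# element in minutes, and CDF schedule/intervaltime) into whole hours, then returns
# seconds via the Numeric#hour extension that ships with ActiveRecord. For example,
# <ttl>90</ttl> becomes (90 / 60) = 1 hour, i.e. 3600 seconds. The writer below
# likewise expects seconds:
#
#   feed.time_to_live          # => 3600
#   feed.time_to_live = 7200   # cache this feed for two hours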
+
1272
+ # Sets the feed time to live
1273
+ def time_to_live=(new_time_to_live)
1274
+ @time_to_live = (new_time_to_live / 3600).round
1275
+ @time_to_live = 1 if @time_to_live < 1
1276
+ end
1277
+
1278
+ # Returns the feed language
1279
+ def language
1280
+ if @language.nil?
1281
+ @language = XPath.first(channel_node, "language/text()").to_s
1282
+ if @language == ""
1283
+ @language = XPath.first(channel_node, "dc:language/text()").to_s
1284
+ end
1285
+ if @language == ""
1286
+ @language = XPath.first(channel_node, "xml:lang/text()").to_s
1287
+ end
1288
+ if @language == ""
1289
+ @language = XPath.first(root_node, "xml:lang/text()").to_s
1290
+ end
1291
+ if @language == ""
1292
+ @language = "en-us"
1293
+ end
1294
+ @language = @language.downcase
531
1295
  end
532
- if items == nil || items == []
533
- items = XPath.match(metadata_node, "entry")
1296
+ return @language
1297
+ end
1298
+
1299
+ # Sets the feed language
1300
+ def language=(new_language)
1301
+ @language = new_language
1302
+ end
1303
+
1304
+ # Returns true if this feed contains explicit material.
1305
+ def explicit
1306
+ if @explicit.nil?
1307
+ if XPath.first(channel_node,
1308
+ "media:adult/text()").to_s.downcase == "true" ||
1309
+ XPath.first(channel_node,
1310
+ "itunes:explicit/text()").to_s.downcase == "yes" ||
1311
+ XPath.first(channel_node,
1312
+ "itunes:explicit/text()").to_s.downcase == "true"
1313
+ @explicit = true
1314
+ else
1315
+ @explicit = false
1316
+ end
1317
+ end
1318
+ return @explicit
1319
+ end
1320
+
1321
+ # Sets whether or not the feed contains explicit material
1322
+ def explicit=(new_explicit)
1323
+ @explicit = (new_explicit ? true : false)
1324
+ end
1325
+
1326
+ # Returns the feed items
1327
+ def items
1328
+ if @items.nil?
1329
+ raw_items = XPath.match(root_node, "item")
1330
+ if raw_items == nil || raw_items == []
1331
+ raw_items = XPath.match(channel_node, "item")
1332
+ end
1333
+ if raw_items == nil || raw_items == []
1334
+ raw_items = XPath.match(channel_node, "entry")
1335
+ end
1336
+
1337
+ # create the individual feed items
1338
+ @items = []
1339
+ if raw_items != nil
1340
+ for item_node in raw_items
1341
+ new_item = FeedItem.new
1342
+ new_item.xml_data = item_node.to_s
1343
+ new_item.feed = self
1344
+ @items << new_item
1345
+ end
1346
+ end
534
1347
  end
535
1348
 
536
- # set all of the properties
537
- if title != ""
538
- self.title = title
539
- else
540
- self.title = nil
1349
+ # Sort the items
1350
+ @items = @items.sort do |a,b|
1351
+ (b.time or Time.mktime(1970)) <=> (a.time or Time.mktime(1970))
541
1352
  end
542
- if link != ""
543
- self.link = link
544
- else
545
- self.link = nil
1353
+ return @items
1354
+ end
1355
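# Editor's note -- usage sketch, not part of the gem's diff. Items are parsed lazily
# from the stored xml and returned newest-first; entries without a usable timestamp
# sort as 1970. The :entries alias declared further down points at this same method.
# Assuming `feed` is a parsed Feed object:
#
#   feed.items.each do |item|
#     puts "#{item.time} #{item.title}"
#   end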
+
1356
+ # The time that the feed was last requested from the remote server. Nil if it has
1357
+ # never been pulled, or if it was created from scratch.
1358
+ def last_retrieved
1359
+ unless self.cache_object.nil?
1360
+ @last_retrieved = self.cache_object.last_retrieved
546
1361
  end
547
- if description != ""
548
- self.description = description
549
- else
550
- self.description = nil
1362
+ return @last_retrieved
1363
+ end
1364
+
1365
+ # Sets the time that the feed was last retrieved from the remote server.
1366
+ def last_retrieved=(new_last_retrieved)
1367
+ @last_retrieved = new_last_retrieved
1368
+ unless self.cache_object.nil?
1369
+ self.cache_object.last_retrieved = new_last_retrieved
551
1370
  end
552
- if image_link != ""
553
- self.image_link = image_link
554
- else
555
- self.image_link = nil
1371
+ end
1372
+
1373
+ # True if this feed contains audio content enclosures
1374
+ def podcast?
1375
+ podcast = false
1376
+ self.items.each do |item|
1377
+ item.enclosures.each do |enclosure|
1378
+ podcast = true if enclosure.audio?
1379
+ end
556
1380
  end
557
- if feed_time_to_live != nil
558
- self.time_to_live = feed_time_to_live
1381
+ return podcast
1382
+ end
1383
+
1384
+ # True if this feed contains video content enclosures
1385
+ def vidlog?
1386
+ vidlog = false
1387
+ self.items.each do |item|
1388
+ item.enclosures.each do |enclosure|
1389
+ vidlog = true if enclosure.video?
1390
+ end
1391
+ end
1392
+ return vidlog
1393
+ end
1394
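# Editor's note -- usage sketch, not part of the gem's diff. podcast? and vidlog?
# simply scan every item's enclosures for audio or video content, so they can be
# used to filter a collection of already-parsed feeds. Here `feeds` is assumed to
# be an Array of Feed objects:
#
#   podcasts = feeds.select { |feed| feed.podcast? }
#   vidlogs  = feeds.select { |feed| feed.vidlog? }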
+
1395
+ # True if the feed was not last retrieved from the cache.
1396
+ def live?
1397
+ return @live
1398
+ end
1399
+
1400
+ # True if the feed has expired and must be reacquired from the remote server.
1401
+ def expired?
1402
+ return self.last_retrieved == nil || (self.last_retrieved + self.time_to_live) < Time.now
1403
+ end
1404
+
1405
+ # Forces this feed to expire.
1406
+ def expire
1407
+ self.last_retrieved = Time.mktime(1970)
1408
+ self.save
1409
+ end
1410
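# Editor's note -- usage sketch, not part of the gem's diff. A feed counts as
# expired once last_retrieved plus its time to live lies in the past; expire simply
# rewinds last_retrieved to 1970 and saves, so the next retrieval goes back to the
# remote server instead of the cache:
#
#   feed.expire       # force a refresh on the next retrieval
#   feed.expired?     # => true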
+
1411
+ # A hook method that is called during the feed generation process. Overriding this method
1412
+ # will enable additional content to be inserted into the feed.
1413
+ def build_xml_hook(feed_type, version, xml_builder)
1414
+ return nil
1415
+ end
1416
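# Editor's note -- illustrative sketch, not part of the gem's diff. Because
# build_xml_hook receives the Builder object used for output, a (hypothetical)
# subclass can append extra elements to the generated feed:
#
#   class MyFeed < Feed
#     def build_xml_hook(feed_type, version, xml_builder)
#       xml_builder.tag!("dc:publisher", "Example Publisher") if feed_type == "rss"
#     end
#   end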
+
1417
+ # Generates xml based on the content of the feed
1418
+ def build_xml(feed_type="rss", version=0.0, xml_builder=Builder::XmlMarkup.new(:indent => 2))
1419
+ if feed_type == "rss" && version == 0.0
1420
+ version = 1.0
1421
+ elsif feed_type == "atom" && version == 0.0
1422
+ version = 0.3
1423
+ end
1424
+ if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1)
1425
+ # RDF-based rss format
1426
+ return xml_builder.tag!("rdf:RDF") do
1427
+ xml_builder.channel("rdf:about" => CGI.escapeHTML(link)) do
1428
+ unless title.nil? || title == ""
1429
+ xml_builder.title(title)
1430
+ else
1431
+ xml_builder.title
1432
+ end
1433
+ unless link.nil? || link == ""
1434
+ xml_builder.link(link)
1435
+ else
1436
+ xml_builder.link
1437
+ end
1438
+ unless image_link.nil? || image_link == ""
1439
+ xml_builder.image("rdf:resource" => CGI.escapeHTML(image_link))
1440
+ end
1441
+ unless description.nil? || description == ""
1442
+ xml_builder.description(description)
1443
+ else
1444
+ xml_builder.description
1445
+ end
1446
+ unless language.nil? || language == ""
1447
+ xml_builder.tag!("dc:language", language)
1448
+ end
1449
+ xml_builder.tag!("syn:updatePeriod", "hourly")
1450
+ xml_builder.tag!("syn:updateFrequency", (time_to_live / 1.hour).to_s)
1451
+ xml_builder.tag!("syn:updateBase", Time.mktime(1970).iso8601)
1452
+ xml_builder.items do
1453
+ xml_builder.tag!("rdf:Seq") do
1454
+ unless items.nil?
1455
+ for item in items
1456
+ if item.link.nil?
1457
+ raise "Cannot generate an rdf-based feed with a nil item link field."
1458
+ end
1459
+ xml_builder.tag!("rdf:li", "rdf:resource" => CGI.escapeHTML(item.link))
1460
+ end
1461
+ end
1462
+ end
1463
+ end
1464
+ build_xml_hook(feed_type, version, xml_builder)
1465
+ end
1466
+ unless image_link.nil? || image_link == ""
1467
+ xml_builder.image("rdf:about" => CGI.escapeHTML(image_link)) do
1468
+ unless title.nil? || title == ""
1469
+ xml_builder.title(title)
1470
+ else
1471
+ xml_builder.title
1472
+ end
1473
+ unless image_link.nil? || image_link == ""
1474
+ xml_builder.url(image_link)
1475
+ end
1476
+ unless link.nil? || link == ""
1477
+ xml_builder.link(link)
1478
+ else
1479
+ xml_builder.link
1480
+ end
1481
+ end
1482
+ end
1483
+ unless items.nil?
1484
+ for item in items
1485
+ item.build_xml(feed_type, version, xml_builder)
1486
+ end
1487
+ end
1488
+ end
1489
+ elsif feed_type == "rss"
1490
+ # normal rss format
1491
+ return xml_builder.rss("version" => version.to_s) do
1492
+ unless title.nil? || title == ""
1493
+ xml_builder.title(title)
1494
+ end
1495
+ unless link.nil? || link == ""
1496
+ xml_builder.link(link)
1497
+ end
1498
+ unless description.nil? || description == ""
1499
+ xml_builder.description(description)
1500
+ end
1501
+ xml_builder.ttl((time_to_live / 1.minute).to_s)
1502
+ xml_builder.generator("http://www.sporkmonger.com/projects/feedtools")
1503
+ build_xml_hook(feed_type, version, xml_builder)
1504
+ unless items.nil?
1505
+ for item in items
1506
+ item.build_xml(feed_type, version, xml_builder)
1507
+ end
1508
+ end
1509
+ end
1510
+ elsif feed_type == "atom"
1511
+ # normal atom format
1512
+ return xml_builder.feed("xmlns" => "http://purl.org/atom/ns#",
1513
+ "version" => version.to_s,
1514
+ "xml:lang" => language) do
1515
+ unless title.nil? || title == ""
1516
+ xml_builder.title(title,
1517
+ "mode" => "escaped",
1518
+ "type" => "text/html")
1519
+ end
1520
+ unless link.nil? || link == ""
1521
+ xml_builder.link("href" => link,
1522
+ "rel" => "alternate",
1523
+ "type" => "text/html",
1524
+ "title" => title)
1525
+ end
1526
+ unless description.nil? || description == ""
1527
+ xml_builder.tagline(description,
1528
+ "mode" => "escaped",
1529
+ "type" => "text/html")
1530
+ end
1531
+ xml_builder.generator("FeedTools",
1532
+ "url" => "http://www.sporkmonger.com/projects/feedtools")
1533
+ build_xml_hook(feed_type, version, xml_builder)
1534
+ unless items.nil?
1535
+ for item in items
1536
+ item.build_xml(feed_type, version, xml_builder)
1537
+ end
1538
+ end
1539
+ end
1540
+ end
1541
+ end
1542
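# Editor's note -- usage sketch, not part of the gem's diff. With no arguments the
# method emits RDF-based RSS 1.0; passing a feed type and version selects one of the
# other branches above, and the Builder object hands back the generated markup as a
# String:
#
#   rss_xml  = feed.build_xml("rss", 2.0)
#   atom_xml = feed.build_xml("atom", 0.3)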
+
1543
+ # Persists the current feed state to the cache.
1544
+ def save
1545
+ if FeedTools.feed_cache.nil?
1546
+ raise "Caching is currently disabled. Cannot save to cache."
1547
+ elsif self.url.nil?
1548
+ raise "The url field must be set to save to the cache."
1549
+ elsif self.xml_data.nil?
1550
+ raise "The xml_data field must be set to save to the cache."
1551
+ elsif self.cache_object.nil?
1552
+ raise "The cache_object is currently nil. Cannot save to cache."
559
1553
  else
560
- self.time_to_live = nil
1554
+ self.cache_object.url = self.url
1555
+ self.cache_object.title = self.title
1556
+ self.cache_object.link = self.link
1557
+ self.cache_object.xml_data = self.xml_data
1558
+ unless self.http_response.nil?
1559
+ self.cache_object.http_headers = self.http_headers.to_yaml
1560
+ end
1561
+ self.cache_object.last_retrieved = self.last_retrieved
1562
+ self.cache_object.save
1563
+ end
1564
+ end
1565
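# Editor's note -- usage sketch, not part of the gem's diff. save only succeeds when
# a cache backend is configured and the feed carries a url, raw xml_data, and a
# cache_object; otherwise it raises, so callers that cannot guarantee a cache may
# want to guard the call:
#
#   feed.save unless FeedTools.feed_cache.nil? || feed.url.nil? || feed.xml_data.nil?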
+
1566
+ alias_method :tagline, :description
1567
+ alias_method :tagline=, :description=
1568
+ alias_method :subtitle, :description
1569
+ alias_method :subtitle=, :description=
1570
+ alias_method :abstract, :description
1571
+ alias_method :abstract=, :description=
1572
+ alias_method :content, :description
1573
+ alias_method :content=, :description=
1574
+ alias_method :ttl, :time_to_live
1575
+ alias_method :ttl=, :time_to_live=
1576
+ alias_method :guid, :id
1577
+ alias_method :guid=, :id=
1578
+ alias_method :entries, :items
1579
+
1580
+ # passes missing methods to the cache_object
1581
+ def method_missing(msg, *params)
1582
+ if self.cache_object.nil?
1583
+ raise NoMethodError, "Invalid method #{msg.to_s}"
1584
+ end
1585
+ return self.cache_object.send(msg, *params)
1586
+ end
1587
+
1588
+ # passes missing methods to the FeedTools.feed_cache
1589
+ def Feed.method_missing(msg, *params)
1590
+ if FeedTools.feed_cache.nil?
1591
+ raise NoMethodError, "Invalid method Feed.#{msg.to_s}"
1592
+ end
1593
+ result = FeedTools.feed_cache.send(msg, *params)
1594
+ if result.kind_of? FeedTools.feed_cache
1595
+ result = Feed.open(result.url)
1596
+ end
1597
+ return result
1598
+ end
1599
+ end
1600
+
1601
+ class FeedItem
1602
+ include REXML
1603
+ include AttributeDictionary
1604
+
1605
+ # This class stores information about a feed item's file enclosures.
1606
+ class Enclosure
1607
+ include AttributeDictionary
1608
+
1609
+ # The url for the enclosure
1610
+ attr_accessor :url
1611
+ # The MIME type of the file referenced by the enclosure
1612
+ attr_accessor :type
1613
+ # The size of the file referenced by the enclosure
1614
+ attr_accessor :file_size
1615
+ # The total play time of the file referenced by the enclosure
1616
+ attr_accessor :duration
1617
+ # The height in pixels of the enclosed media
1618
+ attr_accessor :height
1619
+ # The width in pixels of the enclosed media
1620
+ attr_accessor :width
1621
+ # The bitrate of the enclosed media
1622
+ attr_accessor :bitrate
1623
+ # The framerate of the enclosed media
1624
+ attr_accessor :framerate
1625
+ # The thumbnail for this enclosure
1626
+ attr_accessor :thumbnail
1627
+ # The categories for this enclosure
1628
+ attr_accessor :categories
1629
+ # A hash of the enclosed file
1630
+ attr_accessor :hash
1631
+ # A website containing some kind of media player instead of a direct
1632
+ # link to the media file.
1633
+ attr_accessor :player
1634
+ # A list of credits for the enclosed media
1635
+ attr_accessor :credits
1636
+ # A text rendition of the enclosed media
1637
+ attr_accessor :text
1638
+ # A list of alternate versions of the enclosed media file
1639
+ attr_accessor :versions
1640
+ # The default version of the enclosed media file
1641
+ attr_accessor :default_version
1642
+
1643
+ # Returns true if this is the default enclosure
1644
+ def is_default?
1645
+ return @is_default
1646
+ end
1647
+
1648
+ # Sets whether this is the default enclosure for the media group
1649
+ def is_default=(new_is_default)
1650
+ @is_default = new_is_default
561
1651
  end
562
1652
 
563
- parse_feed_hook(feed_data)
564
- if Feed.cache_enabled?
565
- save
1653
+ # Returns true if the enclosure contains explicit material
1654
+ def explicit?
1655
+ return @explicit
1656
+ end
1657
+
1658
+ # Sets the explicit attribute on the enclosure
1659
+ def explicit=(new_explicit)
1660
+ @explicit = new_explicit
1661
+ end
1662
+
1663
+ # Determines if the object is a sample, or the full version of the
1664
+ # object, or if it is a stream.
1665
+ # Possible values are 'sample', 'full', 'nonstop'.
1666
+ def expression
1667
+ return @expression
1668
+ end
1669
+
1670
+ # Sets the expression attribute on the enclosure.
1671
+ # Allowed values are 'sample', 'full', 'nonstop'.
1672
+ def expression=(new_expression)
1673
+ unless ['sample', 'full', 'nonstop'].include? new_expression.downcase
1674
+ raise ArgumentError,
1675
+ "Permitted values are 'sample', 'full', 'nonstop'."
1676
+ end
1677
+ @expression = new_expression.downcase
566
1678
  end
567
1679
 
568
- # check and make sure we don't have any cached feed_items with a nil link
569
- # if we do, we need to start from scratch to avoid duplicates
570
- for item_link in feed_items.map { |item| item.link }
571
- if item_link.nil?
572
- FeedItem.delete_all("feed_id = '#{self.id}'")
573
- break
1680
+ # Returns true if this enclosure contains audio content
1681
+ def audio?
1682
+ unless self.type.nil?
1683
+ return true if (self.type =~ /^audio/) != nil
1684
+ end
1685
+ # TODO: create a more complete list
1686
+ # =================================
1687
+ audio_extensions = ['mp3', 'm4a', 'm4p', 'wav', 'ogg', 'wma']
1688
+ audio_extensions.each do |extension|
1689
+ if (url =~ /#{extension}$/) != nil
1690
+ return true
1691
+ end
574
1692
  end
1693
+ return false
575
1694
  end
576
1695
 
577
- # parse the feed items
578
- @feed_items_unsorted = []
579
- if items != nil
580
- for item_node in items
581
- @feed_items_unsorted << handle_feed_item(item_node.to_s)
1696
+ # Returns true if this enclosure contains video content
1697
+ def video?
1698
+ unless self.type.nil?
1699
+ return true if (self.type =~ /^video/) != nil
1700
+ return true if self.type == "image/mov"
1701
+ end
1702
+ # TODO: create a more complete list
1703
+ # =================================
1704
+ video_extensions = ['mov', 'mp4', 'avi', 'wmv', 'asf']
1705
+ video_extensions.each do |extension|
1706
+ if (url =~ /#{extension}$/) != nil
1707
+ return true
1708
+ end
582
1709
  end
1710
+ return false
583
1711
  end
584
- return self
585
1712
  end
586
-
587
- # Locates the feed item in the database based on the supplied item xml data.
588
- def find_feed_item_by_data(item_data)
589
- item_node = Document.new(item_data).root
1713
+ EnclosureCategory = Struct.new( "EnclosureCategory", :category, :scheme, :label )
1714
+ EnclosureHash = Struct.new( "EnclosureHash", :hash, :type )
1715
+ EnclosurePlayer = Struct.new( "EnclosurePlayer", :url, :height, :width )
1716
+ EnclosureCredit = Struct.new( "EnclosureCredit", :name, :role )
1717
+ EnclosureThumbnail = Struct.new( "EnclosureThumbnail", :url, :height, :width )
590
1718
 
591
- # get the link
592
- item_link = XPath.first(item_node, "link[@rel='alternate']/@href").to_s
593
- if item_link == ""
594
- item_link = XPath.first(item_node, "link/@href").to_s
1719
+ # Returns the parent feed of this feed item
1720
+ def feed
1721
+ return @feed
1722
+ end
1723
+
1724
+ # Sets the parent feed of this feed item
1725
+ def feed=(new_feed)
1726
+ @feed = new_feed
1727
+ end
1728
+
1729
+ # Returns the feed item's raw xml data.
1730
+ def xml_data
1731
+ return @xml_data
1732
+ end
1733
+
1734
+ # Sets the feed item's xml data.
1735
+ def xml_data=(new_xml_data)
1736
+ @xml_data = new_xml_data
1737
+ end
1738
+
1739
+ # Returns a REXML Document of the xml_data
1740
+ def xml
1741
+ if @xml_doc.nil?
1742
+ @xml_doc = Document.new(xml_data)
1743
+ end
1744
+ return @xml_doc
1745
+ end
1746
+
1747
+ # Returns the first node within the root_node that matches the xpath query.
1748
+ def find_node(xpath)
1749
+ return XPath.first(root_node, xpath)
1750
+ end
1751
+
1752
+ # Returns all nodes within the root_node that match the xpath query.
1753
+ def find_all_nodes(xpath)
1754
+ return XPath.match(root_node, xpath)
1755
+ end
1756
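# Editor's note -- usage sketch, not part of the gem's diff. These two methods are
# the hook for custom parsing: any namespaced element that FeedTools does not map to
# an accessor can still be read with a raw XPath query against the item's root node:
#
#   item.find_node("dc:rights/text()").to_s
#   item.find_all_nodes("category").map { |node| node.text }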
+
1757
+ # Returns the root node of the feed item.
1758
+ def root_node
1759
+ if @root_node.nil?
1760
+ @root_node = xml.root
1761
+ end
1762
+ return @root_node
1763
+ end
1764
+
1765
+ # Returns the feed item title
1766
+ def title
1767
+ if @title.nil?
1768
+ if XPath.first(root_node, "title/@type").to_s == "xhtml" ||
1769
+ XPath.first(root_node, "title/@mode").to_s == "xhtml"
1770
+ @title = XPath.first(root_node, "title").inner_xml
1771
+ elsif XPath.first(root_node, "title/@type").to_s == "escaped" ||
1772
+ XPath.first(root_node, "title/@mode").to_s == "escaped"
1773
+ @title = CGI.unescapeHTML(
1774
+ XPath.first(root_node, "title/text()").to_s)
1775
+ else
1776
+ @title = CGI.unescapeHTML(
1777
+ XPath.first(root_node, "title/text()").to_s)
1778
+ end
1779
+ unless @title.nil?
1780
+ @title = CGI.unescapeHTML(FeedTools.sanitize_html(@title, :strip))
1781
+ end
1782
+ if @title != ""
1783
+ # Some blogging tools include the number of comments in a post
1784
+ # in the title... this is supremely ugly, and breaks any
1785
+ # applications which expect the title to be static, so we're
1786
+ # gonna strip them out.
1787
+ #
1788
+ # If for some incredibly weird reason you need the actual
1789
+ # unstripped title, just use find_node("title/text()").to_s
1790
+ @title = FeedTools.strip_html(
1791
+ @title.strip.gsub(/\[\d*\]$/, "")).strip
1792
+ @title.gsub!(/\n/, " ")
1793
+ end
1794
+ @title = nil if @title == ""
1795
+ end
1796
+ return @title
1797
+ end
1798
+
1799
+ # Sets the feed item title
1800
+ def title=(new_title)
1801
+ @title = new_title
1802
+ end
1803
+
1804
+ # Returns the feed item description
1805
+ def description
1806
+ if @description.nil?
1807
+ # get the item content
1808
+ @description = ""
1809
+ body_node = XPath.first(root_node, "xhtml:body")
1810
+ if body_node == nil
1811
+ body_node = XPath.first(root_node, "body")
1812
+ end
1813
+ if body_node != nil
1814
+ @description = body_node.inner_xml
1815
+ end
1816
+ if @description == ""
1817
+ @description =
1818
+ CGI.unescapeHTML(XPath.first(root_node, "content:encoded/text()").to_s)
1819
+ end
1820
+ if @description == ""
1821
+ begin
1822
+ @description = XPath.first(root_node, "description").cdatas.first.to_s
1823
+ rescue
1824
+ @description = ""
1825
+ end
1826
+ if @description == ""
1827
+ @description = XPath.first(root_node, "description/text()").to_s
1828
+ end
1829
+ if @description != ""
1830
+ if XPath.first(root_node, "description/@encoding").to_s != ""
1831
+ # Not supported... yet.
1832
+ @description = "[Embedded data objects are not supported.]"
1833
+ else
1834
+ @description = CGI.unescapeHTML(@description)
1835
+ end
1836
+ end
1837
+ end
1838
+ if @description == ""
1839
+ @description = XPath.first(root_node, "content/text()").to_s
1840
+ if @description != "" &&
1841
+ (XPath.first(root_node, "content/@mode").to_s == "escaped" ||
1842
+ XPath.first(root_node, "content/@type").to_s == "escaped")
1843
+ @description = CGI.unescapeHTML(@description)
1844
+ end
1845
+ if XPath.first(root_node, "content/@mode").to_s == "xhtml" ||
1846
+ XPath.first(root_node, "content/@type").to_s == "xhtml"
1847
+ @description = XPath.first(root_node, "content").inner_xml
1848
+ end
1849
+ end
1850
+ if @description == ""
1851
+ begin
1852
+ @description = XPath.first(root_node, "description").inner_xml
1853
+ rescue
1854
+ end
1855
+ end
1856
+ if @description == ""
1857
+ @description = self.itunes_summary
1858
+ @description = "" if @description.nil?
1859
+ end
1860
+ if @description == ""
1861
+ @description = self.itunes_subtitle
1862
+ @description = "" if @description.nil?
1863
+ end
1864
+ if @description == ""
1865
+ @description = self.media_text
1866
+ @description = "" if @description.nil?
1867
+ end
1868
+
1869
+ unless @description.nil?
1870
+ @description = FeedTools.sanitize_html(@description)
1871
+ end
1872
+
1873
+ # If it started with a bunch of divs, hack them right off. We can put
1874
+ # them back later if they're needed.
1875
+ @description.gsub!(/^(<div[^>]*>)*/, "")
1876
+ @description.gsub!(/(<\/div>)*$/, "")
1877
+
1878
+ @description.gsub!(/\n/, " ") if @description.size < 80
1879
+ @description = @description.strip unless @description.nil?
1880
+ @description = nil if @description == ""
595
1881
  end
596
- if item_link == ""
597
- item_link = XPath.first(item_node, "link/text()").to_s
1882
+ return @description
1883
+ end
1884
+
1885
+ # Sets the feed item description
1886
+ def description=(new_description)
1887
+ @description = new_description
1888
+ end
1889
+
1890
+ # Returns the feed item link
1891
+ def link
1892
+ if @link.nil?
1893
+ @link = XPath.first(root_node, "link[@rel='alternate']/@href").to_s
1894
+ if @link == ""
1895
+ @link = XPath.first(root_node, "link/@href").to_s
1896
+ end
1897
+ if @link == ""
1898
+ @link = XPath.first(root_node, "link/text()").to_s
1899
+ end
1900
+ if @link == ""
1901
+ @link = XPath.first(root_node, "@rdf:about").to_s
1902
+ end
1903
+ if @link == ""
1904
+ @link = XPath.first(root_node, "guid[@isPermaLink='true']/text()").to_s
1905
+ end
1906
+ if @link == ""
1907
+ if FeedTools.is_url? self.guid
1908
+ @link = self.guid
1909
+ end
1910
+ end
1911
+ if @link != ""
1912
+ @link = CGI.unescapeHTML(@link)
1913
+ end
1914
+ if @link != "" && (@link =~ /http:\/\//) != 0 && (@link =~ /https:\/\//) != 0
1915
+ if (feed.base[-1..-1] == "/" && @link[0..0] == "/")
1916
+ @link = @link[1..-1]
1917
+ end
1918
+ # prepend the base to the link since they seem to have used a relative path
1919
+ @link = feed.base + @link
1920
+ end
1921
+ @link = FeedTools.normalize_url(@link)
598
1922
  end
599
- if item_link == ""
600
- item_link = XPath.first(item_node, "@rdf:about").to_s
1923
+ return @link
1924
+ end
1925
+
1926
+ # Sets the feed item link
1927
+ def link=(new_link)
1928
+ @link = new_link
1929
+ end
1930
+
1931
+ # Returns the feed comment link
1932
+ def comment_link
1933
+ if @comment_link.nil?
1934
+ # get the feed comment link from the xml document
1935
+ @comment_link = XPath.first(root_node, "comments/text()").to_s
1936
+ if @comment_link == ""
1937
+ @comment_link = self.link
1938
+ end
1939
+ @comment_link = FeedTools.normalize_url(@comment_link)
601
1940
  end
602
- if item_link == ""
603
- item_link = XPath.first(item_node, "guid/text()").to_s
1941
+ return @comment_link
1942
+ end
1943
+
1944
+ # Sets the feed comment link
1945
+ def comment_link=(new_comment_link)
1946
+ @comment_link = new_comment_link
1947
+ end
1948
+
1949
+ # Returns the feed image link
1950
+ def image_link
1951
+ if @image_link.nil?
1952
+ # get the feed image link from the xml document
+ @image_link = ""
1953
+ if @image_link == ""
1954
+ @image_link = XPath.first(root_node, "link[@type='image/jpeg']/@href").to_s
1955
+ end
1956
+ if @image_link == ""
1957
+ @image_link = XPath.first(root_node, "link[@type='image/gif']/@href").to_s
1958
+ end
1959
+ if @image_link == ""
1960
+ @image_link = XPath.first(root_node, "link[@type='image/png']/@href").to_s
1961
+ end
1962
+ # The following two should technically never occur, but have been included
1963
+ # simply because I've seen both occurring in the wild at least once.
1964
+ if @image_link == ""
1965
+ @image_link = XPath.first(root_node, "image/url/text()").to_s
1966
+ end
1967
+ if @image_link == ""
1968
+ @image_link = XPath.first(root_node, "image/@rdf:resource").to_s
1969
+ end
1970
+ if @image_link == ""
1971
+ # If there's only a media thumbnail, we can just borrow it. Technically, this isn't
1972
+ # ideal, but chances are very good that anything that makes use of this image is
1973
+ # simply not going to care anyhow.
1974
+ @image_link = XPath.first(root_node, "media:thumbnail/@url").to_s
1975
+ if @image_link == ""
1976
+ @media_image_link = @image_link
1977
+ end
1978
+ end
1979
+ if @image_link == ""
1980
+ # If there's only an itunes image, we can just borrow it. See comment above regarding
1981
+ # less-than-ideal-ness.
1982
+ if @itunes_image_link == ""
1983
+ @image_link = XPath.first(root_node, "itunes:image/@href").to_s
1984
+ if @image_link == ""
1985
+ @image_link = XPath.first(root_node, "itunes:link[@rel='image']/@href").to_s
1986
+ end
1987
+ @itunes_image_link = @image_link
1988
+ else
1989
+ @image_link = @itunes_image_link
1990
+ end
1991
+ end
1992
+ @image_link = FeedTools.normalize_url(@image_link)
1993
+ end
1994
+ return @image_link
1995
+ end
1996
+
1997
+ # Sets the feed image link
1998
+ def image_link=(new_image_link)
1999
+ @image_link = new_image_link
2000
+ end
2001
+
2002
+ # Returns the feed item itunes image link
2003
+ #
2004
+ # If it's not present, falls back to the normal image link.
2005
+ # Technically, the itunes spec says that the image needs to be
2006
+ # square and larger than 300x300, but hey, if there's an image
2007
+ # to be had, it's better than none at all.
2008
+ def itunes_image_link
2009
+ if @itunes_image_link.nil?
2010
+ # get the feed item itunes image link from the xml document
2011
+ @itunes_image_link = XPath.first(root_node, "itunes:image/@href").to_s
2012
+ if @itunes_image_link == ""
2013
+ @itunes_image_link = XPath.first(root_node, "itunes:link[@rel='image']/@href").to_s
2014
+ end
2015
+ if @itunes_image_link == ""
2016
+ @itunes_image_link = self.image_link
2017
+ end
2018
+ @itunes_image_link = FeedTools.normalize_url(@itunes_image_link)
2019
+ end
2020
+ return @itunes_image_link
2021
+ end
2022
+
2023
+ # Sets the feed item itunes image link
2024
+ def itunes_image_link=(new_itunes_image_link)
2025
+ @itunes_image_link = new_itunes_image_link
2026
+ end
2027
+
2028
+ # Returns the feed item media thumbnail link
2029
+ #
2030
+ # If it's not present, falls back to the normal image link.
2031
+ def media_thumbnail_link
2032
+ if @media_thumbnail_link.nil?
2033
+ # get the feed item media thumbnail link from the xml document
2034
+ @media_thumbnail_link = XPath.first(root_node, "media:thumbnail/@url").to_s
2035
+ if @media_thumbnail_link == ""
2036
+ @media_thumbnail_link = image_link
2037
+ end
2038
+ @media_thumbnail_link = FeedTools.normalize_url(@media_thumbnail_link)
604
2039
  end
605
- item_title = XPath.first(item_node, "title/text()").to_s
2040
+ return @media_thumbnail_link
2041
+ end
2042
+
2043
+ # Sets the feed item media thumbnail url
2044
+ def media_thumbnail_link=(new_media_thumbnail_link)
2045
+ @media_thumbnail_link = new_media_thumbnail_link
2046
+ end
606
2047
 
607
- feed_item = FeedItem.find_by_feed_id_and_link(self.id, item_link)
608
- unless feed_item.nil?
609
- # Some blogging tools alter the title of an item when the number of comments change (for
610
- # example, TextPattern) and many email feed dumps use the same link for multiple
611
- # items (for example, GMail). We try to take both of these cases into account here.
612
- existing_title = feed_item.title
613
- item_title = item_title.gsub(/\[\d*\]/,"").strip
614
- existing_title = existing_title.gsub(/\[\d*\]/,"").strip
615
- item_title = item_title.gsub(/\(\d*\)/,"").strip
616
- existing_title = existing_title.gsub(/\(\d*\)/,"").strip
617
- item_title = item_title.gsub(/\{\d*\}/,"").strip
618
- existing_title = existing_title.gsub(/\{\d*\}/,"").strip
619
- if existing_title != item_title
620
- feed_item = nil
2048
+ # Returns the feed items's unique id
2049
+ def id
2050
+ if @id.nil?
2051
+ @id = XPath.first(root_node, "id/text()").to_s
2052
+ if @id == ""
2053
+ @id = XPath.first(root_node, "guid/text()").to_s
621
2054
  end
2055
+ @id = nil if @id == ""
622
2056
  end
623
- return feed_item
2057
+ return @id
624
2058
  end
625
2059
 
626
- def handle_feed_item(item_data)
627
- feed_item = find_feed_item_by_data(item_data)
628
- if feed_item.nil?
629
- feed_item = FeedItem.new
630
- end
631
- feed_item.feed = self
632
- feed_item.parse_item(item_data)
633
- return feed_item
634
- end
635
-
636
- def build_feed_hook(feed_type, version, xml_builder)
637
- return nil
2060
+ # Sets the feed item's unique id
2061
+ def id=(new_id)
2062
+ @id = new_id
638
2063
  end
639
-
640
- def build_feed(feed_type, version=0.0, xml_builder=Builder::XmlMarkup.new(:indent => 2))
641
- if feed_type == "rss" && version == 0.0
642
- version = 1.0
643
- elsif feed_type == "atom" && version == 0.0
644
- version = 0.3
645
- end
646
- if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1)
647
- # RDF-based rss format
648
- return xml_builder.tag!("rdf:RDF") do
649
- xml_builder.channel("rdf:about" => CGI.escapeHTML(link)) do
650
- unless title.nil? || title == ""
651
- xml_builder.title(title)
652
- else
653
- xml_builder.title
2064
+
2065
+ # Returns all feed item enclosures
2066
+ def enclosures
2067
+ if @enclosures.nil?
2068
+ @enclosures = []
2069
+
2070
+ # First, load up all the different possible sources of enclosures
2071
+ rss_enclosures = XPath.match(root_node, "enclosure")
2072
+ atom_enclosures = XPath.match(root_node, "link[@rel='enclosure']")
2073
+ media_content_enclosures = XPath.match(root_node, "media:content")
2074
+ media_group_enclosures = XPath.match(root_node, "media:group")
2075
+
2076
+ # Parse RSS-type enclosures. Thanks to a few buggy enclosure implementations,
2077
+ # sometimes these also manage to show up in atom files.
2078
+ for enclosure_node in rss_enclosures
2079
+ enclosure = Enclosure.new
2080
+ enclosure.url = CGI.unescapeHTML(enclosure_node.attributes["url"].to_s)
2081
+ enclosure.type = enclosure_node.attributes["type"].to_s
2082
+ enclosure.file_size = enclosure_node.attributes["length"].to_i
2083
+ enclosure.credits = []
2084
+ enclosure.explicit = false
2085
+ @enclosures << enclosure
2086
+ end
2087
+
2088
+ # Parse atom-type enclosures. If there are repeats of the same enclosure object,
2089
+ # we merge the two together.
2090
+ for enclosure_node in atom_enclosures
2091
+ enclosure_url = CGI.unescapeHTML(enclosure_node.attributes["href"].to_s)
2092
+ enclosure = nil
2093
+ new_enclosure = false
2094
+ for existing_enclosure in @enclosures
2095
+ if existing_enclosure.url == enclosure_url
2096
+ enclosure = existing_enclosure
2097
+ break
654
2098
  end
655
- unless link.nil? || link == ""
656
- xml_builder.link(link)
657
- else
658
- xml_builder.link
2099
+ end
2100
+ if enclosure.nil?
2101
+ new_enclosure = true
2102
+ enclosure = Enclosure.new
2103
+ end
2104
+ enclosure.url = enclosure_url
2105
+ enclosure.type = enclosure_node.attributes["type"].to_s
2106
+ enclosure.file_size = enclosure_node.attributes["length"].to_i
2107
+ enclosure.credits = []
2108
+ enclosure.explicit = false
2109
+ if new_enclosure
2110
+ @enclosures << enclosure
2111
+ end
2112
+ end
2113
+
2114
+ # Creates an anonymous method to parse content objects from the media module. We
2115
+ # do this to avoid excessive duplication of code since we have to do identical
2116
+ # processing for content objects within group objects.
2117
+ parse_media_content = lambda do |media_content_nodes|
2118
+ affected_enclosures = []
2119
+ for enclosure_node in media_content_nodes
2120
+ enclosure_url = CGI.unescapeHTML(enclosure_node.attributes["url"].to_s)
2121
+ enclosure = nil
2122
+ new_enclosure = false
2123
+ for existing_enclosure in @enclosures
2124
+ if existing_enclosure.url == enclosure_url
2125
+ enclosure = existing_enclosure
2126
+ break
2127
+ end
659
2128
  end
660
- unless image_link.nil? || image_link == ""
661
- xml_builder.image("rdf:resource" => CGI.escapeHTML(image_link))
2129
+ if enclosure.nil?
2130
+ new_enclosure = true
2131
+ enclosure = Enclosure.new
662
2132
  end
663
- unless description.nil? || description == ""
664
- xml_builder.description(description)
665
- else
666
- xml_builder.description
2133
+ enclosure.url = enclosure_url
2134
+ enclosure.type = enclosure_node.attributes["type"].to_s
2135
+ enclosure.file_size = enclosure_node.attributes["fileSize"].to_i
2136
+ enclosure.duration = enclosure_node.attributes["duration"].to_s
2137
+ enclosure.height = enclosure_node.attributes["height"].to_i
2138
+ enclosure.width = enclosure_node.attributes["width"].to_i
2139
+ enclosure.bitrate = enclosure_node.attributes["bitrate"].to_i
2140
+ enclosure.framerate = enclosure_node.attributes["framerate"].to_i
2141
+ enclosure.expression = enclosure_node.attributes["expression"].to_s
2142
+ enclosure.is_default =
2143
+ (enclosure_node.attributes["isDefault"].to_s.downcase == "true")
2144
+ if XPath.first(enclosure_node, "media:thumbnail/@url").to_s != ""
2145
+ enclosure.thumbnail = EnclosureThumbnail.new(
2146
+ CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@url").to_s),
2147
+ CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@height").to_s),
2148
+ CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@width").to_s)
2149
+ )
2150
+ if enclosure.thumbnail.height == ""
2151
+ enclosure.thumbnail.height = nil
2152
+ end
2153
+ if enclosure.thumbnail.width == ""
2154
+ enclosure.thumbnail.width = nil
2155
+ end
667
2156
  end
668
- unless language.nil? || language == ""
669
- xml_builder.tag!("dc:language", language)
2157
+ enclosure.categories = []
2158
+ for category in XPath.match(enclosure_node, "media:category")
2159
+ enclosure.categories << EnclosureCategory.new(
2160
+ CGI.unescapeHTML(category.text),
2161
+ CGI.unescapeHTML(category.attributes["scheme"].to_s),
2162
+ CGI.unescapeHTML(category.attributes["label"].to_s)
2163
+ )
2164
+ if enclosure.categories.last.scheme == ""
2165
+ enclosure.categories.last.scheme = nil
2166
+ end
2167
+ if enclosure.categories.last.label == ""
2168
+ enclosure.categories.last.label = nil
2169
+ end
670
2170
  end
671
- xml_builder.tag!("syn:updatePeriod", "hourly")
672
- xml_builder.tag!("syn:updateFrequency", (time_to_live / 1.hour).to_s)
673
- xml_builder.tag!("syn:updateBase", Time.mktime(1970).iso8601)
674
- xml_builder.items do
675
- xml_builder.tag!("rdf:Seq") do
676
- unless feed_items.nil?
677
- for item in feed_items
678
- if item.link.nil?
679
- raise "Cannot generate an rdf-based feed with a nil item link field."
680
- end
681
- xml_builder.tag!("rdf:li", "rdf:resource" => CGI.escapeHTML(item.link))
682
- end
683
- end
2171
+ if XPath.first(enclosure_node, "media:hash/text()").to_s != ""
2172
+ enclosure.hash = EnclosureHash.new(
2173
+ FeedTools.sanitize_html(CGI.unescapeHTML(XPath.first(
2174
+ enclosure_node, "media:hash/text()").to_s), :strip),
2175
+ "md5"
2176
+ )
2177
+ end
2178
+ if XPath.first(enclosure_node, "media:player/@url").to_s != ""
2179
+ enclosure.player = EnclosurePlayer.new(
2180
+ CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@url").to_s),
2181
+ CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@height").to_s),
2182
+ CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@width").to_s)
2183
+ )
2184
+ if enclosure.player.height == ""
2185
+ enclosure.player.height = nil
2186
+ end
2187
+ if enclosure.player.width == ""
2188
+ enclosure.player.width = nil
2189
+ end
2190
+ end
2191
+ enclosure.credits = []
2192
+ for credit in XPath.match(enclosure_node, "media:credit")
2193
+ enclosure.credits << EnclosureCredit.new(
2194
+ CGI.unescapeHTML(CGI.unescapeHTML(credit.text)),
2195
+ CGI.unescapeHTML(credit.attributes["role"].to_s.downcase)
2196
+ )
2197
+ if enclosure.credits.last.role == ""
2198
+ enclosure.credits.last.role = nil
684
2199
  end
685
2200
  end
686
- build_feed_hook(feed_type, version, xml_builder)
2201
+ enclosure.explicit = (XPath.first(enclosure_node,
2202
+ "media:adult/text()").to_s.downcase == "true")
2203
+ if XPath.first(enclosure_node, "media:text/text()").to_s != ""
2204
+ enclosure.text = CGI.unescapeHTML(XPath.first(enclosure_node,
2205
+ "media:text/text()").to_s)
2206
+ end
2207
+ affected_enclosures << enclosure
2208
+ if new_enclosure
2209
+ @enclosures << enclosure
2210
+ end
687
2211
  end
688
- unless image_link.nil? || image_link == ""
689
- xml_builder.image("rdf:about" => CGI.escapeHTML(image_link)) do
690
- unless title.nil? || title == ""
691
- xml_builder.title(title)
692
- else
693
- xml_builder.title
2212
+ affected_enclosures
2213
+ end
2214
+
2215
+ # Parse the independent content objects.
2216
+ parse_media_content.call(media_content_enclosures)
2217
+
2218
+ media_groups = []
2219
+
2220
+ # Parse the group objects.
2221
+ for media_group in media_group_enclosures
2222
+ group_media_content_enclosures =
2223
+ XPath.match(media_group, "media:content")
2224
+
2225
+ # Parse the content objects within the group objects.
2226
+ affected_enclosures =
2227
+ parse_media_content.call(group_media_content_enclosures)
2228
+
2229
+ # Now make sure that content objects inherit certain properties from
2230
+ # the group objects.
2231
+ for enclosure in affected_enclosures
2232
+ if enclosure.thumbnail.nil? &&
2233
+ XPath.first(media_group, "media:thumbnail/@url").to_s != ""
2234
+ enclosure.thumbnail = EnclosureThumbnail.new(
2235
+ CGI.unescapeHTML(
2236
+ XPath.first(media_group, "media:thumbnail/@url").to_s),
2237
+ CGI.unescapeHTML(
2238
+ XPath.first(media_group, "media:thumbnail/@height").to_s),
2239
+ CGI.unescapeHTML(
2240
+ XPath.first(media_group, "media:thumbnail/@width").to_s)
2241
+ )
2242
+ if enclosure.thumbnail.height == ""
2243
+ enclosure.thumbnail.height = nil
694
2244
  end
695
- unless image_link.nil? || image_link == ""
696
- xml_builder.url(image_link)
2245
+ if enclosure.thumbnail.width == ""
2246
+ enclosure.thumbnail.width = nil
697
2247
  end
698
- unless link.nil? || link == ""
699
- xml_builder.link(link)
700
- else
701
- xml_builder.link
2248
+ end
2249
+ if (enclosure.categories.nil? || enclosure.categories.size == 0)
2250
+ enclosure.categories = []
2251
+ for category in XPath.match(media_group, "media:category")
2252
+ enclosure.categories << EnclosureCategory.new(
2253
+ CGI.unescapeHTML(category.text),
2254
+ CGI.unescapeHTML(category.attributes["scheme"].to_s),
2255
+ CGI.unescapeHTML(category.attributes["label"].to_s)
2256
+ )
2257
+ if enclosure.categories.last.scheme == ""
2258
+ enclosure.categories.last.scheme = nil
2259
+ end
2260
+ if enclosure.categories.last.label == ""
2261
+ enclosure.categories.last.label = nil
2262
+ end
702
2263
  end
703
2264
  end
704
- end
705
- unless feed_items.nil?
706
- for item in feed_items
707
- item.build_feed_item(feed_type, version, xml_builder)
2265
+ if enclosure.hash.nil? &&
2266
+ XPath.first(media_group, "media:hash/text()").to_s != ""
2267
+ enclosure.hash = EnclosureHash.new(
2268
+ CGI.unescapeHTML(XPath.first(media_group, "media:hash/text()").to_s),
2269
+ "md5"
2270
+ )
2271
+ end
2272
+ if enclosure.player.nil? &&
2273
+ XPath.first(media_group, "media:player/@url").to_s != ""
2274
+ enclosure.player = EnclosurePlayer.new(
2275
+ CGI.unescapeHTML(XPath.first(media_group, "media:player/@url").to_s),
2276
+ CGI.unescapeHTML(XPath.first(media_group, "media:player/@height").to_s),
2277
+ CGI.unescapeHTML(XPath.first(media_group, "media:player/@width").to_s)
2278
+ )
2279
+ if enclosure.player.height == ""
2280
+ enclosure.player.height = nil
2281
+ end
2282
+ if enclosure.player.width == ""
2283
+ enclosure.player.width = nil
2284
+ end
2285
+ end
2286
+ if enclosure.credits.nil? || enclosure.credits.size == 0
2287
+ enclosure.credits = []
2288
+ for credit in XPath.match(media_group, "media:credit")
2289
+ enclosure.credits << EnclosureCredit.new(
2290
+ CGI.unescapeHTML(CGI.unescapeHTML(credit.text)),
2291
+ CGI.unescapeHTML(credit.attributes["role"].to_s.downcase)
2292
+ )
2293
+ if enclosure.credits.last.role == ""
2294
+ enclosure.credits.last.role = nil
2295
+ end
2296
+ end
2297
+ end
2298
+ if enclosure.explicit?.nil?
2299
+ enclosure.explicit = (XPath.first(media_group,
2300
+ "media:adult/text()").to_s.downcase == "true") ? true : false
2301
+ end
2302
+ if enclosure.text.nil? &&
2303
+ XPath.first(media_group, "media:text/text()").to_s != ""
2304
+ enclosure.text = FeedTools.sanitize_html(CGI.unescapeHTML(
2305
+ XPath.first(media_group, "media:text/text()").to_s), :strip)
708
2306
  end
709
2307
  end
2308
+
2309
+ # Keep track of the media groups
2310
+ media_groups << affected_enclosures
710
2311
  end
711
- elsif feed_type == "rss"
712
- # normal rss format
713
- return xml_builder.rss("version" => version.to_s) do
714
- unless title.nil? || title == ""
715
- xml_builder.title(title)
716
- end
717
- unless link.nil? || link == ""
718
- xml_builder.link(link)
2312
+
2313
+ # Now we need to inherit any relevant item level information.
2314
+ if self.explicit?
2315
+ for enclosure in @enclosures
2316
+ enclosure.explicit = true
719
2317
  end
720
- unless description.nil? || description == ""
721
- xml_builder.description(description)
2318
+ end
2319
+
2320
+ # Add all the itunes categories
2321
+ for itunes_category in XPath.match(root_node, "itunes:category")
2322
+ genre = "Podcasts"
2323
+ category = itunes_category.attributes["text"].to_s
2324
+ subcategory = XPath.first(itunes_category, "itunes:category/@text").to_s
2325
+ category_path = genre
2326
+ if category != ""
2327
+ category_path << "/" + category
722
2328
  end
723
- xml_builder.ttl((time_to_live / 1.minute).to_s)
724
- xml_builder.generator("http://www.sporkmonger.com/projects/feedtools")
725
- build_feed_hook(feed_type, version, xml_builder)
726
- unless feed_items.nil?
727
- for item in feed_items
728
- item.build_feed_item(feed_type, version, xml_builder)
2329
+ if subcategory != ""
2330
+ category_path << "/" + subcategory
2331
+ end
2332
+ for enclosure in @enclosures
2333
+ if enclosure.categories.nil?
2334
+ enclosure.categories = []
729
2335
  end
2336
+ enclosure.categories << EnclosureCategory.new(
2337
+ CGI.unescapeHTML(category_path),
2338
+ CGI.unescapeHTML("http://www.apple.com/itunes/store/"),
2339
+ CGI.unescapeHTML("iTunes Music Store Categories")
2340
+ )
730
2341
  end
731
2342
  end
732
- elsif feed_type == "atom"
733
- # normal atom format
734
- return xml_builder.feed("xmlns" => "http://purl.org/atom/ns#",
735
- "version" => version.to_s,
736
- "xml:lang" => language) do
737
- unless title.nil? || title == ""
738
- xml_builder.title(title,
739
- "mode" => "escaped",
740
- "type" => "text/html")
2343
+
2344
+ for enclosure in @enclosures
2345
+ # Clean up any of those attributes that incorrectly have ""
2346
+ # or 0 as their values
2347
+ if enclosure.type == ""
2348
+ enclosure.type = nil
741
2349
  end
742
- unless link.nil? || link == ""
743
- xml_builder.link("href" => link,
744
- "rel" => "alternate",
745
- "type" => "text/html",
746
- "title" => title)
2350
+ if enclosure.file_size == 0
2351
+ enclosure.file_size = nil
747
2352
  end
748
- unless description.nil? || description == ""
749
- xml_builder.tagline(description,
750
- "mode" => "escaped",
751
- "type" => "text/html")
2353
+ if enclosure.duration == 0
2354
+ enclosure.duration = nil
752
2355
  end
753
- xml_builder.generator("FeedTools",
754
- "url" => "http://www.sporkmonger.com/projects/feedtools")
755
- build_feed_hook(feed_type, version, xml_builder)
756
- unless feed_items.nil?
757
- for item in feed_items
758
- item.build_feed_item(feed_type, version, xml_builder)
2356
+ if enclosure.height == 0
2357
+ enclosure.height = nil
2358
+ end
2359
+ if enclosure.width == 0
2360
+ enclosure.width = nil
2361
+ end
2362
+ if enclosure.bitrate == 0
2363
+ enclosure.bitrate = nil
2364
+ end
2365
+ if enclosure.framerate == 0
2366
+ enclosure.framerate = nil
2367
+ end
2368
+ if enclosure.expression == "" || enclosure.expression.nil?
2369
+ enclosure.expression = "full"
2370
+ end
2371
+
2372
+ # If an enclosure is missing the text field, fall back on the itunes:summary field
2373
+ if enclosure.text.nil? || enclosure.text == ""
2374
+ enclosure.text = self.itunes_summary
2375
+ end
2376
+
2377
+ # Make sure we don't have duplicate categories
2378
+ unless enclosure.categories.nil?
2379
+ enclosure.categories.uniq!
2380
+ end
2381
+ end
2382
+
2383
+ # And finally, now things get complicated. This is where we make
2384
+ # sure that the enclosures method only returns either default
2385
+ # enclosures or enclosures with only one version. Any enclosures
2386
+ # that are wrapped in a media:group will be placed in the appropriate
2387
+ # versions field.
2388
+ affected_enclosure_urls = []
2389
+ for media_group in media_groups
2390
+ affected_enclosure_urls =
2391
+ affected_enclosure_urls | (media_group.map do |enclosure|
2392
+ enclosure.url
2393
+ end)
2394
+ end
2395
+ @enclosures.delete_if do |enclosure|
2396
+ (affected_enclosure_urls.include? enclosure.url)
2397
+ end
2398
+ for media_group in media_groups
2399
+ default_enclosure = nil
2400
+ for enclosure in media_group
2401
+ if enclosure.is_default?
2402
+ default_enclosure = enclosure
759
2403
  end
760
2404
  end
2405
+ for enclosure in media_group
2406
+ enclosure.default_version = default_enclosure
2407
+ enclosure.versions = media_group.clone
2408
+ enclosure.versions.delete(enclosure)
2409
+ end
2410
+ @enclosures << default_enclosure
761
2411
  end
762
2412
  end
763
- end
764
-
765
- # Saves the current state of the feed to the database unless the feed lacks a remote location
766
- def save
767
- unless url.nil? || url == ""
768
- super
769
- end
770
- end
771
- end
772
2413
 
773
- class FeedItem < ActiveRecord::Base
774
- include REXML
775
-
776
- # Verifies that the required fields exist; additional ones added by the user are fine
777
- def FeedItem.table_exists?
778
- begin
779
- connection.execute "select id, feed_id, link, title, author, description, " +
780
- "time, tags from feed_items limit 1"
781
- rescue ActiveRecord::StatementInvalid
782
- return false
2414
+ # If we have a single enclosure, it's safe to inherit the itunes:duration field
2415
+ # if it's missing.
2416
+ if @enclosures.size == 1
2417
+ if @enclosures.first.duration.nil? || @enclosures.first.duration == 0
2418
+ @enclosures.first.duration = self.duration
2419
+ end
783
2420
  end
784
- return true
2421
+
2422
+ return @enclosures
785
2423
  end
786
2424
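# Editor's note -- usage sketch, not part of the gem's diff. After the merging
# performed above, enclosures holds one Enclosure per distinct media object;
# alternates parsed from a media:group are reachable through the versions and
# default_version fields on that object:
#
#   item.enclosures.each do |enclosure|
#     puts "#{enclosure.url} (#{enclosure.type}, #{enclosure.file_size} bytes)"
#     puts "  #{enclosure.versions.size} alternate version(s)" unless enclosure.versions.nil?
#   end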
 
787
- def feed
788
- if @feed != nil
789
- return @feed
790
- elsif @feed_id != nil
791
- @feed = Feed.find_by_id(self.feed_id)
792
- return @feed
793
- else
794
- return nil
795
- end
2425
+ def enclosures=(new_enclosures)
2426
+ @enclosures = new_enclosures
796
2427
  end
797
2428
 
798
- def feed=(new_feed)
799
- self.feed_id = new_feed.id
800
- @feed = new_feed
801
- end
802
-
803
- def title
804
- return (self['title'] or "Untitled Entry")
2429
+ # Returns the feed item author
2430
+ def author_name
2431
+ # TODO: make this not suck, actually ensure we're looking at a name
2432
+ # and not an email address.
2433
+ # Also, factor in itunes module.
2434
+ # =================================================================
2435
+ if @author_name.nil?
2436
+ @author_name = CGI.unescapeHTML(XPath.first(root_node, "author/name/text()").to_s)
2437
+ if @author_name == ""
2438
+ @author_name = CGI.unescapeHTML(XPath.first(root_node, "dc:creator/text()").to_s)
2439
+ end
2440
+ if @author_name == ""
2441
+ @author_name = CGI.unescapeHTML(XPath.first(root_node, "author/text()").to_s)
2442
+ end
2443
+ end
2444
+ return @author_name
805
2445
  end
806
-
807
- def tag_list
808
- return tags.nil? ? nil : tags[1..-2].split("|")
2446
+
2447
+ # Sets the feed item author
2448
+ def author_name=(new_author_name)
2449
+ @author_name = new_author_name
809
2450
  end
810
-
811
- def tag_list=(new_tag_list)
812
- self.tags = "|" + (new_tag_list.map { |x| x.strip }).join("|") + "|"
2451
+
2452
+ # Returns the contents of the itunes:summary element
2453
+ def itunes_summary
2454
+ if @itunes_summary.nil?
2455
+ @itunes_summary = CGI.unescapeHTML(XPath.first(root_node,
2456
+ "itunes:summary/text()").to_s)
2457
+ if @itunes_summary == ""
2458
+ @itunes_summary = nil
2459
+ end
2460
+ unless @itunes_summary.nil?
2461
+ @itunes_summary = FeedTools.sanitize_html(@itunes_summary)
2462
+ end
2463
+ end
2464
+ return @itunes_summary
813
2465
  end
814
2466
 
815
- def tag_string
816
- return (tags.nil? ? nil : tags[1..-2]).split("|").join(", ")
2467
+ # Sets the contents of the itunes:summary element
2468
+ def itunes_summary=(new_itunes_summary)
2469
+ @itunes_summary = new_itunes_summary
817
2470
  end
818
2471
 
819
- def tag_string=(new_tag_string)
820
- self.tags = "|" + (new_tag_string.split(",").map { |x| x.strip }).join("|") + "|"
2472
+ # Returns the contents of the itunes:subtitle element
2473
+ def itunes_subtitle
2474
+ if @itunes_subtitle.nil?
2475
+ @itunes_subtitle = CGI.unescapeHTML(XPath.first(root_node,
2476
+ "itunes:subtitle/text()").to_s)
2477
+ if @itunes_subtitle == ""
2478
+ @itunes_subtitle = nil
2479
+ end
2480
+ unless @itunes_subtitle.nil?
2481
+ @itunes_subtitle = FeedTools.sanitize_html(@itunes_subtitle)
2482
+ end
2483
+ end
2484
+ return @itunes_subtitle
821
2485
  end
822
2486
 
823
- def parse_feed_item_hook(item_data)
824
- return nil
2487
+ # Sets the contents of the itunes:subtitle element
2488
+ def itunes_subtitle=(new_itunes_subtitle)
2489
+ @itunes_subtitle = new_itunes_subtitle
825
2490
  end
826
2491
 
827
- def parse_item(item_data)
828
- item_node = Document.new(item_data).root
829
-
830
- # get the feed base, in case the feed items use relative paths
831
- base = feed.link
832
-
833
- # get the link
834
- link = XPath.first(item_node, "link[@rel='alternate']/@href").to_s
835
- if link == ""
836
- link = XPath.first(item_node, "link/@href").to_s
837
- end
838
- if link == ""
839
- link = XPath.first(item_node, "link/text()").to_s
840
- end
841
- if link == ""
842
- link = XPath.first(item_node, "@rdf:about").to_s
843
- end
844
- if link == ""
845
- link = XPath.first(item_node, "guid/text()").to_s
846
- end
847
- if link != ""
848
- link = CGI.unescapeHTML(link)
849
- end
850
- if link != "" && (link =~ /http:\/\//) != 0 && (link =~ /https:\/\//) != 0
851
- # ensure that we don't end up with 'http://www.foobar.com//path/to/entry'
852
- # future-proofed this so that it doesn't break when Ruby 1.9/2.0 starts
853
- # returning single character Strings instead of FixNums
854
- if (base[-1] == 47 && link[0] == 47) || (base[-1] == "/" && link[0] == "/")
855
- link = link[1..-1]
2492
+ # Returns the contents of the media:text element
2493
+ def media_text
2494
+ if @media_text.nil?
2495
+ @media_text = CGI.unescapeHTML(XPath.first(root_node,
2496
+ "itunes:subtitle/text()").to_s)
2497
+ if @media_text == ""
2498
+ @media_text = nil
2499
+ end
2500
+ unless @media_text.nil?
2501
+ @media_text = FeedTools.sanitize_html(@media_text)
856
2502
  end
857
- # prepend the base to the link since they seem to have used a relative path
858
- link = base + link
859
- end
860
-
861
- title = XPath.first(item_node, "title/text()").to_s
862
- if title != ""
863
- # some blogging tools (notably TextPattern I believe) include the number of
864
- # comments in a post in the title... this is ugly, so we're gonna strip them out
865
- title = title.gsub(/\[\d*\]/,"").strip
866
- end
867
-
868
- # get the item author
869
- author = CGI.unescapeHTML(XPath.first(item_node, "author/name/text()").to_s)
870
- if author == ""
871
- author = CGI.unescapeHTML(XPath.first(item_node, "dc:creator/text()").to_s)
872
- end
873
- if author == ""
874
- author = CGI.unescapeHTML(XPath.first(item_node, "author/text()").to_s)
875
2503
  end
2504
+ return @media_text
2505
+ end
876
2506
 
877
- # get the item content
878
- description = ""
879
- body = XPath.first(item_node, "xhtml:body")
880
- if body == nil
881
- body = XPath.first(item_node, "body")
882
- end
883
- if body != nil
884
- description = body.inner_xml
885
- end
886
- if description == ""
887
- description = CGI.unescapeHTML(XPath.first(item_node, "content:encoded/text()").to_s)
2507
+ # Sets the contents of the media:text element
2508
+ def media_text=(new_media_text)
2509
+ @media_text = new_media_text
2510
+ end
2511
+
2512
+ # Returns the contents of the itunes:author element
2513
+ #
2514
+ # This inherits from any incorrectly placed channel-level itunes:author
2515
+ # elements. They're actually amazingly common. People don't read specs.
2516
+ def itunes_author
2517
+ if @itunes_author.nil?
2518
+ @itunes_author = CGI.unescapeHTML(XPath.first(root_node,
2519
+ "itunes:author/text()").to_s)
2520
+ if @itunes_author == ""
2521
+ @itunes_author = CGI.unescapeHTML(XPath.first(feed.channel_node,
2522
+ "itunes:author/text()").to_s)
2523
+ end
2524
+ if @itunes_author == ""
2525
+ @itunes_author = nil
2526
+ end
888
2527
  end
889
- if description == ""
890
- description = XPath.first(item_node, "description/text()").to_s
891
- if description != ""
892
- if XPath.first(item_node, "description/@encoding").to_s != ""
893
- description = "[Embedded data objects are not supported.]"
894
- else
895
- description = CGI.unescapeHTML(description)
2528
+ return @itunes_author
2529
+ end
2530
+
2531
+ # Sets the contents of the itunes:author element
2532
+ def itunes_author=(new_itunes_author)
2533
+ @itunes_author = new_itunes_author
2534
+ end
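
Because itunes_author falls back to a (technically misplaced) channel-level itunes:author element, reading it on an item may return the podcast-level author rather than nil. A hedged sketch with an illustrative item:

    author = item.itunes_author   # item-level value, else channel-level, else nil
    item.itunes_author = "Jane Q. Podcaster" if author.nil?
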
2535
+
2536
+ # Returns the number of seconds that the associated media runs for
2537
+ def duration
2538
+ if @duration.nil?
2539
+ itunes_duration = CGI.unescapeHTML(XPath.first(root_node,
2540
+ "itunes:duration/text()").to_s)
2541
+ if itunes_duration != ""
2542
+ hms = itunes_duration.split(":").map { |x| x.to_i }
2543
+ if hms.size == 3
2544
+ @duration = hms[0].hour + hms[1].minute + hms[2]
2545
+ elsif hms.size == 2
2546
+ @duration = hms[0].minute + hms[1]
2547
+ elsif hms.size == 1
2548
+ @duration = hms[0]
896
2549
  end
897
2550
  end
898
2551
  end
899
- if description == ""
900
- description = XPath.first(item_node,"content/text()").to_s
901
- if description != "" && XPath.first(item_node, "content/@mode").to_s == "escaped"
902
- description = CGI.unescapeHTML(description)
2552
+ return @duration
2553
+ end
2554
+
2555
+ # Sets the number of seconds that the associated media runs for
2556
+ def duration=(new_duration)
2557
+ @duration = new_duration
2558
+ end
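
The duration reader normalizes itunes:duration values written as HH:MM:SS, MM:SS, or bare seconds into an integer second count, leaning on the Integer#hour/#minute helpers that come in with the active_record requirement; "1:30:05" works out to 3600 + 1800 + 5 = 5405. A small worked sketch (item and values illustrative):

    item.duration        # => 5405 when itunes:duration is "1:30:05"
    item.duration        # => 272  when itunes:duration is "4:32"
    item.duration = 3600 # store an explicit one-hour duration
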
2559
+
2560
+ # Sets the contents of the itunes:summary element
2561
+ def itunes_summary=(new_itunes_summary)
+ @itunes_summary = new_itunes_summary
2562
+ end
2563
+
2564
+ # Returns the feed item time
2565
+ def time
2566
+ if @time.nil?
2567
+ time_string = XPath.first(root_node, "pubDate/text()").to_s
2568
+ if time_string == ""
2569
+ time_string = XPath.first(root_node, "dc:date/text()").to_s
2570
+ end
2571
+ if time_string == ""
2572
+ time_string = XPath.first(root_node, "issued/text()").to_s
2573
+ end
2574
+ if time_string != ""
2575
+ @time = Time.parse(time_string) rescue Time.now
2576
+ else
2577
+ @time = Time.now
903
2578
  end
904
2579
  end
905
-
906
- # get the item time
907
- time = XPath.first(item_node, "pubDate/text()").to_s
908
- if time == ""
909
- time = XPath.first(item_node, "dc:date/text()").to_s
910
- end
911
- if time == ""
912
- time = XPath.first(item_node, "issued/text()").to_s
913
- end
914
-
915
- # get the item tags
916
- tags_array = []
917
- if tags_array == nil || tags_array.size == 0
918
- tags_array = []
919
- tag_list = XPath.match(item_node, "dc:subject/rdf:Bag/rdf:li/text()")
920
- if tag_list.size > 1
921
- for tag in tag_list
922
- tags_array << tag.to_s.downcase.strip
2580
+ return @time
2581
+ end
2582
+
2583
+ # Sets the feed item time
2584
+ def time=(new_time)
2585
+ @time = new_time
2586
+ end
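
The time reader checks pubDate, then dc:date, then issued, and hands the first non-empty value to Time.parse, falling back to Time.now when parsing fails. A sketch with an illustrative item:

    item.time            # => a Time parsed from pubDate, dc:date, or issued
    item.time = Time.now # pin the timestamp before re-serializing the feed
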
2587
+
2588
+ # Returns the feed item tags
2589
+ def tags
2590
+ # TODO: support the rel="tag" microformat
2591
+ # =======================================
2592
+ if @tags.nil?
2593
+ @tags = []
2594
+ if @tags.nil? || @tags.size == 0
2595
+ @tags = []
2596
+ tag_list = XPath.match(root_node, "dc:subject/rdf:Bag/rdf:li/text()")
2597
+ if tag_list.size > 1
2598
+ for tag in tag_list
2599
+ @tags << tag.to_s.downcase.strip
2600
+ end
923
2601
  end
924
2602
  end
925
- end
926
- if tags_array == nil || tags_array.size == 0
927
- tags_array = []
928
- tag_list = XPath.match(item_node, "category/text()")
929
- if tag_list.size > 1
2603
+ if @tags.nil? || @tags.size == 0
2604
+ # messy effort to find ourselves some tags, mainly for del.icio.us
2605
+ @tags = []
2606
+ rdf_bag = XPath.match(root_node, "taxo:topics/rdf:Bag/rdf:li")
2607
+ if rdf_bag != nil && rdf_bag.size > 0
2608
+ for tag_node in rdf_bag
2609
+ begin
2610
+ tag_url = XPath.first(tag_node, "@resource").to_s
2611
+ tag_match = tag_url.scan(/\/(tag|tags)\/(\w+)/)
2612
+ if tag_match.size > 0
2613
+ @tags << tag_match.first.last.downcase.strip
2614
+ end
2615
+ rescue
2616
+ end
2617
+ end
2618
+ end
2619
+ end
2620
+ if @tags.nil? || @tags.size == 0
2621
+ @tags = []
2622
+ tag_list = XPath.match(root_node, "category/text()")
930
2623
  for tag in tag_list
931
- tags_array << tag.to_s.downcase.strip
2624
+ @tags << tag.to_s.downcase.strip
932
2625
  end
933
2626
  end
934
- end
935
- if tags_array == nil || tags_array.size == 0
936
- tags_array = []
937
- tag_list = XPath.match(item_node, "dc:subject/text()")
938
- if tag_list.size > 1
2627
+ if @tags.nil? || @tags.size == 0
2628
+ @tags = []
2629
+ tag_list = XPath.match(root_node, "dc:subject/text()")
939
2630
  for tag in tag_list
940
- tags_array << tag.to_s.downcase.strip
2631
+ @tags << tag.to_s.downcase.strip
941
2632
  end
942
2633
  end
943
- end
944
- if tags_array == nil || tags_array.size == 0
945
- tags_array = XPath.first(item_node,
946
- "category/text()").to_s.downcase.split(" ")
947
- end
948
- if tags_array == nil || tags_array.size == 0
949
- begin
950
- tags_array = XPath.first(item_node,
951
- "dc:subject/text()").to_s.downcase.split(" ")
952
- rescue
953
- tags_array = []
954
- end
955
- end
956
- if tags_array == nil || tags_array.size == 0
957
- tags_array = []
958
- rdf_bag = XPath.match(item_node,
959
- "taxo:topics/rdf:Bag/rdf:li")
960
- if rdf_bag != nil && rdf_bag.size > 0
961
- for tag_node in rdf_bag
962
- begin
963
- tag_url = XPath.first(tag_node, "@resource").to_s
964
- tag_match = tag_url.scan(/\/(tag|tags)\/(\w+)/)
965
- if tag_match.size > 0
966
- tags_array << tag_match.first.last.downcase.strip
967
- end
968
- rescue
969
- end
2634
+ if @tags.nil? || @tags.size == 0
2635
+ begin
2636
+ @tags = XPath.first(root_node, "itunes:keywords/text()").to_s.downcase.split(" ")
2637
+ rescue
2638
+ @tags = []
970
2639
  end
971
2640
  end
2641
+ if @tags.nil?
2642
+ @tags = []
2643
+ end
2644
+ @tags.uniq!
972
2645
  end
2646
+ return @tags
2647
+ end
973
2648
 
974
- # set all of the properties
975
- if link != ""
976
- self.link = link
977
- else
978
- self.link = nil
979
- end
980
- if title != ""
981
- self.title = title
982
- end
983
- if description != ""
984
- self.description = description.strip
985
- end
986
- if time != ""
987
- self.time = Time.parse(time) rescue Time.now
988
- elsif @time == nil
989
- self.time = Time.now
990
- end
991
- if tags_array.size > 0
992
- self.tag_list = tags_array
993
- end
994
- parse_feed_item_hook(item_data)
995
- if Feed.cache_enabled?
996
- save
2649
+ # Sets the feed item tags
2650
+ def tags=(new_tags)
2651
+ @tags = new_tags
2652
+ end
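
Tag extraction tries several vocabularies in turn (dc:subject rdf:Bag, taxo:topics as used by del.icio.us, category, plain dc:subject, and finally space-separated itunes:keywords), lower-casing and de-duplicating whatever it finds. A hedged sketch (item and values illustrative):

    item.tags                   # => ["ruby", "rss", "atom"]
    item.tags = ["podcasting"]  # replaces the parsed list in memory
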
2653
+
2654
+ # Returns true if this feed item contains explicit material. If the whole
2655
+ # feed has been marked as explicit, this will return true even if the item
2656
+ # itself isn't marked as explicit.
2657
+ def explicit?
2658
+ if @explicit.nil?
2659
+ if XPath.first(root_node,
2660
+ "media:adult/text()").to_s.downcase == "true" ||
2661
+ XPath.first(root_node,
2662
+ "itunes:explicit/text()").to_s.downcase == "yes" ||
2663
+ XPath.first(root_node,
2664
+ "itunes:explicit/text()").to_s.downcase == "true" ||
2665
+ feed.explicit
2666
+ @explicit = true
2667
+ else
2668
+ @explicit = false
2669
+ end
997
2670
  end
998
- return self
2671
+ return @explicit
999
2672
  end
1000
2673
 
1001
- def build_feed_item_hook(feed_type, version, xml_builder)
2674
+ # Sets whether or not the feed item contains explicit material
2675
+ def explicit=(new_explicit)
2676
+ @explicit = (new_explicit ? true : false)
1002
2677
  end
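
explicit? treats media:adult, either spelling of itunes:explicit ("yes" or "true"), and the parent feed's explicit flag as equivalent, so an item inherits a feed-level setting. A sketch with an illustrative item:

    if item.explicit?
      # skip or label material flagged as explicit at either level
    end
    item.explicit = true  # force the flag regardless of the source markup
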
1003
2678
 
1004
- def build_feed_item(feed_type, version, xml_builder=Builder::XmlMarkup.new(:indent => 2))
2679
+ # A hook method that is called during the feed generation process. Overriding this method
2680
+ # will enable additional content to be inserted into the feed.
2681
+ def build_xml_hook(feed_type, version, xml_builder)
2682
+ return nil
2683
+ end
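
Overriding build_xml_hook in a subclass is the supported way to append extra elements while an item is serialized; the hook receives the same Builder object used for the standard elements. A minimal sketch, assuming the enclosing class is FeedTools::FeedItem (the subclass and element names are illustrative):

    class MyFeedItem < FeedTools::FeedItem
      def build_xml_hook(feed_type, version, xml_builder)
        # emit one custom element alongside the standard ones
        xml_builder.tag!("ex:custom", "extra data") if feed_type == "rss"
        return nil
      end
    end
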
2684
+
2685
+ # Generates XML based on the content of the feed item
2686
+ def build_xml(feed_type="rss", version=0.0, xml_builder=Builder::XmlMarkup.new(:indent => 2))
1005
2687
  if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1)
1006
2688
  # RDF-based rss format
1007
2689
  if link.nil?
@@ -1026,15 +2708,17 @@ module FeedTools
1026
2708
  unless time.nil?
1027
2709
  xml_builder.tag!("dc:date", time.iso8601)
1028
2710
  end
1029
- unless tags.nil?
2711
+ unless tags.nil? || tags.size == 0
1030
2712
  xml_builder.tag!("dc:subject") do
1031
2713
  xml_builder.tag!("rdf:Bag") do
1032
- for tag in tag_list
2714
+ for tag in tags
1033
2715
  xml_builder.tag!("rdf:li", tag)
1034
2716
  end
1035
2717
  end
1036
2718
  end
2719
+ xml_builder.tag!("itunes:keywords", tags.join(" "))
1037
2720
  end
2721
+ build_xml_hook(feed_type, version, xml_builder)
1038
2722
  end
1039
2723
  elsif feed_type == "rss"
1040
2724
  # normal rss format
@@ -1051,15 +2735,17 @@ module FeedTools
1051
2735
  unless time.nil?
1052
2736
  xml_builder.pubDate(time.rfc822)
1053
2737
  end
1054
- unless tags.nil?
2738
+ unless tags.nil? || tags.size == 0
1055
2739
  xml_builder.tag!("dc:subject") do
1056
2740
  xml_builder.tag!("rdf:Bag") do
1057
- for tag in tag_list
2741
+ for tag in tags
1058
2742
  xml_builder.tag!("rdf:li", tag)
1059
2743
  end
1060
2744
  end
1061
2745
  end
2746
+ xml_builder.tag!("itunes:keywords", tags.join(" "))
1062
2747
  end
2748
+ build_xml_hook(feed_type, version, xml_builder)
1063
2749
  end
1064
2750
  elsif feed_type == "atom"
1065
2751
  # normal atom format
@@ -1083,31 +2769,34 @@ module FeedTools
1083
2769
  unless time.nil?
1084
2770
  xml_builder.issued(time.iso8601)
1085
2771
  end
1086
- unless tags.nil?
1087
- for tag in tag_list
2772
+ unless tags.nil? || tags.size == 0
2773
+ for tag in tags
1088
2774
  xml_builder.category(tag)
1089
2775
  end
1090
2776
  end
2777
+ build_xml_hook(feed_type, version, xml_builder)
1091
2778
  end
1092
2779
  end
1093
2780
  end
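
build_xml can also be called directly to serialize a single item; the feed_type and version arguments choose between the RDF-based RSS branch, plain RSS, and Atom. A hedged sketch (item illustrative; the accumulated markup is read back from the Builder target):

    xml_builder = Builder::XmlMarkup.new(:indent => 2)
    item.build_xml("rss", 2.0, xml_builder)
    rss_fragment = xml_builder.target!
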
1094
-
1095
- # Saves the current state of the feed item to the database unless the feed lacks
1096
- # a remote location
1097
- def save
1098
- unless feed.nil? || feed.url.nil? || feed.url == ""
1099
- super
1100
- end
1101
- end
2781
+
2782
+ alias_method :tagline, :description
2783
+ alias_method :tagline=, :description=
2784
+ alias_method :subtitle, :description
2785
+ alias_method :subtitle=, :description=
2786
+ alias_method :abstract, :description
2787
+ alias_method :abstract=, :description=
2788
+ alias_method :content, :description
2789
+ alias_method :content=, :description=
2790
+ alias_method :guid, :id
2791
+ alias_method :guid=, :id=
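
The alias block above makes tagline, subtitle, abstract, and content interchangeable with description, and guid interchangeable with id, for example (illustrative item):

    item.description = "<p>Hello</p>"
    item.content          # => "<p>Hello</p>"
    item.guid == item.id  # => true
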
1102
2792
  end
1103
2793
  end
1104
2794
 
1105
- module REXML
1106
- class Element
1107
- # small extension to REXML to simplify parsing of xhtml feed items
1108
- def inner_xml
2795
+ module REXML #:nodoc:
2796
+ class Element #:nodoc:
2797
+ def inner_xml #:nodoc:
1109
2798
  result = ""
1110
- each_child do |child|
2799
+ self.each_child do |child|
1111
2800
  result << child.to_s
1112
2801
  end
1113
2802
  return result
@@ -1116,11 +2805,8 @@ module REXML
1116
2805
  end
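
The inner_xml extension above serializes an element's child nodes without the enclosing tag, which is what lets embedded xhtml content be captured verbatim. A sketch using plain REXML with FeedTools loaded (sample markup illustrative):

    doc = REXML::Document.new("<entry><body><p>Hello <b>world</b></p></body></entry>")
    REXML::XPath.first(doc.root, "body").inner_xml
    # => "<p>Hello <b>world</b></p>"
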
1117
2806
 
1118
2807
  begin
1119
- FeedTools::Feed.prepare_connection
1120
- unless FeedTools::Feed.cache_exists?
1121
- FeedTools::Feed.create_cache
2808
+ unless FeedTools.feed_cache.nil?
2809
+ FeedTools.feed_cache.initialize_cache
1122
2810
  end
1123
2811
  rescue
1124
- # Nothing can be done until someone sets up the database connection.
1125
- # We'll just assume for now that the user will take care of that.
1126
2812
  end