feedtools 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. data/CHANGELOG +11 -0
  2. data/lib/feed_tools.rb +2496 -810
  3. data/lib/feed_tools/vendor/builder.rb +2 -0
  4. data/lib/feed_tools/vendor/builder/blankslate.rb +2 -0
  5. data/lib/feed_tools/vendor/builder/xmlbase.rb +2 -1
  6. data/lib/feed_tools/vendor/builder/xmlevents.rb +2 -0
  7. data/lib/feed_tools/vendor/builder/xmlmarkup.rb +4 -2
  8. data/lib/feed_tools/vendor/htree.rb +97 -0
  9. data/lib/feed_tools/vendor/htree/container.rb +10 -0
  10. data/lib/feed_tools/vendor/htree/context.rb +67 -0
  11. data/lib/feed_tools/vendor/htree/display.rb +27 -0
  12. data/lib/feed_tools/vendor/htree/doc.rb +149 -0
  13. data/lib/feed_tools/vendor/htree/elem.rb +262 -0
  14. data/lib/feed_tools/vendor/htree/encoder.rb +163 -0
  15. data/lib/feed_tools/vendor/htree/equality.rb +218 -0
  16. data/lib/feed_tools/vendor/htree/extract_text.rb +37 -0
  17. data/lib/feed_tools/vendor/htree/fstr.rb +33 -0
  18. data/lib/feed_tools/vendor/htree/gencode.rb +97 -0
  19. data/lib/feed_tools/vendor/htree/htmlinfo.rb +672 -0
  20. data/lib/feed_tools/vendor/htree/inspect.rb +108 -0
  21. data/lib/feed_tools/vendor/htree/leaf.rb +94 -0
  22. data/lib/feed_tools/vendor/htree/loc.rb +367 -0
  23. data/lib/feed_tools/vendor/htree/modules.rb +48 -0
  24. data/lib/feed_tools/vendor/htree/name.rb +124 -0
  25. data/lib/feed_tools/vendor/htree/output.rb +207 -0
  26. data/lib/feed_tools/vendor/htree/parse.rb +407 -0
  27. data/lib/feed_tools/vendor/htree/raw_string.rb +124 -0
  28. data/lib/feed_tools/vendor/htree/regexp-util.rb +15 -0
  29. data/lib/feed_tools/vendor/htree/rexml.rb +130 -0
  30. data/lib/feed_tools/vendor/htree/scan.rb +166 -0
  31. data/lib/feed_tools/vendor/htree/tag.rb +111 -0
  32. data/lib/feed_tools/vendor/htree/template.rb +909 -0
  33. data/lib/feed_tools/vendor/htree/text.rb +115 -0
  34. data/lib/feed_tools/vendor/htree/traverse.rb +465 -0
  35. data/rakefile +1 -1
  36. data/test/rss_test.rb +97 -0
  37. metadata +30 -1
data/CHANGELOG CHANGED
@@ -1,3 +1,14 @@
+ == FeedTools 0.2.0
+ * more complete support for rss, atom, cdf
+ * modular caching mechanism
+ * lazy parsing
+ * HTML sanitization of possibly dangerous fields
+ * HTML tidy support
+ * support for podcasts and vidlogs
+ * corrected handling of http redirection
+ * made http header information available
+ * file: protocol support
+ * custom parsing can be done using the find_node and find_all_nodes methods
  == FeedTools 0.1.0
  * basic support for rss, atom, cdf
  * basic caching using active record

data/lib/feed_tools.rb CHANGED
@@ -21,987 +21,2669 @@
21
21
  # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
22
  #++
23
23
 
24
- FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] || ENV['RAILS_ENV'] || 'production'
24
+ FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] ||
25
+ ENV['RAILS_ENV'] ||
26
+ 'production' # :nodoc:
27
+
28
+ FEED_TOOLS_VERSION = "0.2.0"
25
29
 
26
30
  $:.unshift(File.dirname(__FILE__))
27
31
  $:.unshift(File.dirname(__FILE__) + "/../../activerecord/lib")
32
+ $:.unshift(File.dirname(__FILE__) + "/feed_tools/vendor")
33
+
34
+ require 'rubygems'
35
+ require 'active_record'
28
36
 
29
37
  begin
30
- require 'active_record'
38
+ require 'builder'
31
39
  rescue LoadError
32
- require 'rubygems'
33
- require_gem 'activerecord'
40
+ # RubyGems version is not available, use included Builder
41
+ require 'feed_tools/vendor/builder'
34
42
  end
35
43
 
36
44
  begin
37
- require 'rubygems'
38
- require 'builder'
45
+ require 'tidy'
39
46
  rescue LoadError
40
- # RubyGems is not available, use included Builder
41
- $:.unshift(File.dirname(__FILE__) + "/feed_tools/vendor")
42
- require 'feed_tools/vendor/builder'
47
+ # Ignore the error for now.
43
48
  end
44
49
 
45
- require 'open-uri'
46
- require 'time'
50
+ require 'feed_tools/vendor/htree'
51
+
52
+ require 'net/http'
53
+ require 'net/https'
54
+ require 'net/ftp'
55
+
47
56
  require 'rexml/document'
48
- require 'yaml'
57
+
58
+ require 'iconv'
59
+ require 'uri'
60
+ require 'time'
49
61
  require 'cgi'
62
+ require 'pp'
63
+ require 'yaml'
50
64
 
65
+ #= feed_tools.rb
66
+ #
67
+ # FeedTools was designed to be a simple XML feed parser, generator, and translator with a built-in
68
+ # caching system.
69
+ #
70
+ #== Example
71
+ # slashdot_feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
72
+ # slashdot_feed.title
73
+ # => "Slashdot"
74
+ # slashdot_feed.description
75
+ # => "News for nerds, stuff that matters"
76
+ # slashdot_feed.link
77
+ # => "http://slashdot.org/"
78
+ # slashdot_feed.items.first.find_node("slash:hitparade/text()").to_s
79
+ # => "43,37,28,23,11,3,1"
51
80
  module FeedTools
52
- class Feed < ActiveRecord::Base
53
- include REXML
54
81
 
55
- has_many :feed_items_unsorted, :class_name => "FeedItem"
56
-
57
- def initialize
58
- @live = false
59
- @feed_items_unsorted = nil
60
- super
61
- end
62
-
63
- # Loads the feed specified by the url, pulling the data from the cache if it hasn't expired
64
- # Be aware that this method translates from the feed: and rss: pseudo-protocols to the
65
- # http: protocol as needed. This means that if you pass in a feed url that looks like
66
- # 'feed://www.anywhere.com/feed.xml' it will end up being stored in the cache as
67
- # 'http://www.anywhere.com/feed.xml' instead. This does affect the usage of methods like
68
- # find_by_url, but otherwise should be fairly transparent.
69
- def Feed.open(url)
70
- # deal with all of the ugly possibilities involved in the rss: and feed: pseudo-protocols
71
- if (url =~ /feed:/) == 0
72
- url = url.gsub(/feed:\/\/http:\/\/\//, "http://")
73
- url = url.gsub(/feed:\/\/http:\/\//, "http://")
74
- url = url.gsub(/feed:http:\/\/\//, "http://")
75
- url = url.gsub(/feed:http:\/\//, "http://")
76
- url = url.gsub(/feed:\/\/\//, "http://")
77
- url = url.gsub(/feed:\/\//, "http://")
78
- url = url.gsub(/feed:\//, "http://")
79
- url = url.gsub(/feed:/, "http://")
80
- end
81
- if (url =~ /rss:/) == 0
82
- url = url.gsub(/rss:\/\/http:\/\/\//, "http://")
83
- url = url.gsub(/rss:\/\/http:\/\//, "http://")
84
- url = url.gsub(/rss:http:\/\/\//, "http://")
85
- url = url.gsub(/rss:http:\/\//, "http://")
86
- url = url.gsub(/rss:\/\/\//, "http://")
87
- url = url.gsub(/rss:\/\//, "http://")
88
- url = url.gsub(/rss:\//, "http://")
89
- url = url.gsub(/rss:/, "http://")
90
- end
91
-
92
- feed = nil
82
+ # The default caching mechanism for the FeedTools module
83
+ class DatabaseFeedCache < ActiveRecord::Base
84
+ # Overrides the default table name to use the "feeds" table.
85
+ def self.table_name() "feeds" end
86
+
87
+ # If ActiveRecord is not already connected, attempts to find a configuration file and use
88
+ # it to open a connection for ActiveRecord.
89
+ # This method is probably unnecessary for anything but testing and debugging purposes.
90
+ # In a Rails environment, the connection will already have been established
91
+ # and this method will simply do nothing.
92
+ #
93
+ # This method should not raise any exceptions because it's designed to be run only when
94
+ # the module is first loaded. If it fails, the user should get an exception when they
95
+ # try to perform some action that makes use of the caching functionality, and not until.
96
+ def DatabaseFeedCache.initialize_cache
97
+ # Establish a connection if we don't already have one
93
98
  begin
94
- feed = Feed.find_by_url(url)
95
- rescue ActiveRecord::StatementInvalid
96
- # make sure that the necessary tables are present and recover if possible
97
- FeedTools::Feed.prepare_connection
98
- unless FeedTools::Feed.cache_exists?
99
- FeedTools::Feed.create_cache
99
+ ActiveRecord::Base.connection
100
+ rescue
101
+ begin
102
+ possible_config_files = [
103
+ "./config/database.yml",
104
+ "../database.yml",
105
+ "./database.yml"
106
+ ]
107
+ database_config_file = nil
108
+ for file in possible_config_files
109
+ if File.exists? file
110
+ database_config_file = file
111
+ break
112
+ end
113
+ end
114
+ database_config_hash = File.open(database_config_file) do |file|
115
+ config_hash = YAML::load(file)
116
+ unless config_hash[FEED_TOOLS_ENV].nil?
117
+ config_hash = config_hash[FEED_TOOLS_ENV]
118
+ end
119
+ config_hash
120
+ end
121
+ ActiveRecord::Base.configurations = database_config_hash
122
+ ActiveRecord::Base.establish_connection(database_config_hash)
123
+ ActiveRecord::Base.connection
124
+ rescue
100
125
  end
101
- feed = Feed.find_by_url(url)
102
126
  end
103
- unless feed.nil?
104
- feed.update_if_needed
105
- else
106
- feed = Feed.new
107
- feed.url = url
108
- feed.load_remote_feed
127
+ # Verify that the necessary database tables are in place
128
+ # and if they're missing, create them
129
+ unless DatabaseFeedCache.table_exists?
130
+ DatabaseFeedCache.create_table
109
131
  end
110
- return feed
111
- end
112
-
113
- # Checks if the feed has expired and updates if it has
114
- def update_if_needed
115
- if expired?
116
- load_remote_feed
117
- end
118
- end
119
-
120
- # Verifies that the table structure exists
121
- def Feed.cache_exists?
122
- return Feed.table_exists? && FeedItem.table_exists?
132
+ return nil
123
133
  end
124
-
125
- # Verifies that the required fields exist; additional ones added by the user are fine
126
- def Feed.table_exists?
134
+
135
+ # True if the appropriate database table already exists
136
+ def DatabaseFeedCache.table_exists?
127
137
  begin
128
- connection.execute "select id, url, link, image_link, title, description, " +
129
- "tags, last_updated, etag, time_to_live from feeds limit 1"
138
+ ActiveRecord::Base.connection.execute "select id, url, title, " +
139
+ "link, xml_data, http_headers, last_retrieved " +
140
+ "from feeds limit 1"
130
141
  rescue ActiveRecord::StatementInvalid
131
142
  return false
143
+ rescue
144
+ return false
132
145
  end
133
146
  return true
134
147
  end
135
-
136
- # Generates the table structure if necessary
137
- def Feed.create_cache
138
- unless Feed.cache_exists?
139
- feed_items_mysql = <<-SQL_END
140
- CREATE TABLE `feed_items` (
141
- `id` int(6) unsigned NOT NULL auto_increment,
142
- `feed_id` int(6) unsigned NOT NULL default '0',
143
- `link` varchar(255) default NULL,
144
- `title` varchar(255) default NULL,
145
- `author` varchar(255) default NULL,
146
- `description` text default NULL,
147
- `time` datetime NOT NULL default '0000-00-00 00:00:00',
148
- `tags` varchar(255) default NULL,
149
- PRIMARY KEY (`id`)
150
- ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
151
- SQL_END
152
- feed_items_sqlite = <<-SQL_END
153
- CREATE TABLE 'feed_items' (
154
- 'id' INTEGER PRIMARY KEY NOT NULL,
155
- 'feed_id' INTEGER NOT NULL,
156
- 'link' VARCHAR(255) DEFAULT NULL,
157
- 'title' VARCHAR(255) DEFAULT NULL,
158
- 'author' VARCHAR(255) DEFAULT NULL,
159
- 'description' TEXT DEFAULT NULL,
160
- 'time' DATETIME DEFAULT NULL,
161
- 'tags' VARCHAR(255) DEFAULT NULL
162
- );
163
- SQL_END
164
- feed_items_psql = <<-SQL_END
165
- CREATE TABLE feed_items (
166
- id SERIAL PRIMARY KEY NOT NULL,
167
- feed_id int REFERENCES feeds,
168
- link varchar(255) default NULL,
169
- title varchar(255) default NULL,
170
- author varchar(255) default NULL,
171
- description text default NULL,
172
- time datetime default NULL,
173
- tags varchar(255) default NULL
174
- );
175
- SQL_END
176
- unless FeedItem.table_exists?
177
- table_creation_sql = nil
178
- if configurations["adapter"] == "mysql"
179
- table_creation_sql = feed_items_mysql
180
- elsif configurations["adapter"] == "sqlite"
181
- table_creation_sql = feed_items_sqlite
182
- elsif configurations["adapter"] == "postgresql"
183
- table_creation_sql = feeds_psql
184
- end
185
- if table_creation_sql.nil?
186
- raise "Could not build feed_items table."
187
- else
188
- connection.execute table_creation_sql
189
- end
190
- end
148
+
149
+ # Creates the appropriate database table
150
+ def DatabaseFeedCache.create_table
151
+ unless DatabaseFeedCache.table_exists?
191
152
  feeds_mysql = <<-SQL_END
192
153
  CREATE TABLE `feeds` (
193
- `id` int(6) unsigned NOT NULL auto_increment,
194
- `url` varchar(255) NOT NULL default '',
195
- `link` varchar(255) NOT NULL default '',
196
- `image_link` varchar(255) default NULL,
197
- `title` varchar(255) default NULL,
198
- `description` text default NULL,
199
- `tags` varchar(255) default NULL,
200
- `last_updated` datetime default NULL,
201
- `etag` varchar(255) default NULL,
202
- `time_to_live` int(4) default NULL,
154
+ `id` int(10) unsigned NOT NULL auto_increment,
155
+ `url` varchar(255) default NULL,
156
+ `title` varchar(255) default NULL,
157
+ `link` varchar(255) default NULL,
158
+ `xml_data` longtext default NULL,
159
+ `http_headers` text default NULL,
160
+ `last_retrieved` datetime default NULL,
203
161
  PRIMARY KEY (`id`)
204
162
  ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
205
163
  SQL_END
206
164
  feeds_sqlite = <<-SQL_END
207
165
  CREATE TABLE 'feeds' (
208
- 'id' INTEGER PRIMARY KEY NOT NULL,
209
- 'url' VARCHAR(255) DEFAULT NULL,
210
- 'link' VARCHAR(255) DEFAULT NULL,
211
- 'image_link' VARCHAR(255) DEFAULT NULL,
212
- 'title' VARCHAR(255) DEFAULT NULL,
213
- 'description' TEXT DEFAULT NULL,
214
- 'tags' VARCHAR(255) DEFAULT NULL,
215
- 'last_updated' DATETIME DEFAULT NULL,
216
- 'etag' VARCHAR(255) DEFAULT NULL,
217
- 'time_to_live' INTEGER DEFAULT NULL
166
+ 'id' INTEGER PRIMARY KEY NOT NULL,
167
+ 'url' VARCHAR(255) DEFAULT NULL,
168
+ 'title' VARCHAR(255) DEFAULT NULL,
169
+ 'link' VARCHAR(255) DEFAULT NULL,
170
+ 'image_link' VARCHAR(255) DEFAULT NULL,
171
+ 'xml_data' TEXT DEFAULT NULL,
172
+ 'http_headers' TEXT DEFAULT NULL,
173
+ 'last_retrieved' DATETIME DEFAULT NULL,
218
174
  );
219
175
  SQL_END
220
176
  feeds_psql = <<-SQL_END
221
177
  CREATE TABLE feeds (
222
- id SERIAL PRIMARY KEY NOT NULL,
223
- url varchar(255) default NULL,
224
- link varchar(255) default NULL,
225
- image_link varchar(255) default NULL,
226
- title varchar(255) default NULL,
227
- description text default NULL,
228
- tags varchar(255) default NULL,
229
- last_updated datetime default NULL,
230
- etag varchar(255) default NULL,
231
- time_to_live int default NULL
178
+ id SERIAL PRIMARY KEY NOT NULL,
179
+ url varchar(255) default NULL,
180
+ title varchar(255) default NULL,
181
+ link varchar(255) default NULL,
182
+ xml_data text default NULL,
183
+ http_headers text default NULL,
184
+ last_retrieved datetime default NULL,
232
185
  );
233
186
  SQL_END
234
- unless Feed.table_exists?
235
- table_creation_sql = nil
236
- if configurations["adapter"] == "mysql"
237
- table_creation_sql = feeds_mysql
238
- elsif configurations["adapter"] == "sqlite"
239
- table_creation_sql = feeds_sqlite
240
- elsif configurations["adapter"] == "postgresql"
241
- table_creation_sql = feeds_psql
242
- end
243
- if table_creation_sql.nil?
244
- raise "Could not build feed_items table."
245
- else
246
- connection.execute table_creation_sql
247
- end
187
+ table_creation_sql = nil
188
+ if configurations["adapter"] == "mysql"
189
+ table_creation_sql = feeds_mysql
190
+ elsif configurations["adapter"] == "sqlite"
191
+ table_creation_sql = feeds_sqlite
192
+ elsif configurations["adapter"] == "postgresql"
193
+ table_creation_sql = feeds_psql
194
+ end
195
+ if table_creation_sql.nil?
196
+ raise "Could not build feed_items table."
197
+ else
198
+ connection.execute table_creation_sql
248
199
  end
249
200
  end
250
201
  end
251
-
252
- # Removes all feed entries from the cache
253
- # This could obviously be a very dangerous operation if you use the cache for more than simply
254
- # caching the feeds.
255
- def Feed.clear_cache
256
- FeedItem.delete_all
257
- Feed.delete_all
202
+ end
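
Outside of Rails, the default cache can be bootstrapped by hand along the lines sketched below (the sqlite settings are illustrative assumptions, not part of this release; in a Rails app the connection already exists and initialize_cache is effectively a no-op):

  require 'rubygems'
  require 'feed_tools'

  # ./database.yml is one of the locations initialize_cache searches;
  # FEED_TOOLS_ENV defaults to "production".
  #
  #   production:
  #     adapter: sqlite
  #     database: feed_cache.db
  #
  # Opens the ActiveRecord connection and creates the "feeds" table
  # if it is missing.
  FeedTools::DatabaseFeedCache.initialize_cache
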
203
+
204
+ # Error raised when a feed cannot be retrieved
205
+ class FeedAccessError < StandardError
206
+ end
207
+
208
+ # Quick method of enabling small classes to have their attributes
209
+ # accessible as a dictionary.
210
+ module AttributeDictionary # :nodoc:
211
+ # Access the attributes as a dictionary
212
+ def [](key)
213
+ # Assignment, and destructive methods should not be
214
+ # accessed like this.
215
+ return nil if key[-1..-1] == "=" || key[-1..-1] == "!"
216
+ return nil unless self.method(key).arity == 0
217
+ return self.send(key)
258
218
  end
259
-
260
- # Removes all feed items from the cache and resets the last updated time for all feeds
261
- # This is probably much safer than the clear_cache method
262
- def Feed.expire_cache
263
- FeedItem.delete_all
264
- Feed.update_all("last_updated = NULL")
219
+
220
+ # Access the attributes as a dictionary
221
+ def []=(key, value)
222
+ # Assignment, and destructive methods should not be
223
+ # accessed like this.
224
+ return nil if key[-1..-1] == "=" || key[-1..-1] == "!"
225
+ return nil unless self.method(key + "=").arity == 1
226
+ return self.send(key + "=", value)
265
227
  end
266
-
267
- # Removes all feed items older than the specified number of seconds
268
- def Feed.purge_cache(purge_time=1.week)
269
- purge_date = (Time.now - purge_time).strftime("%Y-%m-%d %H:%M:%S")
270
- FeedItem.delete_all("time < '#{purge_date}'")
228
+ end
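
In practice the mixin just lets readers be called hash-style (the return values follow the Slashdot example from the header comment):

  feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
  feed.title       # => "Slashdot"
  feed["title"]    # => "Slashdot" -- same reader, dictionary-style
  feed["title="]   # => nil, assignment and bang methods are refused here
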
229
+
230
+ @feed_cache = DatabaseFeedCache
231
+ @user_agent = "FeedTools/#{FEED_TOOLS_VERSION} " +
232
+ "+http://www.sporkmonger.com/projects/feedtools/"
233
+
234
+ # Returns the current caching mechanism.
235
+ def FeedTools.feed_cache
236
+ return @feed_cache
237
+ end
238
+
239
+ # Sets the current caching mechanism. If set to nil, disables caching.
240
+ # Default is the DatabaseFeedCache class.
241
+ #
242
+ # Objects of this class must accept the following messages:
243
+ # url
244
+ # url=
245
+ # title
246
+ # title=
247
+ # link
248
+ # link=
249
+ # xml_data
250
+ # xml_data=
251
+ # etag
252
+ # etag=
253
+ # last_modified
254
+ # last_modified=
255
+ # save
256
+ #
257
+ # Additionally, the class itself must accept the following messages:
258
+ # find_by_id
259
+ # find_by_url
260
+ # initialize_cache
261
+ def FeedTools.feed_cache=(new_feed_cache)
262
+ # TODO: ensure that the feed cache class actually does those things.
263
+ # ==================================================================
264
+ @feed_cache = new_feed_cache
265
+ end
266
+
267
+ # Returns the currently used user agent string.
268
+ def FeedTools.user_agent
269
+ return @user_agent
270
+ end
271
+
272
+ # Sets the user agent string to send in the http headers.
273
+ def FeedTools.user_agent=(new_user_agent)
274
+ @user_agent = new_user_agent
275
+ end
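
For example (hypothetical agent string):

  FeedTools.user_agent = "MyAggregator/1.0 +http://aggregator.example/"
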
276
+
277
+ # Returns true if the html tidy module can be used.
278
+ #
279
+ # Obviously, you need the tidy gem installed in order to run with html
280
+ # tidy features turned on.
281
+ #
282
+ # This method does a fairly complicated, and probably unnecessarily
283
+ # desperate search for the libtidy library. If you want this thing to
284
+ # execute fast, the best thing to do is to set Tidy.path ahead of time.
285
+ # If Tidy.path is set, this method doesn't do much. If it's not set,
286
+ # it will do it's darnedest to find the libtidy library. If you set
287
+ # the LIBTIDYPATH environment variable to the libtidy library, it should
288
+ # be able to find it.
289
+ #
290
+ # Once the library is located, this method will run much faster.
291
+ def FeedTools.tidy_enabled?
292
+ # This is an override variable to keep tidy from being used even if it
293
+ # is available.
294
+ if @force_tidy_enabled == false
295
+ return false
271
296
  end
272
-
273
- # If ActiveRecord is not already connected, attempts to find a configuration file and use
274
- # it to open a connection for ActiveRecord.
275
- # This method is probably unnecessary for anything but testing and debugging purposes.
276
- def Feed.prepare_connection
297
+ if @tidy_enabled.nil? || @tidy_enabled == false
298
+ @tidy_enabled = false
277
299
  begin
278
- ActiveRecord::Base.connection
279
- rescue
280
- possible_config_files = [
281
- "./config/database.yml",
282
- "./database.yml"
283
- ]
284
- database_config_file = nil
285
- for file in possible_config_files
286
- if File.exists? file
287
- database_config_file = file
288
- break
289
- end
290
- end
291
- database_config_hash = File.open(database_config_file) do |file|
292
- config_hash = YAML::load(file)
293
- unless config_hash[FEED_TOOLS_ENV].nil?
294
- config_hash = config_hash[FEED_TOOLS_ENV]
295
- end
296
- config_hash
297
- end
298
- ActiveRecord::Base.configurations = database_config_hash
299
- ActiveRecord::Base.establish_connection(database_config_hash)
300
- ActiveRecord::Base.connection
300
+ require 'tidy'
301
+ if Tidy.path.nil?
302
+ # *Shrug*, just brute force it, I guess. There's a lot of places
303
+ # this thing might be hiding in, depending on platform and general
304
+ # sanity of the person who installed the thing. Most of these are
305
+ # probably unlikely, but it's not like checking unlikely locations
306
+ # hurts. Much. Especially if you actually find it.
307
+ libtidy_locations = [
308
+ '/usr/local/lib/libtidy.dylib',
309
+ '/opt/local/lib/libtidy.dylib',
310
+ '/usr/lib/libtidy.dylib',
311
+ '/usr/local/lib/tidylib.dylib',
312
+ '/opt/local/lib/tidylib.dylib',
313
+ '/usr/lib/tidylib.dylib',
314
+ '/usr/local/lib/tidy.dylib',
315
+ '/opt/local/lib/tidy.dylib',
316
+ '/usr/lib/tidy.dylib',
317
+ '/usr/local/lib/libtidy.so',
318
+ '/opt/local/lib/libtidy.so',
319
+ '/usr/lib/libtidy.so',
320
+ '/usr/local/lib/tidylib.so',
321
+ '/opt/local/lib/tidylib.so',
322
+ '/usr/lib/tidylib.so',
323
+ '/usr/local/lib/tidy.so',
324
+ '/opt/local/lib/tidy.so',
325
+ '/usr/lib/tidy.so',
326
+ 'C:\Program Files\Tidy\tidy.dll',
327
+ 'C:\Tidy\tidy.dll',
328
+ '/usr/local/lib',
329
+ '/opt/local/lib',
330
+ '/usr/lib'
331
+ ]
332
+ # We just made this thing up, but if someone sets it, we'll
333
+ # go ahead and check it
334
+ unless ENV['LIBTIDYPATH'].nil?
335
+ libtidy_locations =
336
+ libtidy_locations.reverse.push(ENV['LIBTIDYPATH'])
337
+ end
338
+ for path in libtidy_locations
339
+ if File.exists? path
340
+ if File.ftype(path) == "file"
341
+ Tidy.path = path
342
+ @tidy_enabled = true
343
+ break
344
+ elsif File.ftype(path) == "directory"
345
+ # Ok, now perhaps we're getting a bit more desperate
346
+ lib_paths =
347
+ `find #{path} -name '*tidy*' | grep '\\.\\(so\\|dylib\\)$'`
348
+ # If there's more than one, grab the first one and
349
+ # hope for the best, and if it doesn't work, then blame the
350
+ # user for not specifying more accurately.
351
+ tidy_path = lib_paths.split("\n").first
352
+ unless tidy_path.nil?
353
+ Tidy.path = tidy_path
354
+ @tidy_enabled = true
355
+ break
356
+ end
357
+ end
358
+ end
359
+ end
360
+ # Still couldn't find it.
361
+ unless @tidy_enabled
362
+ @tidy_enabled = false
363
+ end
364
+ else
365
+ @tidy_enabled = true
366
+ end
367
+ rescue LoadError
368
+ # Tidy not installed, disable features that rely on tidy.
369
+ @tidy_enabled = false
301
370
  end
302
371
  end
303
-
304
- def Feed.cache_enabled?
305
- return true
372
+ return @tidy_enabled
373
+ end
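
As the comment above suggests, the search can be avoided by pointing the tidy gem at the library up front (the path is an assumption for a typical Linux install):

  require 'tidy'

  Tidy.path = '/usr/lib/libtidy.so'  # or export LIBTIDYPATH and let the search find it
  FeedTools.tidy_enabled?            # => true, without the filesystem hunt
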
374
+
375
+ # Turns html tidy support on or off. Be aware, that setting this to true
376
+ # does not mean tidy will be enabled. It simply means that tidy will be
377
+ # enabled if it is available to be enabled.
378
+ def FeedTools.tidy_enabled=(new_tidy_enabled)
379
+ @force_tidy_enabled = new_tidy_enabled
380
+ end
381
+
382
+ # Attempts to ensure that the passed url is valid and sane. Accepts very, very ugly urls
383
+ # and makes every effort to figure out what it was supposed to be. Also translates from
384
+ # the feed: and rss: pseudo-protocols to the http: protocol.
385
+ def FeedTools.normalize_url(url)
386
+ if url.nil? || url == ""
387
+ return nil
306
388
  end
389
+ normalized_url = url
307
390
 
308
- def title
309
- return (self["title"] or "Untitled Feed")
391
+ # if a url begins with the '/' character, it only makes sense that they
392
+ # meant to be using a file:// url. Fix it for them.
393
+ if normalized_url.length > 0 && normalized_url[0..0] == "/"
394
+ normalized_url = "file://" + normalized_url
310
395
  end
311
396
 
312
- # Optional feed attribute.
313
- # If you want to use it, the database table needs to have a language field added, otherwise
314
- # it will just default to "en-US".
315
- def language
316
- begin
317
- return (self["language"] or "en-US")
318
- rescue
319
- return "en-US"
320
- end
397
+ # if a url begins with javascript:, it's quite possibly an attempt at
398
+ # doing something malicious. Let's keep that from getting anywhere,
399
+ # shall we?
400
+ if (normalized_url.downcase =~ /javascript:/) != nil
401
+ return "#"
321
402
  end
322
403
 
323
- def live?
324
- if @live
325
- return true
326
- else
327
- return false
328
- end
329
- end
404
+ # deal with all of the many ugly possibilities involved in the rss:
405
+ # and feed: pseudo-protocols (incidentally, whose crazy idea was this
406
+ # mess?)
407
+ normalized_url.gsub!(/^http:\/*(feed:\/*)?/, "http://")
408
+ normalized_url.gsub!(/^http:\/*(rss:\/*)?/, "http://")
409
+ normalized_url.gsub!(/^feed:\/*(http:\/*)?/, "http://")
410
+ normalized_url.gsub!(/^rss:\/*(http:\/*)?/, "http://")
411
+ normalized_url.gsub!(/^file:\/*/, "file:///")
412
+ normalized_url.gsub!(/^https:\/*/, "https://")
413
+ # fix (very) bad urls (usually of the user-entered sort)
414
+ normalized_url.gsub!(/^http:\/*(http:\/*)*/, "http://")
415
+ if (normalized_url =~ /^file:/) == 0
416
+ # fix bad Windows-based entries
417
+ normalized_url.gsub!(/file:\/\/\/([a-zA-Z]):/, 'file:///\1|')
330
418
 
331
- def expired?
332
- return last_updated == nil || (last_updated + time_to_live) < Time.now
419
+ # maybe this is too aggressive?
420
+ normalized_url.gsub!(/\\/, '/')
421
+ return normalized_url
422
+ else
423
+ if (normalized_url =~ /https?:\/\//) == nil
424
+ normalized_url = "http://" + normalized_url
425
+ end
426
+ if normalized_url == "http://"
427
+ return nil
428
+ end
429
+ begin
430
+ feed_uri = URI.parse(normalized_url)
431
+ if feed_uri.scheme == nil
432
+ feed_uri.scheme = "http"
433
+ end
434
+ if feed_uri.path == nil || feed_uri.path == ""
435
+ feed_uri.path = "/"
436
+ end
437
+ if (feed_uri.path =~ /^[\/]+/) == 0
438
+ feed_uri.path.gsub!(/^[\/]+/, "/")
439
+ end
440
+ return feed_uri.to_s
441
+ rescue URI::InvalidURIError
442
+ return normalized_url
443
+ end
333
444
  end
445
+ end
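
A few mappings that follow from the rewrites above:

  FeedTools.normalize_url("feed://example.com/index.xml")
  # => "http://example.com/index.xml"
  FeedTools.normalize_url("example.com/index.xml")
  # => "http://example.com/index.xml"
  FeedTools.normalize_url("javascript:alert(1)")
  # => "#"
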
334
446
 
335
- # Forces this feed to expire.
336
- def expire
337
- FeedItem.delete_all("feed_id = '#{self.id}'")
338
- @feed_items_unsorted = nil
339
- self.last_updated = Time.mktime(1980)
340
- self.save
447
+ # Returns true if the parameter appears to be a valid url
448
+ def FeedTools.is_url?(url)
449
+ return false if url.nil?
450
+ begin
451
+ uri = URI.parse(url)
452
+ rescue URI::InvalidURIError
453
+ return false
341
454
  end
455
+ return true
456
+ end
342
457
 
343
- # The ammount of time in seconds between the last time the feed was updated and the next
344
- # valid time to retrieve a remote feed.
345
- def time_to_live
346
- return self['time_to_live'].nil? ? 1.hour : self['time_to_live'].hour
347
- end
458
+ # Removes all html tags from the html formatted text.
459
+ def FeedTools.strip_html(html)
460
+ # TODO: do this properly
461
+ # ======================
462
+ stripped_html = html.gsub(/<\/?[^>]+>/, "")
463
+ return stripped_html
464
+ end
348
465
 
349
- def tag_list
350
- return tags.nil? ? nil : tags[1..-2].split("|")
466
+ # Tidies up the html
467
+ def FeedTools.tidy_html(html)
468
+ if FeedTools.tidy_enabled?
469
+ is_fragment = true
470
+ if (html.strip =~ /<html>(.|\n)*<body>/) != nil ||
471
+ (html.strip =~ /<\/body>(.|\n)*<\/html>$/) != nil
472
+ is_fragment = false
473
+ end
474
+ if (html.strip =~ /<\?xml(.|\n)*\?>/) != nil
475
+ is_fragment = false
476
+ end
477
+ tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
478
+ tidy.options.output_xml = true
479
+ tidy.options.indent = false
480
+ tidy.options.wrap_attributes = true
481
+ tidy.options.logical_emphasis = true
482
+ tidy.options.doctype = "omit"
483
+ xml = tidy.clean(html)
484
+ xml
485
+ end
486
+ if is_fragment
487
+ # Tidy puts <html>...<body>[our html]</body>...</html> in.
488
+ # We don't want this.
489
+ tidy_html.strip!
490
+ tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
491
+ tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
492
+ tidy_html.strip!
493
+ end
494
+ else
495
+ tidy_html = html
351
496
  end
497
+ return tidy_html
498
+ end
352
499
 
353
- def tag_list=(new_tag_list)
354
- self.tags = "|" + (new_tag_list.map { |x| x.strip }).join("|") + "|"
355
- end
500
+ # Removes all dangerous html tags from the html formatted text.
501
+ # If mode is set to :escape, dangerous and unknown elements will
502
+ # be escaped. If mode is set to :strip, dangerous and unknown
503
+ # elements and all children will be removed entirely.
504
+ # Dangerous or unknown attributes are always removed.
505
+ def FeedTools.sanitize_html(html, mode=:escape)
506
+
507
+ # Lists borrowed from Mark Pilgrim's feedparser
508
+ acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
509
+ 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
510
+ 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
511
+ 'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
512
+ 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
513
+ 'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
514
+ 'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
515
+ 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
516
+ 'u', 'ul', 'var']
356
517
 
357
- def tag_string
358
- return (tags.nil? ? nil : tags[1..-2]).split("|").join(", ")
359
- end
360
-
361
- def tag_string=(new_tag_string)
362
- self.tags = "|" + (new_tag_string.split(",").map { |x| x.strip }).join("|") + "|"
363
- end
518
+ acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
519
+ 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
520
+ 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
521
+ 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
522
+ 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
523
+ 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
524
+ 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
525
+ 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
526
+ 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
527
+ 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
528
+ 'type', 'usemap', 'valign', 'value', 'vspace', 'width']
364
529
 
365
- # Returns a list of the feed_items, sorted by date
366
- def feed_items
367
- begin
368
- if @feed_items_unsorted.nil?
369
- @feed_items_unsorted = feed_items_unsorted
370
- end
371
- return @feed_items_unsorted.sort do |a,b|
372
- b.time <=> a.time
373
- end
374
- rescue
375
- unless @feed_items_unsorted.nil?
376
- return @feed_items_unsorted
377
- else
378
- return feed_items_unsorted
530
+ # Stupid hack to pass this unit test:
531
+ # http://feedparser.org/tests/wellformed/rss/
532
+ # item_description_not_a_doctype.xml
533
+ html.gsub!(/<!'/, "&lt;!'")
534
+
535
+ # The closer we are to proper xhtml, the more accurate the
536
+ # sanitization will be.
537
+ html = FeedTools.tidy_html(html)
538
+
539
+ # Hackity hack. But it works, and it seems plenty fast enough.
540
+ html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
541
+
542
+ sanitize_node = lambda do |html_node|
543
+ if html_node.respond_to? :children
544
+ for child in html_node.children
545
+ if child.kind_of? REXML::Element
546
+ unless acceptable_elements.include? child.name
547
+ if mode == :strip
548
+ html_node.delete_element(child)
549
+ else
550
+ new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
551
+ html_node.insert_after(child, new_child)
552
+ html_node.delete_element(child)
553
+ end
554
+ end
555
+ for attribute in child.attributes.keys
556
+ unless acceptable_attributes.include? attribute
557
+ child.delete_attribute(attribute)
558
+ end
559
+ end
560
+ end
561
+ sanitize_node.call(child)
379
562
  end
380
563
  end
564
+ html_node
381
565
  end
566
+ sanitize_node.call(html_doc.root)
567
+ return html_doc.root.inner_xml
568
+ end
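
Roughly what the two modes do to a hostile fragment (illustrative input; exact output depends on whether tidy is enabled):

  html = "<p>Hello<script>alert('xss')</script></p>"

  FeedTools.sanitize_html(html, :escape)
  # => "<p>Hello&lt;script&gt;alert('xss')&lt;/script&gt;</p>" (approximately)
  FeedTools.sanitize_html(html, :strip)
  # => "<p>Hello</p>"
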
569
+
570
+ class Feed
571
+ include REXML
572
+ include AttributeDictionary
382
573
 
383
- # Attempts to load the feed from the remote location. Requires the url to be set.
384
- # If an etag has been set, attempts to use it to prevent unnecessary reloading of identical
385
- # content.
386
- def load_remote_feed
387
- @live = true
388
- self.last_updated = Time.now
389
- if (etag != nil)
390
- # TODO: verify that the etag code works as intended
391
- # -> may need to check what gets returned when the
392
- # etag is matched
393
- # =================================================
394
- open(url, "If-None-Match" => @etag ) do |http|
395
- etag = http.meta['etag']
396
- parse_feed(http.read)
397
- end
574
+ # Loads the feed specified by the url, pulling the data from the cache if it hasn't expired.
575
+ def Feed.open(url)
576
+ # clean up the url
577
+ url = FeedTools.normalize_url(url)
578
+
579
+ # create and load the new feed
580
+ feed = Feed.new
581
+ feed.url = url
582
+ feed.update
583
+ return feed
584
+ end
585
+
586
+ # Loads the feed from the remote url if the feed has expired from the cache or cannot be
587
+ # retrieved from the cache for some reason.
588
+ def update
589
+ if self.http_headers.nil? && !(self.cache_object.nil?) &&
590
+ !(self.cache_object.http_headers.nil?)
591
+ @http_headers = YAML.load(self.cache_object.http_headers)
592
+ end
593
+ if expired?
594
+ load_remote_feed
398
595
  else
399
- open(url) do |http|
400
- etag = http.meta['etag']
401
- parse_feed(http.read)
402
- end
596
+ @live = false
403
597
  end
404
598
  end
405
-
406
- def parse_feed_hook(feed_data)
407
- return nil
408
- end
409
-
410
- def parse_feed(feed_data)
411
- root_node = Document.new(feed_data).root
412
- metadata_node = XPath.first(root_node, "channel")
413
- if metadata_node == nil
414
- metadata_node = root_node
415
- end
416
599
 
417
- # get the feed title
418
- title = XPath.first(metadata_node, "title/text()").to_s
600
+ # Attempts to load the feed from the remote location. Requires the url
601
+ # field to be set. If an etag or the last_modified date has been set,
602
+ # attempts to use them to prevent unnecessary reloading of identical
603
+ # content.
604
+ def load_remote_feed
605
+ @live = true
606
+ if self.http_headers.nil? && !(self.cache_object.nil?) &&
607
+ !(self.cache_object.http_headers.nil?)
608
+ @http_headers = YAML.load(self.cache_object.http_headers)
609
+ end
419
610
 
420
- # is the title escaped?
421
- if XPath.first(metadata_node, "title/@mode").to_s == "escaped"
422
- title = CGI.unescapeHTML(title)
611
+ if (self.url =~ /^feed:/) == 0
612
+ # Woah, Nelly, how'd that happen? You should've already been
613
+ # corrected. So let's fix that url. And please,
614
+ # just use less crappy browsers instead of badly defined
615
+ # pseudo-protocol hacks.
616
+ self.url = FeedTools.normalize_url(self.url)
423
617
  end
424
-
425
- # get the feed link
426
- link = XPath.first(metadata_node, "link[@rel='alternate' @type='text/html']/@href").to_s
427
- if link == ""
428
- link = XPath.first(metadata_node, "link[@rel='alternate']/@href").to_s
429
- end
430
- if link == ""
431
- link = XPath.first(metadata_node, "link/@href").to_s
432
- end
433
- if link == ""
434
- link = XPath.first(metadata_node, "link/text()").to_s
435
- end
436
- if link == ""
437
- # The ordering here is somewhat incorrect, but the more correct ordering would
438
- # introduce much more serious problems, so I've chosen to go with the lesser of two
439
- # evils. (The completely correct implementation would require a vestigial 'base' method
440
- # on the Feed class to fully support CDF files. This method will support almost all CDF
441
- # files without any unnecessary methods.) But given that this only exists to support
442
- # CDF files, it's not a big deal. It's not like CDF files really exist in the wild.
443
- # (The assumption this ordering makes is that the 'base' attribute points to a valid
444
- # location, hopefully the same as the 'href' location. Chances are pretty good that this
445
- # is true.)
446
- link = XPath.first(metadata_node, "@base").to_s
447
- end
448
- if link == ""
449
- link = XPath.first(metadata_node, "@href").to_s
618
+
619
+ # Find out what method we're going to be using to obtain this feed.
620
+ uri = URI.parse(self.url)
621
+ retrieval_method = "http"
622
+ case uri.scheme
623
+ when "http"
624
+ retrieval_method = "http"
625
+ when "ftp"
626
+ retrieval_method = "ftp"
627
+ when "file"
628
+ retrieval_method = "file"
629
+ when nil
630
+ raise FeedAccessError,
631
+ "No protocol was specified in the url."
632
+ else
633
+ raise FeedAccessError,
634
+ "Cannot retrieve feed using unrecognized protocol: " + uri.scheme
450
635
  end
451
636
 
452
- # get the feed description
453
- description = XPath.first(metadata_node, "description/text()").to_s
454
- if description != ""
455
- if XPath.first(metadata_node, "description/@encoding").to_s != ""
456
- description = "[Embedded data objects are not supported.]"
457
- else
458
- description = CGI.unescapeHTML(description)
637
+ # No need for http headers unless we're actually doing http
638
+ if retrieval_method == "http"
639
+ # Set up the appropriate http headers
640
+ headers = {}
641
+ unless self.http_headers.nil?
642
+ headers["If-None-Match"] =
643
+ self.http_headers['etag'] unless self.http_headers['etag'].nil?
644
+ headers["If-Modified-Since"] =
645
+ self.http_headers['last-modified'] unless
646
+ self.http_headers['last-modified'].nil?
647
+ end
648
+ headers["User-Agent"] =
649
+ FeedTools.user_agent unless FeedTools.user_agent.nil?
650
+
651
+ # The http feed access method
652
+ def http_fetch(feed_url, http_headers, redirect_limit = 10,
653
+ response_chain = []) # :nodoc:
654
+ raise FeedAccessError, 'Redirect too deep' if redirect_limit == 0
655
+ feed_uri = nil
656
+ begin
657
+ feed_uri = URI.parse(feed_url)
658
+ rescue URI::InvalidURIError
659
+ # Uh, maybe try to fix it?
660
+ feed_uri = URI.parse(FeedTools.normalize_url(feed_url))
661
+ end
662
+
663
+ # Borrowed from open-uri:
664
+ # According to RFC2616 14.23, Host: request-header field should be
665
+ # set to an origin server.
666
+ # But net/http wrongly set a proxy server if an absolute URI is
667
+ # specified as a request URI.
668
+ # So override it here explicitly.
669
+ http_headers['Host'] = feed_uri.host
670
+ http_headers['Host'] += ":#{feed_uri.port}" if feed_uri.port
671
+
672
+ Net::HTTP.start(feed_uri.host, (feed_uri.port or 80)) do |http|
673
+ response = http.request_get(feed_uri.path, http_headers)
674
+
675
+ case response
676
+ when Net::HTTPSuccess
677
+ # We've reached the final destination, process all previous
678
+ # redirections, and see if we need to update the url.
679
+ for redirected_response in response_chain
680
+ if redirected_response.last.code.to_i == 301
681
+ self.url = redirected_response.first
682
+ else
683
+ # Jump out as soon as we hit anything that isn't a
684
+ # permanently moved redirection.
685
+ break
686
+ end
687
+ end
688
+ return response
689
+ when Net::HTTPRedirection
690
+ if response.code.to_i == 304
691
+ response.error!
692
+ else
693
+ if response['Location'].nil?
694
+ raise FeedAccessError,
695
+ "No location to redirect to supplied: " + response.code
696
+ end
697
+ response_chain << [feed_url, response]
698
+ new_location = response['location']
699
+ if response_chain.assoc(new_location) != nil
700
+ raise FeedAccessError, "Redirection loop detected."
701
+ end
702
+ # TODO: deal with stupid people using relative urls
703
+ # in Location header
704
+ # =================================================
705
+ http_fetch(new_location, http_headers,
706
+ redirect_limit - 1, response_chain)
707
+ end
708
+ else
709
+ response.error!
710
+ end
711
+ end
712
+ end
713
+
714
+ begin
715
+ @http_response = http_fetch(self.url, headers)
716
+ @http_headers = {}
717
+ self.http_response.each_header do |header|
718
+ self.http_headers[header.first.downcase] = header.last
719
+ end
720
+ self.last_retrieved = Time.now
721
+ self.xml_data = self.http_response.body
722
+ rescue FeedAccessError
723
+ @live = false
724
+ if self.xml_data.nil?
725
+ raise
726
+ end
727
+ rescue Timeout::Error
728
+ # if we time out, do nothing, it should fall back to the xml_data
729
+ # stored in the cache.
730
+ @live = false
731
+ if self.xml_data.nil?
732
+ raise
733
+ end
734
+ rescue Errno::ECONNRESET
735
+ # if the connection gets reset by peer, oh well, fall back to the
736
+ # xml_data stored in the cache
737
+ @live = false
738
+ if self.xml_data.nil?
739
+ raise
740
+ end
741
+ rescue => error
742
+ # heck, if anything at all bad happens, fall back to the xml_data
743
+ # stored in the cache.
744
+
745
+ # If we can, get the HTTPResponse...
746
+ @http_response = nil
747
+ if error.respond_to?(:each_header)
748
+ @http_response = error
749
+ end
750
+ if error.respond_to?(:response) &&
751
+ error.response.respond_to?(:each_header)
752
+ @http_response = error.response
753
+ end
754
+ if @http_response != nil
755
+ @http_headers = {}
756
+ self.http_response.each_header do |header|
757
+ self.http_headers[header.first] = header.last
758
+ end
759
+ if self.http_response.code.to_i == 304
760
+ self.last_retrieved = Time.now
761
+ end
762
+ end
763
+ @live = false
764
+ if self.xml_data.nil?
765
+ raise
766
+ end
767
+ end
768
+ elsif retrieval_method == "https"
769
+ # Not supported... yet
770
+ elsif retrieval_method == "ftp"
771
+ # Not supported... yet
772
+ # Technically, CDF feeds are supposed to be able to be accessed directly
773
+ # from an ftp server. This is silly, but we'll humor Microsoft.
774
+ #
775
+ # Eventually.
776
+ elsif retrieval_method == "file"
777
+ # Now that we've gone to all that trouble to ensure the url begins
778
+ # with 'file://', strip the 'file://' off the front of the url.
779
+ file_name = self.url.gsub(/^file:\/\//, "")
780
+ begin
781
+ open(file_name) do |file|
782
+ @http_response = nil
783
+ @http_headers = {}
784
+ self.last_retrieved = Time.now
785
+ self.xml_data = file.read
786
+ end
787
+ rescue
788
+ @live = false
789
+ # In this case, pulling from the cache is probably not going
790
+ # to help at all, and the user should probably be immediately
791
+ # apprised of the problem. Raise the exception.
792
+ raise
459
793
  end
460
794
  end
461
- if description == ""
462
- description = XPath.first(metadata_node, "tagline/text()").to_s
463
- if description != "" && XPath.first(metadata_node, "tagline/@mode").to_s == "escaped"
464
- description = CGI.unescapeHTML(description)
795
+ unless self.cache_object.nil?
796
+ begin
797
+ self.save
798
+ rescue
465
799
  end
466
800
  end
467
- if description == "" && XPath.first(metadata_node, "tagline") == nil
468
- description = XPath.first(metadata_node, "info/text()").to_s
469
- if description != "" && XPath.first(metadata_node, "info/@mode").to_s == "escaped"
470
- description = CGI.unescapeHTML(description)
801
+ end
802
+
803
+ # Returns the relevant information from an http request.
804
+ def http_response
805
+ return @http_response
806
+ end
807
+
808
+ # Returns a hash of the http headers from the response.
809
+ def http_headers
810
+ return @http_headers
811
+ end
812
+
813
+ # Returns the feed's raw xml data.
814
+ def xml_data
815
+ if @xml_data.nil?
816
+ unless self.cache_object.nil?
817
+ @xml_data = self.cache_object.xml_data
471
818
  end
472
819
  end
473
- if description == ""
474
- description = CGI.unescapeHTML(XPath.first(metadata_node, "abstract/text()").to_s)
820
+ return @xml_data
821
+ end
822
+
823
+ # Sets the feed's xml data.
824
+ def xml_data=(new_xml_data)
825
+ @xml_data = new_xml_data
826
+ unless self.cache_object.nil?
827
+ self.cache_object.xml_data = new_xml_data
475
828
  end
829
+ end
476
830
 
477
- # get the image link
478
- image_link = XPath.first(metadata_node, "image/url/text()").to_s
479
- if image_link == ""
480
- image_link = XPath.first(metadata_node, "image/@rdf:resource").to_s
831
+ # Returns a REXML Document of the xml_data
832
+ def xml
833
+ if @xml_doc.nil?
834
+ begin
835
+ @xml_doc = Document.new(xml_data)
836
+ rescue
837
+ # Something failed, attempt to repair the xml with htree.
838
+ @xml_doc = HTree.parse(xml_data).to_rexml
839
+ end
481
840
  end
482
- if image_link == ""
483
- image_link = XPath.first(metadata_node, "link[@type='image/jpeg']/@href").to_s
841
+ return @xml_doc
842
+ end
843
+
844
+ # Returns the first node within the channel_node that matches the xpath query.
845
+ def find_node(xpath)
846
+ return XPath.first(channel_node, xpath)
847
+ end
848
+
849
+ # Returns all nodes within the channel_node that match the xpath query.
850
+ def find_all_nodes(xpath)
851
+ return XPath.match(channel_node, xpath)
852
+ end
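
And the custom-parsing hook from the CHANGELOG in use (dc:creator is just an assumed extension element):

  feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')

  # first match, relative to the channel element
  feed.find_node("dc:creator/text()").to_s

  # every match; items sit under <channel> in RSS 2.0
  feed.find_all_nodes("item/title/text()").map { |node| node.to_s }
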
853
+
854
+ # Returns the root node of the feed.
855
+ def root_node
856
+ if @root_node.nil?
857
+ @root_node = xml.root
484
858
  end
485
- if image_link == ""
486
- image_link = XPath.first(metadata_node, "link[@type='image/gif']/@href").to_s
859
+ return @root_node
860
+ end
861
+
862
+ # Returns the channel node of the feed.
863
+ def channel_node
864
+ if @channel_node.nil?
865
+ @channel_node = XPath.first(root_node, "channel")
866
+ if @channel_node == nil
867
+ @channel_node = XPath.first(root_node, "feedinfo")
868
+ end
869
+ if @channel_node == nil
870
+ @channel_node = root_node
871
+ end
487
872
  end
488
- if image_link == ""
489
- image_link = XPath.first(metadata_node, "link[@type='image/png']/@href").to_s
873
+ return @channel_node
874
+ end
875
+
876
+ # The cache object that handles the feed persistence.
877
+ def cache_object
878
+ unless FeedTools.feed_cache.nil?
879
+ if @cache_object.nil?
880
+ begin
881
+ if @id != nil
882
+ @cache_object = FeedTools.feed_cache.find_by_id(@id)
883
+ elsif @url != nil
884
+ @cache_object = FeedTools.feed_cache.find_by_url(@url)
885
+ end
886
+ if @cache_object.nil?
887
+ @cache_object = FeedTools.feed_cache.new
888
+ end
889
+ rescue
890
+ end
891
+ end
490
892
  end
491
- if image_link == ""
492
- image_link = XPath.first(metadata_node, "logo[@style='image']/@href").to_s
893
+ return @cache_object
894
+ end
895
+
896
+ # Sets the cache object for this feed.
897
+ #
898
+ # This can be any object, but it must accept the following messages:
899
+ # url
900
+ # url=
901
+ # title
902
+ # title=
903
+ # link
904
+ # link=
905
+ # xml_data
906
+ # xml_data=
907
+ # etag
908
+ # etag=
909
+ # last_modified
910
+ # last_modified=
911
+ # save
912
+ def cache_object=(new_cache_object)
913
+ @cache_object = new_cache_object
914
+ end
915
+
916
+ # Returns the feed's unique id
917
+ def id
918
+ if @id.nil?
919
+ @id = XPath.first(root_node, "id/text()").to_s
920
+ if @id == ""
921
+ @id = XPath.first(root_node, "guid/text()").to_s
922
+ end
923
+ @id = nil if @id == ""
493
924
  end
494
- if image_link == ""
495
- image_link = XPath.first(metadata_node, "logo/@href").to_s
925
+ return @id
926
+ end
927
+
928
+ # Sets the feed's unique id
929
+ def id=(new_id)
930
+ @id = new_id
931
+ end
932
+
933
+ # Returns the feed url.
934
+ def url
935
+ if @url.nil? && self.xml_data != nil
936
+ @url = XPath.first(channel_node, "link[@rel='self']/@href").to_s
937
+ @url = nil if @url == ""
496
938
  end
939
+ return @url
940
+ end
497
941
 
498
- # get the feed time to live (expressed in hours)
499
- feed_time_to_live = nil
500
- update_frequency = XPath.first(metadata_node, "syn:updateFrequency/text()").to_s
501
- if update_frequency != ""
502
- update_period = XPath.first(metadata_node, "syn:updatePeriod/text()").to_s
503
- if update_period == "daily"
504
- feed_time_to_live = update_frequency.to_i * 24
505
- elsif update_period == "weekly"
506
- feed_time_to_live = update_frequency.to_i * 24 * 7
507
- elsif update_period == "monthly"
508
- feed_time_to_live = update_frequency.to_i * 24 * 30
509
- elsif update_period == "yearly"
510
- feed_time_to_live = update_frequency.to_i * 24 * 365
942
+ # Sets the feed url and prepares the cache_object if necessary.
943
+ def url=(new_url)
944
+ @url = FeedTools.normalize_url(new_url)
945
+ self.cache_object.url = new_url unless self.cache_object.nil?
946
+ end
947
+
948
+ # Returns the feed title
949
+ def title
950
+ if @title.nil?
951
+ if XPath.first(channel_node, "title/@type").to_s == "xhtml" ||
952
+ XPath.first(channel_node, "title/@mode").to_s == "xhtml"
953
+ @title = XPath.first(channel_node, "title").inner_xml
954
+ elsif XPath.first(channel_node, "title/@type").to_s == "escaped" ||
955
+ XPath.first(channel_node, "title/@mode").to_s == "escaped"
956
+ @title = CGI.unescapeHTML(
957
+ XPath.first(channel_node, "title/text()").to_s)
511
958
  else
512
- # hourly
513
- feed_time_to_live = update_frequency.to_i
959
+ @title = CGI.unescapeHTML(
960
+ XPath.first(channel_node, "title/text()").to_s)
514
961
  end
515
- end
516
- if feed_time_to_live == nil
517
- # expressed in minutes
518
- update_frequency = XPath.first(metadata_node, "ttl/text()").to_s
519
- if update_frequency != ""
520
- feed_time_to_live = (update_frequency.to_i / 60)
962
+ unless @title.nil?
963
+ @title = CGI.unescapeHTML(FeedTools.sanitize_html(@title, :strip))
521
964
  end
965
+ if @title != "" && !(@title.nil?)
966
+ @title = FeedTools.strip_html(@title).strip
967
+ end
968
+ @title.gsub!(/\n/, " ")
969
+ @title = nil if @title == ""
970
+ self.cache_object.title = @title unless self.cache_object.nil?
522
971
  end
523
-
524
- # TODO: handle time_to_live for CDF files
525
- # =======================================
526
-
527
- # get the feed items
528
- items = XPath.match(root_node, "item")
529
- if items == nil || items == []
530
- items = XPath.match(metadata_node, "item")
972
+ return @title
973
+ end
974
+
975
+ # Sets the feed title
976
+ def title=(new_title)
977
+ @title = new_title
978
+ self.cache_object.title = new_title unless self.cache_object.nil?
979
+ end
980
+
981
+ # Returns the feed description
982
+ def description
983
+ if @description.nil?
984
+ # get the feed description from the xml document
985
+ @description = XPath.first(channel_node, "description/text()").to_s
986
+ if @description != ""
987
+ if XPath.first(channel_node, "description/@encoding").to_s != ""
988
+ @description = "[Embedded data objects are not supported.]"
989
+ else
990
+ @description = CGI.unescapeHTML(description)
991
+ end
992
+ end
993
+ if @description == ""
994
+ @description = XPath.first(channel_node, "subtitle/text()").to_s
995
+ if @description != "" &&
996
+ XPath.first(channel_node, "subtitle/@mode").to_s == "escaped"
997
+ @description = CGI.unescapeHTML(description)
998
+ end
999
+ end
1000
+ if @description == ""
1001
+ @description = XPath.first(channel_node, "tagline/text()").to_s
1002
+ if @description != "" &&
1003
+ XPath.first(channel_node, "tagline/@mode").to_s == "escaped"
1004
+ @description = CGI.unescapeHTML(description)
1005
+ end
1006
+ end
1007
+ if @description == "" && XPath.first(channel_node, "tagline") == nil
1008
+ @description = XPath.first(channel_node, "info/text()").to_s
1009
+ if @description != "" &&
1010
+ XPath.first(channel_node, "info/@mode").to_s == "escaped"
1011
+ @description = CGI.unescapeHTML(description)
1012
+ end
1013
+ end
1014
+ if @description == ""
1015
+ @description = CGI.unescapeHTML(
1016
+ XPath.first(channel_node, "abstract/text()").to_s)
1017
+ end
1018
+ if @description == ""
1019
+ @description = CGI.unescapeHTML(
1020
+ XPath.first(channel_node, "summary/text()").to_s)
1021
+ end
1022
+ if @description == ""
1023
+ # I don't think this is valid for anyone to do, but this is probably
1024
+ # what they meant if they do it.
1025
+ @description = CGI.unescapeHTML(
1026
+ XPath.first(channel_node, "content:encoded/text()").to_s)
1027
+ if @description != ""
1028
+ @bozo = true
1029
+ end
1030
+ end
1031
+ if @description == ""
1032
+ begin
1033
+ @description = XPath.first(channel_node, "description").inner_xml
1034
+ rescue
1035
+ end
1036
+ end
1037
+ if @description == ""
1038
+ @description = self.itunes_summary
1039
+ @description = "" if @description.nil?
1040
+ end
1041
+ if @description == ""
1042
+ @description = self.itunes_subtitle
1043
+ @description = "" if @description.nil?
1044
+ end
1045
+
1046
+ @description =
1047
+ FeedTools.sanitize_html(@description) unless @description.nil?
1048
+ # If it started with a bunch of divs, hack them right off. We can put
1049
+ # them back later if they're needed.
1050
+ @description.gsub!(/^(<div[^>]*>)*/, "")
1051
+ @description.gsub!(/(<\/div>)*$/, "")
1052
+
1053
+ @description.gsub!(/\n/, " ") if @description.size < 80
1054
+ @description = @description.strip unless @description.nil?
1055
+ @description = nil if @description == ""
1056
+ end
1057
+ return @description
1058
+ end
1059
+
1060
+ # Sets the feed description
1061
+ def description=(new_description)
1062
+ @description = new_description
1063
+ end
1064
+
1065
+ # Returns the contents of the itunes:summary element
1066
+ def itunes_summary
1067
+ if @itunes_summary.nil?
1068
+ @itunes_summary = CGI.unescapeHTML(XPath.first(root_node,
1069
+ "itunes:summary/text()").to_s)
1070
+ if @itunes_summary == ""
1071
+ @itunes_summary = nil
1072
+ end
1073
+ @itunes_summary =
1074
+ FeedTools.sanitize_html(@itunes_summary) unless @itunes_summary.nil?
1075
+ end
1076
+ return @itunes_summary
1077
+ end
1078
+
1079
+ # Sets the contents of the itunes:summary element
1080
+ def itunes_summary=(new_itunes_summary)
1081
+ @itunes_summary = new_itunes_summary
1082
+ end
1083
+
1084
+ # Returns the contents of the itunes:subtitle element
1085
+ def itunes_subtitle
1086
+ if @itunes_subtitle.nil?
1087
+ @itunes_subtitle = CGI.unescapeHTML(XPath.first(root_node,
1088
+ "itunes:subtitle/text()").to_s)
1089
+ if @itunes_subtitle == ""
1090
+ @itunes_subtitle = nil
1091
+ end
1092
+ unless @itunes_subtitle.nil?
1093
+ @itunes_subtitle = FeedTools.sanitize_html(@itunes_subtitle)
1094
+ end
1095
+ end
1096
+ return @itunes_subtitle
1097
+ end
1098
+
1099
+ # Sets the contents of the itunes:subtitle element
1100
+ def itunes_subtitle=(new_itunes_subtitle)
1101
+ @itunes_subtitle = new_itunes_subtitle
1102
+ end
1103
+
1104
+ # Returns the feed link
1105
+ def link
1106
+ if @link.nil?
1107
+ # get the feed link from the xml document
1108
+ @link = XPath.first(channel_node, "link[@rel='alternate' @type='text/html']/@href").to_s
1109
+ if @link == ""
1110
+ @link = XPath.first(channel_node, "link[@rel='alternate']/@href").to_s
1111
+ end
1112
+ if @link == ""
1113
+ @link = XPath.first(channel_node, "link/@href").to_s
1114
+ end
1115
+ if @link == ""
1116
+ @link = XPath.first(channel_node, "link/text()").to_s
1117
+ end
1118
+ if @link == ""
1119
+ @link = XPath.first(channel_node, "@href").to_s
1120
+ end
1121
+ if @link == ""
1122
+ if FeedTools.is_url? self.guid
1123
+ @link = self.guid
1124
+ end
1125
+ end
1126
+ if @link == ""
1127
+ # Technically, we shouldn't use the base attribute for this, but if the href attribute
1128
+ # is missing, it's already a given that we're looking at a messed up CDF file. We can
1129
+ # always pray it's correct.
1130
+ @link = XPath.first(channel_node, "@base").to_s
1131
+ end
1132
+ @link = FeedTools.normalize_url(@link)
1133
+ unless self.cache_object.nil?
1134
+ self.cache_object.link = @link
1135
+ end
1136
+ end
1137
+ return @link
1138
+ end
1139
+
1140
+ # Sets the feed link
1141
+ def link=(new_link)
1142
+ @link = new_link
1143
+ unless self.cache_object.nil?
1144
+ self.cache_object.link = new_link
1145
+ end
1146
+ end
1147
+
1148
+ # Returns the feed image link
1149
+ def image_link
1150
+ if @image_link.nil?
1151
+ # get the feed image link from the xml document
1152
+ @image_link = XPath.first(channel_node, "image/url/text()").to_s
1153
+ if @image_link == ""
1154
+ @image_link = XPath.first(channel_node, "image/@rdf:resource").to_s
1155
+ end
1156
+ if @image_link == ""
1157
+ @image_link = XPath.first(channel_node, "link[@type='image/jpeg']/@href").to_s
1158
+ end
1159
+ if @image_link == ""
1160
+ @image_link = XPath.first(channel_node, "link[@type='image/gif']/@href").to_s
1161
+ end
1162
+ if @image_link == ""
1163
+ @image_link = XPath.first(channel_node, "link[@type='image/png']/@href").to_s
1164
+ end
1165
+ if @image_link == ""
1166
+ @image_link = XPath.first(channel_node, "logo[@style='image']/@href").to_s
1167
+ end
1168
+ if @image_link == ""
1169
+ @image_link = XPath.first(channel_node, "logo/@href").to_s
1170
+ end
1171
+ @image_link = FeedTools.normalize_url(@image_link)
1172
+ end
1173
+ return @image_link
1174
+ end
1175
+
1176
+ # Sets the feed image link
1177
+ def image_link=(new_image_link)
1178
+ @image_link = new_image_link
1179
+ end
1180
+
1181
+ # Returns the url to the icon file for this feed.
1182
+ #
1183
+ # This method uses the url from the link field in order to avoid grabbing
1184
+ # the favicon for services like feedburner.
1185
+ def icon_link
1186
+ if @icon_link.nil?
1187
+ @icon_link = XPath.first(channel_node,
1188
+ "link[@rel='icon']/@href").to_s
1189
+ if @icon_link == ""
1190
+ @icon_link = XPath.first(channel_node,
1191
+ "link[@rel='shortcut icon']/@href").to_s
1192
+ end
1193
+ if @icon_link == ""
1194
+ @icon_link = XPath.first(channel_node,
1195
+ "link[@type='image/x-icon']/@href").to_s
1196
+ end
1197
+ if @icon_link == ""
1198
+ @icon_link = XPath.first(channel_node,
1199
+ "icon/@href").to_s
1200
+ end
1201
+ if @icon_link == ""
1202
+ @icon_link = XPath.first(channel_node,
1203
+ "icon/text()").to_s
1204
+ end
1205
+ if @icon_link == ""
1206
+ link_uri = URI.parse(FeedTools.normalize_url(self.link))
1207
+ @icon_link =
1208
+ link_uri.scheme + "://" + link_uri.host + "/favicon.ico"
1209
+ end
1210
+ end
1211
+ return @icon_link
1212
+ end
1213
+
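Editorial usage sketch (not part of the shipped source; the URL and host are hypothetical): when a feed declares no icon, shortcut icon, or icon element, icon_link is derived from the host of the feed's link field.

    feed = FeedTools::Feed.open("http://example.com/index.xml")
    feed.icon_link  # => "http://example.com/favicon.ico" when no icon links are present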
1214
+ # Returns the number of seconds before the feed should expire
1215
+ def time_to_live
1216
+ if @time_to_live.nil?
1217
+ # get the feed time to live from the xml document
1218
+ update_frequency = XPath.first(channel_node, "syn:updateFrequency/text()").to_s
1219
+ if update_frequency != ""
1220
+ update_period = XPath.first(channel_node, "syn:updatePeriod/text()").to_s
1221
+ if update_period == "daily"
1222
+ @time_to_live = update_frequency.to_i * 24
1223
+ elsif update_period == "weekly"
1224
+ @time_to_live = update_frequency.to_i * 24 * 7
1225
+ elsif update_period == "monthly"
1226
+ @time_to_live = update_frequency.to_i * 24 * 30
1227
+ elsif update_period == "yearly"
1228
+ @time_to_live = update_frequency.to_i * 24 * 365
1229
+ else
1230
+ # hourly
1231
+ @time_to_live = update_frequency.to_i
1232
+ end
1233
+ end
1234
+ end
1235
+ if @time_to_live.nil?
1236
+ # expressed in minutes
1237
+ update_frequency = XPath.first(channel_node, "ttl/text()").to_s
1238
+ if update_frequency != ""
1239
+ @time_to_live = (update_frequency.to_i / 60)
1240
+ end
1241
+ end
1242
+ if @time_to_live.nil?
1243
+ @time_to_live = 0
1244
+ update_frequency_days = XPath.first(channel_node, "schedule/intervaltime/@days").to_s
1245
+ update_frequency_hours = XPath.first(channel_node, "schedule/intervaltime/@hour").to_s
1246
+ update_frequency_minutes = XPath.first(channel_node, "schedule/intervaltime/@min").to_s
1247
+ update_frequency_seconds = XPath.first(channel_node, "schedule/intervaltime/@sec").to_s
1248
+ if update_frequency_days != ""
1249
+ @time_to_live = @time_to_live + update_frequency_days.to_i * 24
1250
+ end
1251
+ if update_frequency_hours != ""
1252
+ @time_to_live = @time_to_live + update_frequency_hours.to_i * 1
1253
+ end
1254
+ if update_frequency_minutes != ""
1255
+ @time_to_live = @time_to_live + update_frequency_minutes.to_i / 60
1256
+ end
1257
+ if update_frequency_seconds != ""
1258
+ @time_to_live = @time_to_live + update_frequency_seconds.to_i / 3600
1259
+ end
1260
+ if @time_to_live == 0
1261
+ @time_to_live = nil
1262
+ end
1263
+ end
1264
+ if @time_to_live.nil? || @time_to_live == 0
1265
+ # Default to one hour
1266
+ @time_to_live = 1
1267
+ end
1268
+ @time_to_live = @time_to_live.round
1269
+ return @time_to_live.hour
1270
+ end
1271
+
1272
+ # Sets the feed time to live
1273
+ def time_to_live=(new_time_to_live)
1274
+ @time_to_live = (new_time_to_live / 3600).round
1275
+ @time_to_live = 1 if @time_to_live < 1
1276
+ end
1277
+
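Editorial sketch of the unit handling (assumes the usual FeedTools::Feed.open entry point): the reader normalizes the various syn/ttl/schedule elements to seconds via ActiveSupport durations, and the writer expects seconds as well.

    feed.time_to_live          # => 3600 for an hourly feed
    feed.time_to_live = 7200   # cache for two hours
    feed.ttl                   # same value via the alias defined further down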
1278
+ # Returns the feed language
1279
+ def language
1280
+ if @language.nil?
1281
+ @language = XPath.first(channel_node, "language/text()").to_s
1282
+ if @language == ""
1283
+ @language = XPath.first(channel_node, "dc:language/text()").to_s
1284
+ end
1285
+ if @language == ""
1286
+ @language = XPath.first(channel_node, "xml:lang/text()").to_s
1287
+ end
1288
+ if @language == ""
1289
+ @language = XPath.first(root_node, "xml:lang/text()").to_s
1290
+ end
1291
+ if @language == ""
1292
+ @language = "en-us"
1293
+ end
1294
+ @language = @language.downcase
531
1295
  end
532
- if items == nil || items == []
533
- items = XPath.match(metadata_node, "entry")
1296
+ return @language
1297
+ end
1298
+
1299
+ # Sets the feed language
1300
+ def language=(new_language)
1301
+ @language = new_language
1302
+ end
1303
+
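For example (editorial sketch), a feed that declares no language or dc:language element at all reports the downcased default:

    feed.language  # => "en-us"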
1304
+ # Returns true if this feed contains explicit material.
1305
+ def explicit
1306
+ if @explicit.nil?
1307
+ if XPath.first(channel_node,
1308
+ "media:adult/text()").to_s.downcase == "true" ||
1309
+ XPath.first(channel_node,
1310
+ "itunes:explicit/text()").to_s.downcase == "yes" ||
1311
+ XPath.first(channel_node,
1312
+ "itunes:explicit/text()").to_s.downcase == "true"
1313
+ @explicit = true
1314
+ else
1315
+ @explicit = false
1316
+ end
1317
+ end
1318
+ return @explicit
1319
+ end
1320
+
1321
+ # Sets whether or not the feed contains explicit material
1322
+ def explicit=(new_explicit)
1323
+ @explicit = (new_explicit ? true : false)
1324
+ end
1325
+
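Editorial sketch: a channel carrying <itunes:explicit>yes</itunes:explicit> or <media:adult>true</media:adult> is reported as explicit, and the writer coerces its argument to a strict boolean.

    feed.explicit         # => true for such a channel
    feed.explicit = nil   # stored as false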
1326
+ # Returns the feed items
1327
+ def items
1328
+ if @items.nil?
1329
+ raw_items = XPath.match(root_node, "item")
1330
+ if raw_items == nil || raw_items == []
1331
+ raw_items = XPath.match(channel_node, "item")
1332
+ end
1333
+ if raw_items == nil || raw_items == []
1334
+ raw_items = XPath.match(channel_node, "entry")
1335
+ end
1336
+
1337
+ # create the individual feed items
1338
+ @items = []
1339
+ if raw_items != nil
1340
+ for item_node in raw_items
1341
+ new_item = FeedItem.new
1342
+ new_item.xml_data = item_node.to_s
1343
+ new_item.feed = self
1344
+ @items << new_item
1345
+ end
1346
+ end
534
1347
  end
535
1348
 
536
- # set all of the properties
537
- if title != ""
538
- self.title = title
539
- else
540
- self.title = nil
1349
+ # Sort the items
1350
+ @items = @items.sort do |a,b|
1351
+ (b.time or Time.mktime(1970)) <=> (a.time or Time.mktime(1970))
541
1352
  end
542
- if link != ""
543
- self.link = link
544
- else
545
- self.link = nil
1353
+ return @items
1354
+ end
1355
+
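Editorial sketch (hypothetical URL, assuming the usual FeedTools::Feed.open entry point): items come back sorted newest first, and each FeedItem also exposes find_node and find_all_nodes for custom parsing of elements the library does not map.

    feed = FeedTools::Feed.open("http://example.com/index.xml")
    feed.items.each do |item|
      puts "#{item.time}  #{item.title}  #{item.link}"
      rights = item.find_node("dc:rights/text()").to_s  # arbitrary xpath against the entry
    end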
1356
+ # The time that the feed was last requested from the remote server. Nil if it has
1357
+ # never been pulled, or if it was created from scratch.
1358
+ def last_retrieved
1359
+ unless self.cache_object.nil?
1360
+ @last_retrieved = self.cache_object.last_retrieved
546
1361
  end
547
- if description != ""
548
- self.description = description
549
- else
550
- self.description = nil
1362
+ return @last_retrieved
1363
+ end
1364
+
1365
+ # Sets the time that the feed was last retrieved from the remote server.
1366
+ def last_retrieved=(new_last_retrieved)
1367
+ @last_retrieved = new_last_retrieved
1368
+ unless self.cache_object.nil?
1369
+ self.cache_object.last_retrieved = new_last_retrieved
551
1370
  end
552
- if image_link != ""
553
- self.image_link = image_link
554
- else
555
- self.image_link = nil
1371
+ end
1372
+
1373
+ # True if this feed contains audio content enclosures
1374
+ def podcast?
1375
+ podcast = false
1376
+ self.items.each do |item|
1377
+ item.enclosures.each do |enclosure|
1378
+ podcast = true if enclosure.audio?
1379
+ end
556
1380
  end
557
- if feed_time_to_live != nil
558
- self.time_to_live = feed_time_to_live
1381
+ return podcast
1382
+ end
1383
+
1384
+ # True if this feed contains video content enclosures
1385
+ def vidlog?
1386
+ vidlog = false
1387
+ self.items.each do |item|
1388
+ item.enclosures.each do |enclosure|
1389
+ vidlog = true if enclosure.video?
1390
+ end
1391
+ end
1392
+ return vidlog
1393
+ end
1394
+
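Editorial sketch: both predicates simply scan the enclosures of every item in the feed.

    feed.podcast?  # => true if any enclosure carries an audio MIME type or file extension
    feed.vidlog?   # => true if any enclosure looks like video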
1395
+ # True if the feed was not last retrieved from the cache.
1396
+ def live?
1397
+ return @live
1398
+ end
1399
+
1400
+ # True if the feed has expired and must be reacquired from the remote server.
1401
+ def expired?
1402
+ return self.last_retrieved == nil || (self.last_retrieved + self.time_to_live) < Time.now
1403
+ end
1404
+
1405
+ # Forces this feed to expire.
1406
+ def expire
1407
+ self.last_retrieved = Time.mktime(1970)
1408
+ self.save
1409
+ end
1410
+
1411
+ # A hook method that is called during the feed generation process. Overriding this method
1412
+ # will enable additional content to be inserted into the feed.
1413
+ def build_xml_hook(feed_type, version, xml_builder)
1414
+ return nil
1415
+ end
1416
+
1417
+ # Generates xml based on the content of the feed
1418
+ def build_xml(feed_type="rss", version=0.0, xml_builder=Builder::XmlMarkup.new(:indent => 2))
1419
+ if feed_type == "rss" && version == 0.0
1420
+ version = 1.0
1421
+ elsif feed_type == "atom" && version == 0.0
1422
+ version = 0.3
1423
+ end
1424
+ if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1)
1425
+ # RDF-based rss format
1426
+ return xml_builder.tag!("rdf:RDF") do
1427
+ xml_builder.channel("rdf:about" => CGI.escapeHTML(link)) do
1428
+ unless title.nil? || title == ""
1429
+ xml_builder.title(title)
1430
+ else
1431
+ xml_builder.title
1432
+ end
1433
+ unless link.nil? || link == ""
1434
+ xml_builder.link(link)
1435
+ else
1436
+ xml_builder.link
1437
+ end
1438
+ unless image_link.nil? || image_link == ""
1439
+ xml_builder.image("rdf:resource" => CGI.escapeHTML(image_link))
1440
+ end
1441
+ unless description.nil? || description == ""
1442
+ xml_builder.description(description)
1443
+ else
1444
+ xml_builder.description
1445
+ end
1446
+ unless language.nil? || language == ""
1447
+ xml_builder.tag!("dc:language", language)
1448
+ end
1449
+ xml_builder.tag!("syn:updatePeriod", "hourly")
1450
+ xml_builder.tag!("syn:updateFrequency", (time_to_live / 1.hour).to_s)
1451
+ xml_builder.tag!("syn:updateBase", Time.mktime(1970).iso8601)
1452
+ xml_builder.items do
1453
+ xml_builder.tag!("rdf:Seq") do
1454
+ unless items.nil?
1455
+ for item in items
1456
+ if item.link.nil?
1457
+ raise "Cannot generate an rdf-based feed with a nil item link field."
1458
+ end
1459
+ xml_builder.tag!("rdf:li", "rdf:resource" => CGI.escapeHTML(item.link))
1460
+ end
1461
+ end
1462
+ end
1463
+ end
1464
+ build_xml_hook(feed_type, version, xml_builder)
1465
+ end
1466
+ unless image_link.nil? || image_link == ""
1467
+ xml_builder.image("rdf:about" => CGI.escapeHTML(image_link)) do
1468
+ unless title.nil? || title == ""
1469
+ xml_builder.title(title)
1470
+ else
1471
+ xml_builder.title
1472
+ end
1473
+ unless image_link.nil? || image_link == ""
1474
+ xml_builder.url(image_link)
1475
+ end
1476
+ unless link.nil? || link == ""
1477
+ xml_builder.link(link)
1478
+ else
1479
+ xml_builder.link
1480
+ end
1481
+ end
1482
+ end
1483
+ unless items.nil?
1484
+ for item in items
1485
+ item.build_xml(feed_type, version, xml_builder)
1486
+ end
1487
+ end
1488
+ end
1489
+ elsif feed_type == "rss"
1490
+ # normal rss format
1491
+ return xml_builder.rss("version" => version.to_s) do
1492
+ unless title.nil? || title == ""
1493
+ xml_builder.title(title)
1494
+ end
1495
+ unless link.nil? || link == ""
1496
+ xml_builder.link(link)
1497
+ end
1498
+ unless description.nil? || description == ""
1499
+ xml_builder.description(description)
1500
+ end
1501
+ xml_builder.ttl((time_to_live / 1.minute).to_s)
1502
+ xml_builder.generator("http://www.sporkmonger.com/projects/feedtools")
1503
+ build_xml_hook(feed_type, version, xml_builder)
1504
+ unless items.nil?
1505
+ for item in items
1506
+ item.build_xml(feed_type, version, xml_builder)
1507
+ end
1508
+ end
1509
+ end
1510
+ elsif feed_type == "atom"
1511
+ # normal atom format
1512
+ return xml_builder.feed("xmlns" => "http://purl.org/atom/ns#",
1513
+ "version" => version.to_s,
1514
+ "xml:lang" => language) do
1515
+ unless title.nil? || title == ""
1516
+ xml_builder.title(title,
1517
+ "mode" => "escaped",
1518
+ "type" => "text/html")
1519
+ end
1520
+ unless link.nil? || link == ""
1521
+ xml_builder.link("href" => link,
1522
+ "rel" => "alternate",
1523
+ "type" => "text/html",
1524
+ "title" => title)
1525
+ end
1526
+ unless description.nil? || description == ""
1527
+ xml_builder.tagline(description,
1528
+ "mode" => "escaped",
1529
+ "type" => "text/html")
1530
+ end
1531
+ xml_builder.generator("FeedTools",
1532
+ "url" => "http://www.sporkmonger.com/projects/feedtools")
1533
+ build_xml_hook(feed_type, version, xml_builder)
1534
+ unless items.nil?
1535
+ for item in items
1536
+ item.build_xml(feed_type, version, xml_builder)
1537
+ end
1538
+ end
1539
+ end
1540
+ end
1541
+ end
1542
+
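Editorial sketch: build_xml can re-emit the parsed feed in another format; the version argument defaults to 1.0 for "rss" and 0.3 for "atom", and the Builder output string is returned.

    rdf_xml  = feed.build_xml("rss", 1.0)
    atom_xml = feed.build_xml("atom")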
1543
+ # Persists the current feed state to the cache.
1544
+ def save
1545
+ if FeedTools.feed_cache.nil?
1546
+ raise "Caching is currently disabled. Cannot save to cache."
1547
+ elsif self.url.nil?
1548
+ raise "The url field must be set to save to the cache."
1549
+ elsif self.xml_data.nil?
1550
+ raise "The xml_data field must be set to save to the cache."
1551
+ elsif self.cache_object.nil?
1552
+ raise "The cache_object is currently nil. Cannot save to cache."
559
1553
  else
560
- self.time_to_live = nil
1554
+ self.cache_object.url = self.url
1555
+ self.cache_object.title = self.title
1556
+ self.cache_object.link = self.link
1557
+ self.cache_object.xml_data = self.xml_data
1558
+ unless self.http_response.nil?
1559
+ self.cache_object.http_headers = self.http_headers.to_yaml
1560
+ end
1561
+ self.cache_object.last_retrieved = self.last_retrieved
1562
+ self.cache_object.save
1563
+ end
1564
+ end
1565
+
1566
+ alias_method :tagline, :description
1567
+ alias_method :tagline=, :description=
1568
+ alias_method :subtitle, :description
1569
+ alias_method :subtitle=, :description=
1570
+ alias_method :abstract, :description
1571
+ alias_method :abstract=, :description=
1572
+ alias_method :content, :description
1573
+ alias_method :content=, :description=
1574
+ alias_method :ttl, :time_to_live
1575
+ alias_method :ttl=, :time_to_live=
1576
+ alias_method :guid, :id
1577
+ alias_method :guid=, :id=
1578
+ alias_method :entries, :items
1579
+
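Editorial note: the aliases above make the accessors interchangeable, for example:

    feed.entries == feed.items         # => true, same underlying method
    feed.ttl     == feed.time_to_live  # => true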
1580
+ # passes missing methods to the cache_object
1581
+ def method_missing(msg, *params)
1582
+ if self.cache_object.nil?
1583
+ raise NoMethodError, "Invalid method #{msg.to_s}"
1584
+ end
1585
+ return self.cache_object.send(msg, *params)
1586
+ end
1587
+
1588
+ # passes missing methods to the FeedTools.feed_cache
1589
+ def Feed.method_missing(msg, *params)
1590
+ if FeedTools.feed_cache.nil?
1591
+ raise NoMethodError, "Invalid method Feed.#{msg.to_s}"
1592
+ end
1593
+ result = FeedTools.feed_cache.send(msg, *params)
1594
+ if result.kind_of? FeedTools.feed_cache
1595
+ result = Feed.open(result.url)
1596
+ end
1597
+ return result
1598
+ end
1599
+ end
1600
+
1601
+ class FeedItem
1602
+ include REXML
1603
+ include AttributeDictionary
1604
+
1605
+ # This class stores information about a feed item's file enclosures.
1606
+ class Enclosure
1607
+ include AttributeDictionary
1608
+
1609
+ # The url for the enclosure
1610
+ attr_accessor :url
1611
+ # The MIME type of the file referenced by the enclosure
1612
+ attr_accessor :type
1613
+ # The size of the file referenced by the enclosure
1614
+ attr_accessor :file_size
1615
+ # The total play time of the file referenced by the enclosure
1616
+ attr_accessor :duration
1617
+ # The height in pixels of the enclosed media
1618
+ attr_accessor :height
1619
+ # The width in pixels of the enclosed media
1620
+ attr_accessor :width
1621
+ # The bitrate of the enclosed media
1622
+ attr_accessor :bitrate
1623
+ # The framerate of the enclosed media
1624
+ attr_accessor :framerate
1625
+ # The thumbnail for this enclosure
1626
+ attr_accessor :thumbnail
1627
+ # The categories for this enclosure
1628
+ attr_accessor :categories
1629
+ # A hash of the enclosed file
1630
+ attr_accessor :hash
1631
+ # A website containing some kind of media player instead of a direct
1632
+ # link to the media file.
1633
+ attr_accessor :player
1634
+ # A list of credits for the enclosed media
1635
+ attr_accessor :credits
1636
+ # A text rendition of the enclosed media
1637
+ attr_accessor :text
1638
+ # A list of alternate versions of the enclosed media file
1639
+ attr_accessor :versions
1640
+ # The default version of the enclosed media file
1641
+ attr_accessor :default_version
1642
+
1643
+ # Returns true if this is the default enclosure
1644
+ def is_default?
1645
+ return @is_default
1646
+ end
1647
+
1648
+ # Sets whether this is the default enclosure for the media group
1649
+ def is_default=(new_is_default)
1650
+ @is_default = new_is_default
561
1651
  end
562
1652
 
563
- parse_feed_hook(feed_data)
564
- if Feed.cache_enabled?
565
- save
1653
+ # Returns true if the enclosure contains explicit material
1654
+ def explicit?
1655
+ return @explicit
1656
+ end
1657
+
1658
+ # Sets the explicit attribute on the enclosure
1659
+ def explicit=(new_explicit)
1660
+ @explicit = new_explicit
1661
+ end
1662
+
1663
+ # Determines if the object is a sample, or the full version of the
1664
+ # object, or if it is a stream.
1665
+ # Possible values are 'sample', 'full', 'nonstop'.
1666
+ def expression
1667
+ return @expression
1668
+ end
1669
+
1670
+ # Sets the expression attribute on the enclosure.
1671
+ # Allowed values are 'sample', 'full', 'nonstop'.
1672
+ def expression=(new_expression)
1673
+ unless ['sample', 'full', 'nonstop'].include? new_expression.downcase
1674
+ raise ArgumentError,
1675
+ "Permitted values are 'sample', 'full', 'nonstop'."
1676
+ end
1677
+ @expression = new_expression.downcase
566
1678
  end
567
1679
 
568
- # check and make sure we don't have any cached feed_items with a nil link
569
- # if we do, we need to start from scratch to avoid duplicates
570
- for item_link in feed_items.map { |item| item.link }
571
- if item_link.nil?
572
- FeedItem.delete_all("feed_id = '#{self.id}'")
573
- break
1680
+ # Returns true if this enclosure contains audio content
1681
+ def audio?
1682
+ unless self.type.nil?
1683
+ return true if (self.type =~ /^audio/) != nil
1684
+ end
1685
+ # TODO: create a more complete list
1686
+ # =================================
1687
+ audio_extensions = ['mp3', 'm4a', 'm4p', 'wav', 'ogg', 'wma']
1688
+ audio_extensions.each do |extension|
1689
+ if (url =~ /#{extension}$/) != nil
1690
+ return true
1691
+ end
574
1692
  end
1693
+ return false
575
1694
  end
576
1695
 
577
- # parse the feed items
578
- @feed_items_unsorted = []
579
- if items != nil
580
- for item_node in items
581
- @feed_items_unsorted << handle_feed_item(item_node.to_s)
1696
+ # Returns true if this enclosure contains video content
1697
+ def video?
1698
+ unless self.type.nil?
1699
+ return true if (self.type =~ /^video/) != nil
1700
+ return true if self.type == "image/mov"
1701
+ end
1702
+ # TODO: create a more complete list
1703
+ # =================================
1704
+ video_extensions = ['mov', 'mp4', 'avi', 'wmv', 'asf']
1705
+ video_extensions.each do |extension|
1706
+ if (url =~ /#{extension}$/) != nil
1707
+ return true
1708
+ end
582
1709
  end
1710
+ return false
583
1711
  end
584
- return self
585
1712
  end
586
-
587
- # Locates the feed item in the database based on the supplied item xml data.
588
- def find_feed_item_by_data(item_data)
589
- item_node = Document.new(item_data).root
1713
+ EnclosureCategory = Struct.new( "EnclosureCategory", :category, :scheme, :label )
1714
+ EnclosureHash = Struct.new( "EnclosureHash", :hash, :type )
1715
+ EnclosurePlayer = Struct.new( "EnclosurePlayer", :url, :height, :width )
1716
+ EnclosureCredit = Struct.new( "EnclosureCredit", :name, :role )
1717
+ EnclosureThumbnail = Struct.new( "EnclosureThumbnail", :url, :height, :width )
590
1718
 
591
- # get the link
592
- item_link = XPath.first(item_node, "link[@rel='alternate']/@href").to_s
593
- if item_link == ""
594
- item_link = XPath.first(item_node, "link/@href").to_s
1719
+ # Returns the parent feed of this feed item
1720
+ def feed
1721
+ return @feed
1722
+ end
1723
+
1724
+ # Sets the parent feed of this feed item
1725
+ def feed=(new_feed)
1726
+ @feed = new_feed
1727
+ end
1728
+
1729
+ # Returns the feed item's raw xml data.
1730
+ def xml_data
1731
+ return @xml_data
1732
+ end
1733
+
1734
+ # Sets the feed item's xml data.
1735
+ def xml_data=(new_xml_data)
1736
+ @xml_data = new_xml_data
1737
+ end
1738
+
1739
+ # Returns a REXML Document of the xml_data
1740
+ def xml
1741
+ if @xml_doc.nil?
1742
+ @xml_doc = Document.new(xml_data)
1743
+ end
1744
+ return @xml_doc
1745
+ end
1746
+
1747
+ # Returns the first node within the root_node that matches the xpath query.
1748
+ def find_node(xpath)
1749
+ return XPath.first(root_node, xpath)
1750
+ end
1751
+
1752
+ # Returns all nodes within the root_node that match the xpath query.
1753
+ def find_all_nodes(xpath)
1754
+ return XPath.match(root_node, xpath)
1755
+ end
1756
+
1757
+ # Returns the root node of the feed item.
1758
+ def root_node
1759
+ if @root_node.nil?
1760
+ @root_node = xml.root
1761
+ end
1762
+ return @root_node
1763
+ end
1764
+
1765
+ # Returns the feed item title
1766
+ def title
1767
+ if @title.nil?
1768
+ if XPath.first(root_node, "title/@type").to_s == "xhtml" ||
1769
+ XPath.first(root_node, "title/@mode").to_s == "xhtml"
1770
+ @title = XPath.first(root_node, "title").inner_xml
1771
+ elsif XPath.first(root_node, "title/@type").to_s == "escaped" ||
1772
+ XPath.first(root_node, "title/@mode").to_s == "escaped"
1773
+ @title = CGI.unescapeHTML(
1774
+ XPath.first(root_node, "title/text()").to_s)
1775
+ else
1776
+ @title = CGI.unescapeHTML(
1777
+ XPath.first(root_node, "title/text()").to_s)
1778
+ end
1779
+ unless @title.nil?
1780
+ @title = CGI.unescapeHTML(FeedTools.sanitize_html(@title, :strip))
1781
+ end
1782
+ if @title != ""
1783
+ # Some blogging tools include the number of comments in a post
1784
+ # in the title... this is supremely ugly, and breaks any
1785
+ # applications which expect the title to be static, so we're
1786
+ # gonna strip them out.
1787
+ #
1788
+ # If for some incredibly weird reason you need the actual
1789
+ # unstripped title, just use find_node("title/text()").to_s
1790
+ @title = FeedTools.strip_html(
1791
+ @title.strip.gsub(/\[\d*\]$/, "")).strip
1792
+ @title.gsub!(/\n/, " ")
1793
+ end
1794
+ @title = nil if @title == ""
1795
+ end
1796
+ return @title
1797
+ end
1798
+
1799
+ # Sets the feed item title
1800
+ def title=(new_title)
1801
+ @title = new_title
1802
+ end
1803
+
1804
+ # Returns the feed item description
1805
+ def description
1806
+ if @description.nil?
1807
+ # get the item content
1808
+ @description = ""
1809
+ body_node = XPath.first(root_node, "xhtml:body")
1810
+ if body_node == nil
1811
+ body_node = XPath.first(root_node, "body")
1812
+ end
1813
+ if body_node != nil
1814
+ @description = body_node.inner_xml
1815
+ end
1816
+ if @description == ""
1817
+ @description =
1818
+ CGI.unescapeHTML(XPath.first(root_node, "content:encoded/text()").to_s)
1819
+ end
1820
+ if @description == ""
1821
+ begin
1822
+ @description = XPath.first(root_node, "description").cdatas.first.to_s
1823
+ rescue
1824
+ @description = ""
1825
+ end
1826
+ if @description == ""
1827
+ @description = XPath.first(root_node, "description/text()").to_s
1828
+ end
1829
+ if @description != ""
1830
+ if XPath.first(root_node, "description/@encoding").to_s != ""
1831
+ # Not supported... yet.
1832
+ @description = "[Embedded data objects are not supported.]"
1833
+ else
1834
+ @description = CGI.unescapeHTML(@description)
1835
+ end
1836
+ end
1837
+ end
1838
+ if @description == ""
1839
+ @description = XPath.first(root_node, "content/text()").to_s
1840
+ if @description != "" &&
1841
+ (XPath.first(root_node, "content/@mode").to_s == "escaped" ||
1842
+ XPath.first(root_node, "content/@type").to_s == "escaped")
1843
+ @description = CGI.unescapeHTML(@description)
1844
+ end
1845
+ if XPath.first(root_node, "content/@mode").to_s == "xhtml" ||
1846
+ XPath.first(root_node, "content/@type").to_s == "xhtml"
1847
+ @description = XPath.first(root_node, "content").inner_xml
1848
+ end
1849
+ end
1850
+ if @description == ""
1851
+ begin
1852
+ @description = XPath.first(root_node, "description").inner_xml
1853
+ rescue
1854
+ end
1855
+ end
1856
+ if @description == ""
1857
+ @description = self.itunes_summary
1858
+ @description = "" if @description.nil?
1859
+ end
1860
+ if @description == ""
1861
+ @description = self.itunes_subtitle
1862
+ @description = "" if @description.nil?
1863
+ end
1864
+ if @description == ""
1865
+ @description = self.media_text
1866
+ @description = "" if @description.nil?
1867
+ end
1868
+
1869
+ unless @description.nil?
1870
+ @description = FeedTools.sanitize_html(@description)
1871
+ end
1872
+
1873
+ # If it started with a bunch of divs, hack them right off. We can put
1874
+ # them back later if they're needed.
1875
+ @description.gsub!(/^(<div[^>]*>)*/, "")
1876
+ @description.gsub!(/(<\/div>)*$/, "")
1877
+
1878
+ @description.gsub!(/\n/, " ") if @description.size < 80
1879
+ @description = @description.strip unless @description.nil?
1880
+ @description = nil if @description == ""
595
1881
  end
596
- if item_link == ""
597
- item_link = XPath.first(item_node, "link/text()").to_s
1882
+ return @description
1883
+ end
1884
+
1885
+ # Sets the feed item description
1886
+ def description=(new_description)
1887
+ @description = new_description
1888
+ end
1889
+
1890
+ # Returns the feed item link
1891
+ def link
1892
+ if @link.nil?
1893
+ @link = XPath.first(root_node, "link[@rel='alternate']/@href").to_s
1894
+ if @link == ""
1895
+ @link = XPath.first(root_node, "link/@href").to_s
1896
+ end
1897
+ if @link == ""
1898
+ @link = XPath.first(root_node, "link/text()").to_s
1899
+ end
1900
+ if @link == ""
1901
+ @link = XPath.first(root_node, "@rdf:about").to_s
1902
+ end
1903
+ if @link == ""
1904
+ @link = XPath.first(root_node, "guid[@isPermaLink='true']/text()").to_s
1905
+ end
1906
+ if @link == ""
1907
+ if FeedTools.is_url? self.guid
1908
+ @link = self.guid
1909
+ end
1910
+ end
1911
+ if @link != ""
1912
+ @link = CGI.unescapeHTML(@link)
1913
+ end
1914
+ if @link != "" && (@link =~ /http:\/\//) != 0 && (@link =~ /https:\/\//) != 0
1915
+ if (feed.base[-1..-1] == "/" && @link[0..0] == "/")
1916
+ @link = @link[1..-1]
1917
+ end
1918
+ # prepend the base to the link since they seem to have used a relative path
1919
+ @link = feed.base + @link
1920
+ end
1921
+ @link = FeedTools.normalize_url(@link)
598
1922
  end
599
- if item_link == ""
600
- item_link = XPath.first(item_node, "@rdf:about").to_s
1923
+ return @link
1924
+ end
1925
+
1926
+ # Sets the feed item link
1927
+ def link=(new_link)
1928
+ @link = new_link
1929
+ end
1930
+
1931
+ # Returns the feed comment link
1932
+ def comment_link
1933
+ if @comment_link.nil?
1934
+ # get the feed comment link from the xml document
1935
+ @comment_link = XPath.first(root_node, "comments/text()").to_s
1936
+ if @comment_link == ""
1937
+ @comment_link = self.link
1938
+ end
1939
+ @comment_link = FeedTools.normalize_url(@comment_link)
601
1940
  end
602
- if item_link == ""
603
- item_link = XPath.first(item_node, "guid/text()").to_s
1941
+ return @comment_link
1942
+ end
1943
+
1944
+ # Sets the feed comment link
1945
+ def comment_link=(new_comment_link)
1946
+ @comment_link = new_comment_link
1947
+ end
1948
+
1949
+ # Returns the feed image link
1950
+ def image_link
1951
+ if @image_link.nil?
1952
+ # get the feed image link from the xml document
1953
+ if @image_link == ""
1954
+ @image_link = XPath.first(root_node, "link[@type='image/jpeg']/@href").to_s
1955
+ end
1956
+ if @image_link == ""
1957
+ @image_link = XPath.first(root_node, "link[@type='image/gif']/@href").to_s
1958
+ end
1959
+ if @image_link == ""
1960
+ @image_link = XPath.first(root_node, "link[@type='image/png']/@href").to_s
1961
+ end
1962
+ # The following two should technically never occur, but have been included
1963
+ # simply because I've seen both occurring in the wild at least once.
1964
+ if @image_link == ""
1965
+ @image_link = XPath.first(root_node, "image/url/text()").to_s
1966
+ end
1967
+ if @image_link == ""
1968
+ @image_link = XPath.first(root_node, "image/@rdf:resource").to_s
1969
+ end
1970
+ if @image_link == ""
1971
+ # If there's only a media thumbnail, we can just borrow it. Technically, this isn't
1972
+ # ideal, but chances are very good that anything that makes use of this image is
1973
+ # simply not going to care anyhow.
1974
+ @image_link = XPath.first(root_node, "media:thumbnail/@url").to_s
1975
+ if @image_link == ""
1976
+ @media_image_link = @image_link
1977
+ end
1978
+ end
1979
+ if @image_link == ""
1980
+ # If there's only an itunes image, we can just borrow it. See comment above regarding
1981
+ # less-than-ideal-ness.
1982
+ if @itunes_image_link == ""
1983
+ @image_link = XPath.first(root_node, "itunes:image/@href").to_s
1984
+ if @image_link == ""
1985
+ @image_link = XPath.first(root_node, "itunes:link[@rel='image']/@href").to_s
1986
+ end
1987
+ @itunes_image_link = @image_link
1988
+ else
1989
+ @image_link = @itunes_image_link
1990
+ end
1991
+ end
1992
+ @image_link = FeedTools.normalize_url(@image_link)
1993
+ end
1994
+ return @image_link
1995
+ end
1996
+
1997
+ # Sets the feed image link
1998
+ def image_link=(new_image_link)
1999
+ @image_link = new_image_link
2000
+ end
2001
+
2002
+ # Returns the feed item itunes image link
2003
+ #
2004
+ # If it's not present, falls back to the normal image link.
2005
+ # Technically, the itunes spec says that the image needs to be
2006
+ # square and larger than 300x300, but hey, if there's an image
2007
+ # to be had, it's better than none at all.
2008
+ def itunes_image_link
2009
+ if @itunes_image_link.nil?
2010
+ # get the feed item itunes image link from the xml document
2011
+ @itunes_image_link = XPath.first(root_node, "itunes:image/@href").to_s
2012
+ if @itunes_image_link == ""
2013
+ @itunes_image_link = XPath.first(root_node, "itunes:link[@rel='image']/@href").to_s
2014
+ end
2015
+ if @itunes_image_link == ""
2016
+ @itunes_image_link = self.image_link
2017
+ end
2018
+ @itunes_image_link = FeedTools.normalize_url(@itunes_image_link)
2019
+ end
2020
+ return @itunes_image_link
2021
+ end
2022
+
2023
+ # Sets the feed item itunes image link
2024
+ def itunes_image_link=(new_itunes_image_link)
2025
+ @itunes_image_link = new_itunes_image_link
2026
+ end
2027
+
2028
+ # Returns the feed item media thumbnail link
2029
+ #
2030
+ # If it's not present, falls back to the normal image link.
2031
+ def media_thumbnail_link
2032
+ if @media_thumbnail_link.nil?
2033
+ # get the feed item itunes image link from the xml document
2034
+ @media_thumbnail_link = XPath.first(root_node, "media:thumbnail/@url").to_s
2035
+ if @media_thumbnail_link == ""
2036
+ @media_thumbnail_link = image_link
2037
+ end
2038
+ @media_thumbnail_link = FeedTools.normalize_url(@media_thumbnail_link)
604
2039
  end
605
- item_title = XPath.first(item_node, "title/text()").to_s
2040
+ return @media_thumbnail_link
2041
+ end
2042
+
2043
+ # Sets the feed item media thumbnail url
2044
+ def media_thumbnail_link=(new_media_thumbnail_link)
2045
+ @media_thumbnail_link = new_media_thumbnail_link
2046
+ end
606
2047
 
607
- feed_item = FeedItem.find_by_feed_id_and_link(self.id, item_link)
608
- unless feed_item.nil?
609
- # Some blogging tools alter the title of an item when the number of comments change (for
610
- # example, TextPattern) and many email feed dumps use the same link for multiple
611
- # items (for example, GMail). We try to take both of these cases into account here.
612
- existing_title = feed_item.title
613
- item_title = item_title.gsub(/\[\d*\]/,"").strip
614
- existing_title = existing_title.gsub(/\[\d*\]/,"").strip
615
- item_title = item_title.gsub(/\(\d*\)/,"").strip
616
- existing_title = existing_title.gsub(/\(\d*\)/,"").strip
617
- item_title = item_title.gsub(/\{\d*\}/,"").strip
618
- existing_title = existing_title.gsub(/\{\d*\}/,"").strip
619
- if existing_title != item_title
620
- feed_item = nil
2048
+ # Returns the feed items's unique id
2049
+ def id
2050
+ if @id.nil?
2051
+ @id = XPath.first(root_node, "id/text()").to_s
2052
+ if @id == ""
2053
+ @id = XPath.first(root_node, "guid/text()").to_s
621
2054
  end
2055
+ @id = nil if @id == ""
622
2056
  end
623
- return feed_item
2057
+ return @id
624
2058
  end
625
2059
 
626
- def handle_feed_item(item_data)
627
- feed_item = find_feed_item_by_data(item_data)
628
- if feed_item.nil?
629
- feed_item = FeedItem.new
630
- end
631
- feed_item.feed = self
632
- feed_item.parse_item(item_data)
633
- return feed_item
634
- end
635
-
636
- def build_feed_hook(feed_type, version, xml_builder)
637
- return nil
2060
+ # Sets the feed item's unique id
2061
+ def id=(new_id)
2062
+ @id = new_id
638
2063
  end
639
-
640
- def build_feed(feed_type, version=0.0, xml_builder=Builder::XmlMarkup.new(:indent => 2))
641
- if feed_type == "rss" && version == 0.0
642
- version = 1.0
643
- elsif feed_type == "atom" && version == 0.0
644
- version = 0.3
645
- end
646
- if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1)
647
- # RDF-based rss format
648
- return xml_builder.tag!("rdf:RDF") do
649
- xml_builder.channel("rdf:about" => CGI.escapeHTML(link)) do
650
- unless title.nil? || title == ""
651
- xml_builder.title(title)
652
- else
653
- xml_builder.title
2064
+
2065
+ # Returns all feed item enclosures
2066
+ def enclosures
2067
+ if @enclosures.nil?
2068
+ @enclosures = []
2069
+
2070
+ # First, load up all the different possible sources of enclosures
2071
+ rss_enclosures = XPath.match(root_node, "enclosure")
2072
+ atom_enclosures = XPath.match(root_node, "link[@rel='enclosure']")
2073
+ media_content_enclosures = XPath.match(root_node, "media:content")
2074
+ media_group_enclosures = XPath.match(root_node, "media:group")
2075
+
2076
+ # Parse RSS-type enclosures. Thanks to a few buggy enclosures implementations,
2077
+ # sometimes these also manage to show up in atom files.
2078
+ for enclosure_node in rss_enclosures
2079
+ enclosure = Enclosure.new
2080
+ enclosure.url = CGI.unescapeHTML(enclosure_node.attributes["url"].to_s)
2081
+ enclosure.type = enclosure_node.attributes["type"].to_s
2082
+ enclosure.file_size = enclosure_node.attributes["length"].to_i
2083
+ enclosure.credits = []
2084
+ enclosure.explicit = false
2085
+ @enclosures << enclosure
2086
+ end
2087
+
2088
+ # Parse atom-type enclosures. If there are repeats of the same enclosure object,
2089
+ # we merge the two together.
2090
+ for enclosure_node in atom_enclosures
2091
+ enclosure_url = CGI.unescapeHTML(enclosure_node.attributes["href"].to_s)
2092
+ enclosure = nil
2093
+ new_enclosure = false
2094
+ for existing_enclosure in @enclosures
2095
+ if existing_enclosure.url == enclosure_url
2096
+ enclosure = existing_enclosure
2097
+ break
654
2098
  end
655
- unless link.nil? || link == ""
656
- xml_builder.link(link)
657
- else
658
- xml_builder.link
2099
+ end
2100
+ if enclosure.nil?
2101
+ new_enclosure = true
2102
+ enclosure = Enclosure.new
2103
+ end
2104
+ enclosure.url = enclosure_url
2105
+ enclosure.type = enclosure_node.attributes["type"].to_s
2106
+ enclosure.file_size = enclosure_node.attributes["length"].to_i
2107
+ enclosure.credits = []
2108
+ enclosure.explicit = false
2109
+ if new_enclosure
2110
+ @enclosures << enclosure
2111
+ end
2112
+ end
2113
+
2114
+ # Creates an anonymous method to parse content objects from the media module. We
2115
+ # do this to avoid excessive duplication of code since we have to do identical
2116
+ # processing for content objects within group objects.
2117
+ parse_media_content = lambda do |media_content_nodes|
2118
+ affected_enclosures = []
2119
+ for enclosure_node in media_content_nodes
2120
+ enclosure_url = CGI.unescapeHTML(enclosure_node.attributes["url"].to_s)
2121
+ enclosure = nil
2122
+ new_enclosure = false
2123
+ for existing_enclosure in @enclosures
2124
+ if existing_enclosure.url == enclosure_url
2125
+ enclosure = existing_enclosure
2126
+ break
2127
+ end
659
2128
  end
660
- unless image_link.nil? || image_link == ""
661
- xml_builder.image("rdf:resource" => CGI.escapeHTML(image_link))
2129
+ if enclosure.nil?
2130
+ new_enclosure = true
2131
+ enclosure = Enclosure.new
662
2132
  end
663
- unless description.nil? || description == ""
664
- xml_builder.description(description)
665
- else
666
- xml_builder.description
2133
+ enclosure.url = enclosure_url
2134
+ enclosure.type = enclosure_node.attributes["type"].to_s
2135
+ enclosure.file_size = enclosure_node.attributes["fileSize"].to_i
2136
+ enclosure.duration = enclosure_node.attributes["duration"].to_s
2137
+ enclosure.height = enclosure_node.attributes["height"].to_i
2138
+ enclosure.width = enclosure_node.attributes["width"].to_i
2139
+ enclosure.bitrate = enclosure_node.attributes["bitrate"].to_i
2140
+ enclosure.framerate = enclosure_node.attributes["framerate"].to_i
2141
+ enclosure.expression = enclosure_node.attributes["expression"].to_s
2142
+ enclosure.is_default =
2143
+ (enclosure_node.attributes["isDefault"].to_s.downcase == "true")
2144
+ if XPath.first(enclosure_node, "media:thumbnail/@url").to_s != ""
2145
+ enclosure.thumbnail = EnclosureThumbnail.new(
2146
+ CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@url").to_s),
2147
+ CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@height").to_s),
2148
+ CGI.unescapeHTML(XPath.first(enclosure_node, "media:thumbnail/@width").to_s)
2149
+ )
2150
+ if enclosure.thumbnail.height == ""
2151
+ enclosure.thumbnail.height = nil
2152
+ end
2153
+ if enclosure.thumbnail.width == ""
2154
+ enclosure.thumbnail.width = nil
2155
+ end
667
2156
  end
668
- unless language.nil? || language == ""
669
- xml_builder.tag!("dc:language", language)
2157
+ enclosure.categories = []
2158
+ for category in XPath.match(enclosure_node, "media:category")
2159
+ enclosure.categories << EnclosureCategory.new(
2160
+ CGI.unescapeHTML(category.text),
2161
+ CGI.unescapeHTML(category.attributes["scheme"].to_s),
2162
+ CGI.unescapeHTML(category.attributes["label"].to_s)
2163
+ )
2164
+ if enclosure.categories.last.scheme == ""
2165
+ enclosure.categories.last.scheme = nil
2166
+ end
2167
+ if enclosure.categories.last.label == ""
2168
+ enclosure.categories.last.label = nil
2169
+ end
670
2170
  end
671
- xml_builder.tag!("syn:updatePeriod", "hourly")
672
- xml_builder.tag!("syn:updateFrequency", (time_to_live / 1.hour).to_s)
673
- xml_builder.tag!("syn:updateBase", Time.mktime(1970).iso8601)
674
- xml_builder.items do
675
- xml_builder.tag!("rdf:Seq") do
676
- unless feed_items.nil?
677
- for item in feed_items
678
- if item.link.nil?
679
- raise "Cannot generate an rdf-based feed with a nil item link field."
680
- end
681
- xml_builder.tag!("rdf:li", "rdf:resource" => CGI.escapeHTML(item.link))
682
- end
683
- end
2171
+ if XPath.first(enclosure_node, "media:hash/text()").to_s != ""
2172
+ enclosure.hash = EnclosureHash.new(
2173
+ FeedTools.sanitize_html(CGI.unescapeHTML(XPath.first(
2174
+ enclosure_node, "media:hash/text()").to_s), :strip),
2175
+ "md5"
2176
+ )
2177
+ end
2178
+ if XPath.first(enclosure_node, "media:player/@url").to_s != ""
2179
+ enclosure.player = EnclosurePlayer.new(
2180
+ CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@url").to_s),
2181
+ CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@height").to_s),
2182
+ CGI.unescapeHTML(XPath.first(enclosure_node, "media:player/@width").to_s)
2183
+ )
2184
+ if enclosure.player.height == ""
2185
+ enclosure.player.height = nil
2186
+ end
2187
+ if enclosure.player.width == ""
2188
+ enclosure.player.width = nil
2189
+ end
2190
+ end
2191
+ enclosure.credits = []
2192
+ for credit in XPath.match(enclosure_node, "media:credit")
2193
+ enclosure.credits << EnclosureCredit.new(
2194
+ CGI.unescapeHTML(CGI.unescapeHTML(credit.text)),
2195
+ CGI.unescapeHTML(credit.attributes["role"].to_s.downcase)
2196
+ )
2197
+ if enclosure.credits.last.role == ""
2198
+ enclosure.credits.last.role = nil
684
2199
  end
685
2200
  end
686
- build_feed_hook(feed_type, version, xml_builder)
2201
+ enclosure.explicit = (XPath.first(enclosure_node,
2202
+ "media:adult/text()").to_s.downcase == "true")
2203
+ if XPath.first(enclosure_node, "media:text/text()").to_s != ""
2204
+ enclosure.text = CGI.unescapeHTML(XPath.first(enclosure_node,
2205
+ "media:text/text()").to_s)
2206
+ end
2207
+ affected_enclosures << enclosure
2208
+ if new_enclosure
2209
+ @enclosures << enclosure
2210
+ end
687
2211
  end
688
- unless image_link.nil? || image_link == ""
689
- xml_builder.image("rdf:about" => CGI.escapeHTML(image_link)) do
690
- unless title.nil? || title == ""
691
- xml_builder.title(title)
692
- else
693
- xml_builder.title
2212
+ affected_enclosures
2213
+ end
2214
+
2215
+ # Parse the independant content objects.
2216
+ parse_media_content.call(media_content_enclosures)
2217
+
2218
+ media_groups = []
2219
+
2220
+ # Parse the group objects.
2221
+ for media_group in media_group_enclosures
2222
+ group_media_content_enclosures =
2223
+ XPath.match(media_group, "media:content")
2224
+
2225
+ # Parse the content objects within the group objects.
2226
+ affected_enclosures =
2227
+ parse_media_content.call(group_media_content_enclosures)
2228
+
2229
+ # Now make sure that content objects inherit certain properties from
2230
+ # the group objects.
2231
+ for enclosure in affected_enclosures
2232
+ if enclosure.thumbnail.nil? &&
2233
+ XPath.first(media_group, "media:thumbnail/@url").to_s != ""
2234
+ enclosure.thumbnail = EnclosureThumbnail.new(
2235
+ CGI.unescapeHTML(
2236
+ XPath.first(media_group, "media:thumbnail/@url").to_s),
2237
+ CGI.unescapeHTML(
2238
+ XPath.first(media_group, "media:thumbnail/@height").to_s),
2239
+ CGI.unescapeHTML(
2240
+ XPath.first(media_group, "media:thumbnail/@width").to_s)
2241
+ )
2242
+ if enclosure.thumbnail.height == ""
2243
+ enclosure.thumbnail.height = nil
694
2244
  end
695
- unless image_link.nil? || image_link == ""
696
- xml_builder.url(image_link)
2245
+ if enclosure.thumbnail.width == ""
2246
+ enclosure.thumbnail.width = nil
697
2247
  end
698
- unless link.nil? || link == ""
699
- xml_builder.link(link)
700
- else
701
- xml_builder.link
2248
+ end
2249
+ if (enclosure.categories.nil? || enclosure.categories.size == 0)
2250
+ enclosure.categories = []
2251
+ for category in XPath.match(media_group, "media:category")
2252
+ enclosure.categories << EnclosureCategory.new(
2253
+ CGI.unescapeHTML(category.text),
2254
+ CGI.unescapeHTML(category.attributes["scheme"].to_s),
2255
+ CGI.unescapeHTML(category.attributes["label"].to_s)
2256
+ )
2257
+ if enclosure.categories.last.scheme == ""
2258
+ enclosure.categories.last.scheme = nil
2259
+ end
2260
+ if enclosure.categories.last.label == ""
2261
+ enclosure.categories.last.label = nil
2262
+ end
702
2263
  end
703
2264
  end
704
- end
705
- unless feed_items.nil?
706
- for item in feed_items
707
- item.build_feed_item(feed_type, version, xml_builder)
2265
+ if enclosure.hash.nil? &&
2266
+ XPath.first(media_group, "media:hash/text()").to_s != ""
2267
+ enclosure.hash = EnclosureHash.new(
2268
+ CGI.unescapeHTML(XPath.first(media_group, "media:hash/text()").to_s),
2269
+ "md5"
2270
+ )
2271
+ end
2272
+ if enclosure.player.nil? &&
2273
+ XPath.first(media_group, "media:player/@url").to_s != ""
2274
+ enclosure.player = EnclosurePlayer.new(
2275
+ CGI.unescapeHTML(XPath.first(media_group, "media:player/@url").to_s),
2276
+ CGI.unescapeHTML(XPath.first(media_group, "media:player/@height").to_s),
2277
+ CGI.unescapeHTML(XPath.first(media_group, "media:player/@width").to_s)
2278
+ )
2279
+ if enclosure.player.height == ""
2280
+ enclosure.player.height = nil
2281
+ end
2282
+ if enclosure.player.width == ""
2283
+ enclosure.player.width = nil
2284
+ end
2285
+ end
2286
+ if enclosure.credits.nil? || enclosure.credits.size == 0
2287
+ enclosure.credits = []
2288
+ for credit in XPath.match(media_group, "media:credit")
2289
+ enclosure.credits << EnclosureCredit.new(
2290
+ CGI.unescapeHTML(CGI.unescapeHTML(credit.text)),
2291
+ CGI.unescapeHTML(credit.attributes["role"].to_s.downcase)
2292
+ )
2293
+ if enclosure.credits.last.role == ""
2294
+ enclosure.credits.last.role = nil
2295
+ end
2296
+ end
2297
+ end
2298
+ if enclosure.explicit?.nil?
2299
+ enclosure.explicit = (XPath.first(media_group,
2300
+ "media:adult/text()").to_s.downcase == "true") ? true : false
2301
+ end
2302
+ if enclosure.text.nil? &&
2303
+ XPath.first(media_group, "media:text/text()").to_s != ""
2304
+ enclosure.text = FeedTools.sanitize_html(CGI.unescapeHTML(
2305
+ XPath.first(media_group, "media:text/text()").to_s), :strip)
708
2306
  end
709
2307
  end
2308
+
2309
+ # Keep track of the media groups
2310
+ media_groups << affected_enclosures
710
2311
  end
711
- elsif feed_type == "rss"
712
- # normal rss format
713
- return xml_builder.rss("version" => version.to_s) do
714
- unless title.nil? || title == ""
715
- xml_builder.title(title)
716
- end
717
- unless link.nil? || link == ""
718
- xml_builder.link(link)
2312
+
2313
+ # Now we need to inherit any relevant item level information.
2314
+ if self.explicit?
2315
+ for enclosure in @enclosures
2316
+ enclosure.explicit = true
719
2317
  end
720
- unless description.nil? || description == ""
721
- xml_builder.description(description)
2318
+ end
2319
+
2320
+ # Add all the itunes categories
2321
+ for itunes_category in XPath.match(root_node, "itunes:category")
2322
+ genre = "Podcasts"
2323
+ category = itunes_category.attributes["text"].to_s
2324
+ subcategory = XPath.first(itunes_category, "itunes:category/@text").to_s
2325
+ category_path = genre
2326
+ if category != ""
2327
+ category_path << "/" + category
722
2328
  end
723
- xml_builder.ttl((time_to_live / 1.minute).to_s)
724
- xml_builder.generator("http://www.sporkmonger.com/projects/feedtools")
725
- build_feed_hook(feed_type, version, xml_builder)
726
- unless feed_items.nil?
727
- for item in feed_items
728
- item.build_feed_item(feed_type, version, xml_builder)
2329
+ if subcategory != ""
2330
+ category_path << "/" + subcategory
2331
+ end
2332
+ for enclosure in @enclosures
2333
+ if enclosure.categories.nil?
2334
+ enclosure.categories = []
729
2335
  end
2336
+ enclosure.categories << EnclosureCategory.new(
2337
+ CGI.unescapeHTML(category_path),
2338
+ CGI.unescapeHTML("http://www.apple.com/itunes/store/"),
2339
+ CGI.unescapeHTML("iTunes Music Store Categories")
2340
+ )
730
2341
  end
731
2342
  end
732
- elsif feed_type == "atom"
733
- # normal atom format
734
- return xml_builder.feed("xmlns" => "http://purl.org/atom/ns#",
735
- "version" => version.to_s,
736
- "xml:lang" => language) do
737
- unless title.nil? || title == ""
738
- xml_builder.title(title,
739
- "mode" => "escaped",
740
- "type" => "text/html")
2343
+
2344
+ for enclosure in @enclosures
2345
+ # Clean up any of those attributes that incorrectly have ""
2346
+ # or 0 as their values
2347
+ if enclosure.type == ""
2348
+ enclosure.type = nil
741
2349
  end
742
- unless link.nil? || link == ""
743
- xml_builder.link("href" => link,
744
- "rel" => "alternate",
745
- "type" => "text/html",
746
- "title" => title)
2350
+ if enclosure.file_size == 0
2351
+ enclosure.file_size = nil
747
2352
  end
748
- unless description.nil? || description == ""
749
- xml_builder.tagline(description,
750
- "mode" => "escaped",
751
- "type" => "text/html")
2353
+ if enclosure.duration == 0
2354
+ enclosure.duration = nil
752
2355
  end
753
- xml_builder.generator("FeedTools",
754
- "url" => "http://www.sporkmonger.com/projects/feedtools")
755
- build_feed_hook(feed_type, version, xml_builder)
756
- unless feed_items.nil?
757
- for item in feed_items
758
- item.build_feed_item(feed_type, version, xml_builder)
2356
+ if enclosure.height == 0
2357
+ enclosure.height = nil
2358
+ end
2359
+ if enclosure.width == 0
2360
+ enclosure.width = nil
2361
+ end
2362
+ if enclosure.bitrate == 0
2363
+ enclosure.bitrate = nil
2364
+ end
2365
+ if enclosure.framerate == 0
2366
+ enclosure.framerate = nil
2367
+ end
2368
+ if enclosure.expression == "" || enclosure.expression.nil?
2369
+ enclosure.expression = "full"
2370
+ end
2371
+
2372
+ # If an enclosure is missing the text field, fall back on the itunes:summary field
2373
+ if enclosure.text.nil? || enclosure.text = ""
2374
+ enclosure.text = self.itunes_summary
2375
+ end
2376
+
2377
+ # Make sure we don't have duplicate categories
2378
+ unless enclosure.categories.nil?
2379
+ enclosure.categories.uniq!
2380
+ end
2381
+ end
2382
+
2383
+ # And finally, now things get complicated. This is where we make
2384
+ # sure that the enclosures method only returns either default
2385
+ # enclosures or enclosures with only one version. Any enclosures
2386
+ # that are wrapped in a media:group will be placed in the appropriate
2387
+ # versions field.
2388
+ affected_enclosure_urls = []
2389
+ for media_group in media_groups
2390
+ affected_enclosure_urls =
2391
+ affected_enclosure_urls | (media_group.map do |enclosure|
2392
+ enclosure.url
2393
+ end)
2394
+ end
2395
+ @enclosures.delete_if do |enclosure|
2396
+ (affected_enclosure_urls.include? enclosure.url)
2397
+ end
2398
+ for media_group in media_groups
2399
+ default_enclosure = nil
2400
+ for enclosure in media_group
2401
+ if enclosure.is_default?
2402
+ default_enclosure = enclosure
759
2403
  end
760
2404
  end
2405
+ for enclosure in media_group
2406
+ enclosure.default_version = default_enclosure
2407
+ enclosure.versions = media_group.clone
2408
+ enclosure.versions.delete(enclosure)
2409
+ end
2410
+ @enclosures << default_enclosure
761
2411
  end
762
2412
  end
763
- end
764
-
765
- # Saves the current state of the feed to the database unless the feed lacks a remote location
766
- def save
767
- unless url.nil? || url == ""
768
- super
769
- end
770
- end
771
- end
772
2413
 
773
- class FeedItem < ActiveRecord::Base
774
- include REXML
775
-
776
- # Verifies that the required fields exist; additional ones added by the user are fine
777
- def FeedItem.table_exists?
778
- begin
779
- connection.execute "select id, feed_id, link, title, author, description, " +
780
- "time, tags from feed_items limit 1"
781
- rescue ActiveRecord::StatementInvalid
782
- return false
2414
+ # If we have a single enclosure, it's safe to inherit the itunes:duration field
2415
+ # if it's missing.
2416
+ if @enclosures.size == 1
2417
+ if @enclosures.first.duration.nil? || @enclosures.first.duration == 0
2418
+ @enclosures.first.duration = self.duration
2419
+ end
783
2420
  end
784
- return true
2421
+
2422
+ return @enclosures
785
2423
  end
786
2424
 
787
- def feed
788
- if @feed != nil
789
- return @feed
790
- elsif @feed_id != nil
791
- @feed = Feed.find_by_id(self.feed_id)
792
- return @feed
793
- else
794
- return nil
795
- end
2425
+ def enclosures=(new_enclosures)
2426
+ @enclosures = new_enclosures
796
2427
  end
797
2428
 
798
- def feed=(new_feed)
799
- self.feed_id = new_feed.id
800
- @feed = new_feed
801
- end
802
-
803
- def title
804
- return (self['title'] or "Untitled Entry")
2429
+ # Returns the feed item author
2430
+ def author_name
2431
+ # TODO: make this not suck, actually ensure we're looking at a name
2432
+ # and not an email address.
2433
+ # Also, factor in itunes module.
2434
+ # =================================================================
2435
+ if @author_name.nil?
2436
+ @author_name = CGI.unescapeHTML(XPath.first(root_node, "author/name/text()").to_s)
2437
+ if @author_name == ""
2438
+ @author_name = CGI.unescapeHTML(XPath.first(root_node, "dc:creator/text()").to_s)
2439
+ end
2440
+ if @author_name == ""
2441
+ @author_name = CGI.unescapeHTML(XPath.first(root_node, "author/text()").to_s)
2442
+ end
2443
+ end
2444
+ return @author_name
805
2445
  end
806
-
807
- def tag_list
808
- return tags.nil? ? nil : tags[1..-2].split("|")
2446
+
2447
+ # Sets the feed item author
2448
+ def author_name=(new_author_name)
2449
+ @author_name = new_author_name
809
2450
  end
810
-
811
- def tag_list=(new_tag_list)
812
- self.tags = "|" + (new_tag_list.map { |x| x.strip }).join("|") + "|"
2451
+
2452
+ # Returns the contents of the itunes:summary element
2453
+ def itunes_summary
2454
+ if @itunes_summary.nil?
2455
+ @itunes_summary = CGI.unescapeHTML(XPath.first(root_node,
2456
+ "itunes:summary/text()").to_s)
2457
+ if @itunes_summary == ""
2458
+ @itunes_summary = nil
2459
+ end
2460
+ unless @itunes_summary.nil?
2461
+ @itunes_summary = FeedTools.sanitize_html(@itunes_summary)
2462
+ end
2463
+ end
2464
+ return @itunes_summary
813
2465
  end
814
2466
 
815
- def tag_string
816
- return (tags.nil? ? nil : tags[1..-2]).split("|").join(", ")
2467
+ # Sets the contents of the itunes:summary element
2468
+ def itunes_summary=(new_itunes_summary)
2469
+ @itunes_summary = new_itunes_summary
817
2470
  end
818
2471
 
819
- def tag_string=(new_tag_string)
820
- self.tags = "|" + (new_tag_string.split(",").map { |x| x.strip }).join("|") + "|"
2472
+ # Returns the contents of the itunes:subtitle element
2473
+ def itunes_subtitle
2474
+ if @itunes_subtitle.nil?
2475
+ @itunes_subtitle = CGI.unescapeHTML(XPath.first(root_node,
2476
+ "itunes:subtitle/text()").to_s)
2477
+ if @itunes_subtitle == ""
2478
+ @itunes_subtitle = nil
2479
+ end
2480
+ unless @itunes_subtitle.nil?
2481
+ @itunes_subtitle = FeedTools.sanitize_html(@itunes_subtitle)
2482
+ end
2483
+ end
2484
+ return @itunes_subtitle
821
2485
  end
822
2486
 
823
- def parse_feed_item_hook(item_data)
824
- return nil
2487
+ # Sets the contents of the itunes:subtitle element
2488
+ def itunes_subtitle=(new_itunes_subtitle)
2489
+ @itunes_subtitle = new_itunes_subtitle
825
2490
  end
826
2491
 
827
- def parse_item(item_data)
828
- item_node = Document.new(item_data).root
829
-
830
- # get the feed base, in case the feed items use relative paths
831
- base = feed.link
832
-
833
- # get the link
834
- link = XPath.first(item_node, "link[@rel='alternate']/@href").to_s
835
- if link == ""
836
- link = XPath.first(item_node, "link/@href").to_s
837
- end
838
- if link == ""
839
- link = XPath.first(item_node, "link/text()").to_s
840
- end
841
- if link == ""
842
- link = XPath.first(item_node, "@rdf:about").to_s
843
- end
844
- if link == ""
845
- link = XPath.first(item_node, "guid/text()").to_s
846
- end
847
- if link != ""
848
- link = CGI.unescapeHTML(link)
849
- end
850
- if link != "" && (link =~ /http:\/\//) != 0 && (link =~ /https:\/\//) != 0
851
- # ensure that we don't end up with 'http://www.foobar.com//path/to/entry'
852
- # future-proofed this so that it doesn't break when Ruby 1.9/2.0 starts
853
- # returning single character Strings instead of FixNums
854
- if (base[-1] == 47 && link[0] == 47) || (base[-1] == "/" && link[0] == "/")
855
- link = link[1..-1]
2492
+ # Returns the contents of the media:text element
2493
+ def media_text
2494
+ if @media_text.nil?
2495
+ @media_text = CGI.unescapeHTML(XPath.first(root_node,
2496
+ "itunes:subtitle/text()").to_s)
2497
+ if @media_text == ""
2498
+ @media_text = nil
2499
+ end
2500
+ unless @media_text.nil?
2501
+ @media_text = FeedTools.sanitize_html(@media_text)
856
2502
  end
857
- # prepend the base to the link since they seem to have used a relative path
858
- link = base + link
859
- end
860
-
861
- title = XPath.first(item_node, "title/text()").to_s
862
- if title != ""
863
- # some blogging tools (notably TextPattern I believe) include the number of
864
- # comments in a post in the title... this is ugly, so we're gonna strip them out
865
- title = title.gsub(/\[\d*\]/,"").strip
866
- end
867
-
868
- # get the item author
869
- author = CGI.unescapeHTML(XPath.first(item_node, "author/name/text()").to_s)
870
- if author == ""
871
- author = CGI.unescapeHTML(XPath.first(item_node, "dc:creator/text()").to_s)
872
- end
873
- if author == ""
874
- author = CGI.unescapeHTML(XPath.first(item_node, "author/text()").to_s)
875
2503
  end
2504
+ return @media_text
2505
+ end
876
2506
 
877
- # get the item content
878
- description = ""
879
- body = XPath.first(item_node, "xhtml:body")
880
- if body == nil
881
- body = XPath.first(item_node, "body")
882
- end
883
- if body != nil
884
- description = body.inner_xml
885
- end
886
- if description == ""
887
- description = CGI.unescapeHTML(XPath.first(item_node, "content:encoded/text()").to_s)
2507
+ # Sets the contents of the media:text element
2508
+ def media_text=(new_media_text)
2509
+ @media_text = new_media_text
2510
+ end
2511
+
2512
+ # Returns the contents of the itunes:author element
2513
+ #
2514
+ # This inherits from any incorrectly placed channel-level itunes:author
2515
+ # elements. They're actually amazingly common. People don't read specs.
2516
+ def itunes_author
2517
+ if @itunes_author.nil?
2518
+ @itunes_author = CGI.unescapeHTML(XPath.first(root_node,
2519
+ "itunes:author/text()").to_s)
2520
+ if @itunes_author == ""
2521
+ @itunes_author = CGI.unescapeHTML(XPath.first(feed.channel_node,
2522
+ "itunes:author/text()").to_s)
2523
+ end
2524
+ if @itunes_author == ""
2525
+ @itunes_author = nil
2526
+ end
888
2527
  end
889
- if description == ""
890
- description = XPath.first(item_node, "description/text()").to_s
891
- if description != ""
892
- if XPath.first(item_node, "description/@encoding").to_s != ""
893
- description = "[Embedded data objects are not supported.]"
894
- else
895
- description = CGI.unescapeHTML(description)
2528
+ return @itunes_author
2529
+ end
2530
+
2531
+ # Sets the contents of the itunes:author element
2532
+ def itunes_author=(new_itunes_author)
2533
+ @itunes_author = new_itunes_author
2534
+ end
2535
+
2536
+ # Returns the number of seconds that the associated media runs for
2537
+ def duration
2538
+ if @duration.nil?
2539
+ itunes_duration = CGI.unescapeHTML(XPath.first(root_node,
2540
+ "itunes:duration/text()").to_s)
2541
+ if itunes_duration != ""
2542
+ hms = itunes_duration.split(":").map { |x| x.to_i }
2543
+ if hms.size == 3
2544
+ @duration = hms[0].hour + hms[1].minute + hms[2]
2545
+ elsif hms.size == 2
2546
+ @duration = hms[0].minute + hms[1]
2547
+ elsif hms.size == 1
2548
+ @duration = hms[0]
896
2549
  end
897
2550
  end
898
2551
  end
899
- if description == ""
900
- description = XPath.first(item_node,"content/text()").to_s
901
- if description != "" && XPath.first(item_node, "content/@mode").to_s == "escaped"
902
- description = CGI.unescapeHTML(description)
2552
+ return @duration
2553
+ end
2554
+
2555
+ # Sets the number of seconds that the associated media runs for
2556
+ def duration=(new_duration)
2557
+ @duration = new_duration
2558
+ end
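The duration reader above turns an itunes:duration string into seconds, leaning on ActiveSupport's Fixnum#hour and #minute extensions (active_record is required at the top of the file). A plain-Ruby sketch of the same arithmetic, with made-up sample values:

  def duration_in_seconds(itunes_duration)
    hms = itunes_duration.split(":").map { |x| x.to_i }
    case hms.size
    when 3 then hms[0] * 3600 + hms[1] * 60 + hms[2]
    when 2 then hms[0] * 60 + hms[1]
    when 1 then hms[0]
    end
  end

  duration_in_seconds("1:02:30")  # => 3750
  duration_in_seconds("12:07")    # => 727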
2559
+
2560
+ # Sets the itunes:summary
2561
+ def itunes_summary=(new_itunes_summary)
2562
+ @itunes_summary = new_itunes_summary
+ end
2563
+
2564
+ # Returns the feed item time
2565
+ def time
2566
+ if @time.nil?
2567
+ time_string = XPath.first(root_node, "pubDate/text()").to_s
2568
+ if time_string == ""
2569
+ time_string = XPath.first(root_node, "dc:date/text()").to_s
2570
+ end
2571
+ if time_string == ""
2572
+ time_string = XPath.first(root_node, "issued/text()").to_s
2573
+ end
2574
+ if time_string != ""
2575
+ @time = Time.parse(time_string) rescue Time.now
2576
+ else
2577
+ @time = Time.now
903
2578
  end
904
2579
  end
905
-
906
- # get the item time
907
- time = XPath.first(item_node, "pubDate/text()").to_s
908
- if time == ""
909
- time = XPath.first(item_node, "dc:date/text()").to_s
910
- end
911
- if time == ""
912
- time = XPath.first(item_node, "issued/text()").to_s
913
- end
914
-
915
- # get the item tags
916
- tags_array = []
917
- if tags_array == nil || tags_array.size == 0
918
- tags_array = []
919
- tag_list = XPath.match(item_node, "dc:subject/rdf:Bag/rdf:li/text()")
920
- if tag_list.size > 1
921
- for tag in tag_list
922
- tags_array << tag.to_s.downcase.strip
2580
+ return @time
2581
+ end
2582
+
2583
+ # Sets the feed item time
2584
+ def time=(new_time)
2585
+ @time = new_time
2586
+ end
2587
+
2588
+ # Returns the feed item tags
2589
+ def tags
2590
+ # TODO: support the rel="tag" microformat
2591
+ # =======================================
2592
+ if @tags.nil?
2593
+ @tags = []
2594
+ if @tags.nil? || @tags.size == 0
2595
+ @tags = []
2596
+ tag_list = XPath.match(root_node, "dc:subject/rdf:Bag/rdf:li/text()")
2597
+ if tag_list.size > 1
2598
+ for tag in tag_list
2599
+ @tags << tag.to_s.downcase.strip
2600
+ end
923
2601
  end
924
2602
  end
925
- end
926
- if tags_array == nil || tags_array.size == 0
927
- tags_array = []
928
- tag_list = XPath.match(item_node, "category/text()")
929
- if tag_list.size > 1
2603
+ if @tags.nil? || @tags.size == 0
2604
+ # messy effort to find ourselves some tags, mainly for del.icio.us
2605
+ @tags = []
2606
+ rdf_bag = XPath.match(root_node, "taxo:topics/rdf:Bag/rdf:li")
2607
+ if rdf_bag != nil && rdf_bag.size > 0
2608
+ for tag_node in rdf_bag
2609
+ begin
2610
+ tag_url = XPath.first(tag_node, "@resource").to_s
2611
+ tag_match = tag_url.scan(/\/(tag|tags)\/(\w+)/)
2612
+ if tag_match.size > 0
2613
+ @tags << tag_match.first.last.downcase.strip
2614
+ end
2615
+ rescue
2616
+ end
2617
+ end
2618
+ end
2619
+ end
2620
+ if @tags.nil? || @tags.size == 0
2621
+ @tags = []
2622
+ tag_list = XPath.match(root_node, "category/text()")
930
2623
  for tag in tag_list
931
- tags_array << tag.to_s.downcase.strip
2624
+ @tags << tag.to_s.downcase.strip
932
2625
  end
933
2626
  end
934
- end
935
- if tags_array == nil || tags_array.size == 0
936
- tags_array = []
937
- tag_list = XPath.match(item_node, "dc:subject/text()")
938
- if tag_list.size > 1
2627
+ if @tags.nil? || @tags.size == 0
2628
+ @tags = []
2629
+ tag_list = XPath.match(root_node, "dc:subject/text()")
939
2630
  for tag in tag_list
940
- tags_array << tag.to_s.downcase.strip
2631
+ @tags << tag.to_s.downcase.strip
941
2632
  end
942
2633
  end
943
- end
944
- if tags_array == nil || tags_array.size == 0
945
- tags_array = XPath.first(item_node,
946
- "category/text()").to_s.downcase.split(" ")
947
- end
948
- if tags_array == nil || tags_array.size == 0
949
- begin
950
- tags_array = XPath.first(item_node,
951
- "dc:subject/text()").to_s.downcase.split(" ")
952
- rescue
953
- tags_array = []
954
- end
955
- end
956
- if tags_array == nil || tags_array.size == 0
957
- tags_array = []
958
- rdf_bag = XPath.match(item_node,
959
- "taxo:topics/rdf:Bag/rdf:li")
960
- if rdf_bag != nil && rdf_bag.size > 0
961
- for tag_node in rdf_bag
962
- begin
963
- tag_url = XPath.first(tag_node, "@resource").to_s
964
- tag_match = tag_url.scan(/\/(tag|tags)\/(\w+)/)
965
- if tag_match.size > 0
966
- tags_array << tag_match.first.last.downcase.strip
967
- end
968
- rescue
969
- end
2634
+ if @tags.nil? || @tags.size == 0
2635
+ begin
2636
+ @tags = XPath.first(root_node, "itunes:keywords/text()").to_s.downcase.split(" ")
2637
+ rescue
2638
+ @tags = []
970
2639
  end
971
2640
  end
2641
+ if @tags.nil?
2642
+ @tags = []
2643
+ end
2644
+ @tags.uniq!
972
2645
  end
2646
+ return @tags
2647
+ end
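The del.icio.us branch above recovers tag names from taxo:topics resource URLs with the /\/(tag|tags)\/(\w+)/ pattern. A small sketch with an assumed URL (not taken from any real feed):

  tag_url = "http://del.icio.us/tag/Ruby"
  tag_match = tag_url.scan(/\/(tag|tags)\/(\w+)/)
  tag_match.first.last.downcase.strip  # => "ruby"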
973
2648
 
974
- # set all of the properties
975
- if link != ""
976
- self.link = link
977
- else
978
- self.link = nil
979
- end
980
- if title != ""
981
- self.title = title
982
- end
983
- if description != ""
984
- self.description = description.strip
985
- end
986
- if time != ""
987
- self.time = Time.parse(time) rescue Time.now
988
- elsif @time == nil
989
- self.time = Time.now
990
- end
991
- if tags_array.size > 0
992
- self.tag_list = tags_array
993
- end
994
- parse_feed_item_hook(item_data)
995
- if Feed.cache_enabled?
996
- save
2649
+ # Sets the feed item tags
2650
+ def tags=(new_tags)
2651
+ @tags = new_tags
2652
+ end
2653
+
2654
+ # Returns true if this feed item contains explicit material. If the whole
2655
+ # feed has been marked as explicit, this will return true even if the item
2656
+ # isn't explicitly marked as explicit.
2657
+ def explicit?
2658
+ if @explicit.nil?
2659
+ if XPath.first(root_node,
2660
+ "media:adult/text()").to_s.downcase == "true" ||
2661
+ XPath.first(root_node,
2662
+ "itunes:explicit/text()").to_s.downcase == "yes" ||
2663
+ XPath.first(root_node,
2664
+ "itunes:explicit/text()").to_s.downcase == "true" ||
2665
+ feed.explicit
2666
+ @explicit = true
2667
+ else
2668
+ @explicit = false
2669
+ end
997
2670
  end
998
- return self
2671
+ return @explicit
999
2672
  end
1000
2673
 
1001
- def build_feed_item_hook(feed_type, version, xml_builder)
2674
+ # Sets whether or not the feed item contains explicit material
2675
+ def explicit=(new_explicit)
2676
+ @explicit = (new_explicit ? true : false)
1002
2677
  end
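explicit? above treats media:adult == "true", itunes:explicit == "yes" or "true", or an explicit parent feed as explicit, and caches the answer; explicit= simply overrides that cache. A hedged usage sketch, where item stands in for a parsed feed item:

  item.explicit?         # lazily derived from media:adult / itunes:explicit, or the feed
  item.explicit = true   # manual override, coerced to true/false
  item.explicit?         # => true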
1003
2678
 
1004
- def build_feed_item(feed_type, version, xml_builder=Builder::XmlMarkup.new(:indent => 2))
2679
+ # A hook method that is called during the feed generation process. Overriding this method
2680
+ # will enable additional content to be inserted into the feed.
2681
+ def build_xml_hook(feed_type, version, xml_builder)
2682
+ return nil
2683
+ end
2684
+
2685
+ # Generates xml based on the content of the feed item
2686
+ def build_xml(feed_type="rss", version=0.0, xml_builder=Builder::XmlMarkup.new(:indent => 2))
1005
2687
  if feed_type == "rss" && (version == 0.9 || version == 1.0 || version == 1.1)
1006
2688
  # RDF-based rss format
1007
2689
  if link.nil?
@@ -1026,15 +2708,17 @@ module FeedTools
1026
2708
  unless time.nil?
1027
2709
  xml_builder.tag!("dc:date", time.iso8601)
1028
2710
  end
1029
- unless tags.nil?
2711
+ unless tags.nil? || tags.size == 0
1030
2712
  xml_builder.tag!("dc:subject") do
1031
2713
  xml_builder.tag!("rdf:Bag") do
1032
- for tag in tag_list
2714
+ for tag in tags
1033
2715
  xml_builder.tag!("rdf:li", tag)
1034
2716
  end
1035
2717
  end
1036
2718
  end
2719
+ xml_builder.tag!("itunes:keywords", tags.join(" "))
1037
2720
  end
2721
+ build_xml_hook(feed_type, version, xml_builder)
1038
2722
  end
1039
2723
  elsif feed_type == "rss"
1040
2724
  # normal rss format
@@ -1051,15 +2735,17 @@ module FeedTools
1051
2735
  unless time.nil?
1052
2736
  xml_builder.pubDate(time.rfc822)
1053
2737
  end
1054
- unless tags.nil?
2738
+ unless tags.nil? || tags.size == 0
1055
2739
  xml_builder.tag!("dc:subject") do
1056
2740
  xml_builder.tag!("rdf:Bag") do
1057
- for tag in tag_list
2741
+ for tag in tags
1058
2742
  xml_builder.tag!("rdf:li", tag)
1059
2743
  end
1060
2744
  end
1061
2745
  end
2746
+ xml_builder.tag!("itunes:keywords", tags.join(" "))
1062
2747
  end
2748
+ build_xml_hook(feed_type, version, xml_builder)
1063
2749
  end
1064
2750
  elsif feed_type == "atom"
1065
2751
  # normal atom format
@@ -1083,31 +2769,34 @@ module FeedTools
1083
2769
  unless time.nil?
1084
2770
  xml_builder.issued(time.iso8601)
1085
2771
  end
1086
- unless tags.nil?
1087
- for tag in tag_list
2772
+ unless tags.nil? || tags.size == 0
2773
+ for tag in tags
1088
2774
  xml_builder.category(tag)
1089
2775
  end
1090
2776
  end
2777
+ build_xml_hook(feed_type, version, xml_builder)
1091
2778
  end
1092
2779
  end
1093
2780
  end
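Because build_xml hands the live Builder object to build_xml_hook in every format branch, extra elements can be appended from a subclass. A sketch under the assumption that the item class shown here is exposed as FeedTools::FeedItem (its name does not appear in this hunk) and that the example namespace is declared elsewhere:

  class CustomFeedItem < FeedTools::FeedItem
    # Append a hypothetical element after the standard ones.
    def build_xml_hook(feed_type, version, xml_builder)
      xml_builder.tag!("example:custom", "extra data") if feed_type == "rss"
      return nil
    end
  end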
1094
-
1095
- # Saves the current state of the feed item to the database unless the feed lacks
1096
- # a remote location
1097
- def save
1098
- unless feed.nil? || feed.url.nil? || feed.url == ""
1099
- super
1100
- end
1101
- end
2781
+
2782
+ alias_method :tagline, :description
2783
+ alias_method :tagline=, :description=
2784
+ alias_method :subtitle, :description
2785
+ alias_method :subtitle=, :description=
2786
+ alias_method :abstract, :description
2787
+ alias_method :abstract=, :description=
2788
+ alias_method :content, :description
2789
+ alias_method :content=, :description=
2790
+ alias_method :guid, :id
2791
+ alias_method :guid=, :id=
1102
2792
  end
1103
2793
  end
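The alias_method block near the end of the class keeps older accessor names working: tagline, subtitle, abstract, and content all resolve to description, and guid resolves to id. Assumed usage, with item standing in for a parsed feed item:

  item.description = "<p>Hello</p>"
  item.content          # => "<p>Hello</p>" via the alias
  item.guid == item.id  # => true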
1104
2794
 
1105
- module REXML
1106
- class Element
1107
- # small extension to REXML to simplify parsing of xhtml feed items
1108
- def inner_xml
2795
+ module REXML #:nodoc:
2796
+ class Element #:nodoc:
2797
+ def inner_xml #:nodoc:
1109
2798
  result = ""
1110
- each_child do |child|
2799
+ self.each_child do |child|
1111
2800
  result << child.to_s
1112
2801
  end
1113
2802
  return result
@@ -1116,11 +2805,8 @@ module REXML
1116
2805
  end
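The inner_xml extension above concatenates each child's serialized form, which is how xhtml:body and escaped atom content are captured as markup rather than flattened to text. A small sketch (works once this file has been loaded, since inner_xml is added here):

  require 'rexml/document'

  doc = REXML::Document.new("<body><p>Hi <b>there</b></p></body>")
  doc.root.inner_xml  # => "<p>Hi <b>there</b></p>"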
1117
2806
 
1118
2807
  begin
1119
- FeedTools::Feed.prepare_connection
1120
- unless FeedTools::Feed.cache_exists?
1121
- FeedTools::Feed.create_cache
2808
+ unless FeedTools.feed_cache.nil?
2809
+ FeedTools.feed_cache.initialize_cache
1122
2810
  end
1123
2811
  rescue
1124
- # Nothing can be done until someone sets up the database connection.
1125
- # We'll just assume for now that the user will take care of that.
1126
2812
  end