feedtools 0.2.22 → 0.2.23

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,31 @@
1
+ == FeedTools 0.2.23
2
+ * autodiscovery implemented
3
+ * now knows a title from a hole in the ground
4
+ * now resolves relative urls when possible
5
+ * changed default table name to "cached_feeds" to avoid name collisions
6
+ * schema now uses "href" instead of "url"
7
+ * feed cache is set to nil by default now
8
+ * both summary and content elements are generated now
9
+ * now supports proxies
10
+ * now supports internationalized domain names if libidn is installed
11
+ * fixed bug with feed merging referencing a method that was refactored
12
+ * no longer dies if uuidtools gem is missing but the UUID class is defined
13
+ * updated timestamp handling and generation
14
+ * added support for entry sorting on any feed item field
15
+ * added support for disabling entry sorting entirely
16
+ * fixed issue with itunes categories
17
+ * fixed itunes subtitle/summary
18
+ * fixed entry assignment bug
19
+ * fixed issued/published variable name mix-up
20
+ * added support for the payload module
21
+ * added support for xhtml:div elements
22
+ * dc:date now preempts pubDate
23
+ * added better support for the scriptingNews format
24
+ * now correctly strips out wrapper div elements from text constructs
25
+ * fixed issue with some atom links being incorrectly identified as images
26
+ * reorganized some of the helper modules
27
+ * made some portions of url normalization case insensitive
28
+ * fixed issue with filename handling on Windows
1
29
  == FeedTools 0.2.22
2
30
  * fixed another atom generation error
3
31
  == FeedTools 0.2.21
data/README CHANGED
@@ -1,7 +1,9 @@
1
- FeedTools was designed to be a simple XML feed parser, generator, and translator with a built-in
2
- caching system.
1
+ FeedTools was designed to be a simple XML feed parser, generator, and
2
+ translator with a built-in caching system.
3
3
 
4
4
  == Example
5
+ require 'feed_tools'
6
+
5
7
  slashdot_feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
6
8
  slashdot_feed.title
7
9
  => "Slashdot"
@@ -11,3 +13,22 @@
11
13
  => "http://slashdot.org/"
12
14
  slashdot_feed.items.first.find_node("slash:hitparade/text()").to_s
13
15
  => "43,37,28,23,11,3,1"
16
+
17
+ == Installation
18
+ You can install FeedTools as a gem:
19
+ gem install feedtools
20
+
21
+ Or you can download the tarball or zip package from the download page
22
+ and then extract it to your vendors directory as you would with any other
23
+ Ruby library.
24
+
25
+ After installation, you will either need to run in non-caching mode or set
26
+ up a caching mechanism. The database feed cache system currently included
27
+ with FeedTools is the most common caching method. To set up the database
28
+ feed cache, you will first need to create the appropriate database schema.
29
+ Schema files for MySQL, PostgreSQL, and SQLite have been included, but the
30
+ preferred method of creating the schema within the Rails environment is with
31
+ a migration file. A migration file has been supplied with FeedTools and can
32
+ be found in the db directory. Run
33
+ <tt>script/generate migration add_feed_tools_tables</tt> and then copy and
34
+ paste the contents of db/migration.rb into your new migration file.
data/db/migration.rb ADDED
@@ -0,0 +1,19 @@
1
+ class AddFeedToolsTables < ActiveRecord::Migration
2
+ def self.up
3
+ puts "Adding cached feeds table..."
4
+ create_table :cached_feeds do |t|
5
+ t.column :href, :string
6
+ t.column :title, :string
7
+ t.column :link, :string
8
+ t.column :feed_data, :text
9
+ t.column :feed_data_type, :string
10
+ t.column :http_headers, :text
11
+ t.column :last_retrieved, :datetime
12
+ end
13
+ end
14
+
15
+ def self.down
16
+ puts "Dropping cached feeds table..."
17
+ drop_table :cached_feeds
18
+ end
19
+ end
data/db/schema.mysql.sql CHANGED
@@ -1,7 +1,7 @@
1
1
  -- Example MySQL schema
2
2
  CREATE TABLE `feeds` (
3
3
  `id` int(10) unsigned NOT NULL auto_increment,
4
- `url` varchar(255) default NULL,
4
+ `href` varchar(255) default NULL,
5
5
  `title` varchar(255) default NULL,
6
6
  `link` varchar(255) default NULL,
7
7
  `feed_data` longtext default NULL,
@@ -1,7 +1,7 @@
1
1
  -- Example PostgreSQL schema
2
2
  CREATE TABLE feeds (
3
3
  id SERIAL PRIMARY KEY NOT NULL,
4
- url varchar(255) default NULL,
4
+ href varchar(255) default NULL,
5
5
  title varchar(255) default NULL,
6
6
  link varchar(255) default NULL,
7
7
  feed_data text default NULL,
data/db/schema.sqlite.sql CHANGED
@@ -1,7 +1,7 @@
1
1
  -- Example Sqlite schema
2
2
  CREATE TABLE feeds (
3
3
  id INTEGER PRIMARY KEY NOT NULL,
4
- url VARCHAR(255) DEFAULT NULL,
4
+ href VARCHAR(255) DEFAULT NULL,
5
5
  title VARCHAR(255) DEFAULT NULL,
6
6
  link VARCHAR(255) DEFAULT NULL,
7
7
  feed_data TEXT DEFAULT NULL,
data/lib/feed_tools.rb CHANGED
@@ -32,7 +32,7 @@ FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] ||
32
32
  ENV['RAILS_ENV'] ||
33
33
  'development' # :nodoc:
34
34
 
35
- FEED_TOOLS_VERSION = "0.2.22"
35
+ FEED_TOOLS_VERSION = "0.2.23"
36
36
 
37
37
  FEED_TOOLS_NAMESPACES = {
38
38
  "admin" => "http://webns.net/mvcb/",
@@ -40,8 +40,9 @@ FEED_TOOLS_NAMESPACES = {
40
40
  "annotate" => "http://purl.org/rss/1.0/modules/annotate/",
41
41
  "atom10" => "http://www.w3.org/2005/Atom",
42
42
  "atom03" => "http://purl.org/atom/ns#",
43
- # "atom-blog" => "http://purl.org/atom-blog/ns#",
43
+ "atom-blog" => "http://purl.org/atom-blog/ns#",
44
44
  "audio" => "http://media.tangent.org/rss/1.0/",
45
+ "bitTorrent" =>"http://www.reallysimplesyndication.com/bitTorrentRssModule",
45
46
  "blogChannel" => "http://backend.userland.com/blogChannelModule",
46
47
  "blogger" => "http://www.blogger.com/atom/ns#",
47
48
  "cc" => "http://web.resource.org/cc/",
@@ -61,20 +62,24 @@ FEED_TOOLS_NAMESPACES = {
61
62
  "itunes" => "http://www.itunes.com/dtds/podcast-1.0.dtd",
62
63
  "l" => "http://purl.org/rss/1.0/modules/link/",
63
64
  "media" => "http://search.yahoo.com/mrss",
65
+ "p" => "http://purl.org/net/rss1.1/payload#",
64
66
  "pingback" => "http://madskills.com/public/xml/rss/module/pingback/",
65
67
  "prism" => "http://prismstandard.org/namespaces/1.2/basic/",
66
68
  "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
67
69
  "rdfs" => "http://www.w3.org/2000/01/rdf-schema#",
68
70
  "ref" => "http://purl.org/rss/1.0/modules/reference/",
69
71
  "reqv" => "http://purl.org/rss/1.0/modules/richequiv/",
72
+ "rss09" => "http://my.netscape.com/rdf/simple/0.9/",
70
73
  "rss10" => "http://purl.org/rss/1.0/",
74
+ "rss11" => "http://purl.org/net/rss1.1#",
75
+ "rss20" => "http://backend.userland.com/rss2",
71
76
  "search" => "http://purl.org/rss/1.0/modules/search/",
72
77
  "slash" => "http://purl.org/rss/1.0/modules/slash/",
73
78
  "soap" => "http://schemas.xmlsoap.org/soap/envelope/",
74
79
  "ss" => "http://purl.org/rss/1.0/modules/servicestatus/",
75
80
  "str" => "http://hacks.benhammersley.com/rss/streaming/",
76
81
  "sub" => "http://purl.org/rss/1.0/modules/subscription/",
77
- "sy" => "http://purl.org/rss/1.0/modules/syndication/",
82
+ "syn" => "http://purl.org/rss/1.0/modules/syndication/",
78
83
  "taxo" => "http://purl.org/rss/1.0/modules/taxonomy/",
79
84
  "thr" => "http://purl.org/rss/1.0/modules/threading/",
80
85
  "ti" => "http://purl.org/rss/1.0/modules/textinput/",
@@ -91,7 +96,7 @@ $:.unshift(File.dirname(__FILE__) + "/feed_tools/vendor")
91
96
  begin
92
97
  begin
93
98
  require 'iconv'
94
- rescue LoadError
99
+ rescue Object
95
100
  warn("The Iconv library does not appear to be installed properly. " +
96
101
  "FeedTools cannot function properly without it.")
97
102
  raise
@@ -101,11 +106,15 @@ begin
101
106
 
102
107
  require_gem('builder', '>= 1.2.4')
103
108
 
109
+ # Preload optional libraries.
104
110
  begin
105
111
  require 'tidy'
106
- rescue LoadError
107
- # Ignore the error for now.
112
+ rescue Object
108
113
  end
114
+ begin
115
+ require 'idn'
116
+ rescue Object
117
+ end
109
118
 
110
119
  require 'feed_tools/vendor/htree'
111
120
 
@@ -126,11 +135,25 @@ begin
126
135
 
127
136
  require_gem('activesupport', '>= 1.1.1')
128
137
  require_gem('activerecord', '>= 1.11.1')
129
- require_gem('uuidtools', '>= 0.1.2')
130
138
 
139
+ begin
140
+ require_gem('uuidtools', '>= 0.1.2')
141
+ rescue Gem::LoadError
142
+ begin
143
+ require 'uuidtools'
144
+ rescue Object
145
+ raise unless defined? UUID
146
+ end
147
+ end
148
+
131
149
  require 'feed_tools/feed'
132
150
  require 'feed_tools/feed_item'
151
+ require 'feed_tools/feed_structures'
133
152
  require 'feed_tools/database_feed_cache'
153
+
154
+ require 'feed_tools/helpers/html_helper'
155
+ require 'feed_tools/helpers/xml_helper'
156
+ require 'feed_tools/helpers/uri_helper'
134
157
  rescue LoadError
135
158
  # ActiveSupport will very likely mess this up. So drop a warn so that the
136
159
  # programmer can figure it out if things get weird and unpredictable.
@@ -159,19 +182,27 @@ module FeedTools
159
182
 
160
183
  def FeedTools.load_configurations
161
184
  if @configurations.blank?
185
+ # TODO: Load this from a config file.
162
186
  config_hash = {}
163
187
  @configurations = {
164
- :feed_cache => "FeedTools::DatabaseFeedCache",
188
+ :feed_cache => nil,
189
+ :proxy_address => nil,
190
+ :proxy_port => nil,
165
191
  :user_agent => "FeedTools/#{FEED_TOOLS_VERSION} " +
166
192
  "+http://www.sporkmonger.com/projects/feedtools/",
167
193
  :generator_name => "FeedTools/#{FEED_TOOLS_VERSION}",
168
194
  :generator_href => "http://www.sporkmonger.com/projects/feedtools/",
169
- :tidy_enabled => false,
195
+ :tidy_enabled => true,
170
196
  :tidy_options => {},
197
+ :idn_enabled => true,
198
+ :sanitization_enabled => true,
171
199
  :sanitize_with_nofollow => true,
200
+ :always_strip_wrapper_elements => true,
172
201
  :timestamp_estimation_enabled => true,
173
202
  :url_normalization_enabled => true,
203
+ :entry_sorting_property => "time",
174
204
  :strip_comment_count => false,
205
+ :tab_spaces => 2,
175
206
  :max_ttl => 3.days.to_s,
176
207
  :output_encoding => "utf-8"
177
208
  }.merge(config_hash)
@@ -236,6 +267,9 @@ module FeedTools
236
267
  cache_class = eval(class_name)
237
268
  if cache_class.kind_of?(Class)
238
269
  @feed_cache = cache_class
270
+ if @feed_cache.respond_to? :initialize_cache
271
+ @feed_cache.initialize_cache
272
+ end
239
273
  return cache_class
240
274
  else
241
275
  return nil
@@ -258,376 +292,7 @@ module FeedTools
258
292
  rescue
259
293
  return false
260
294
  end
261
- end
262
-
263
- # Returns true if the html tidy module can be used.
264
- #
265
- # Obviously, you need the tidy gem installed in order to run with html
266
- # tidy features turned on.
267
- #
268
- # This method does a fairly complicated, and probably unnecessarily
269
- # desperate search for the libtidy library. If you want this thing to
270
- # execute fast, the best thing to do is to set Tidy.path ahead of time.
271
- # If Tidy.path is set, this method doesn't do much. If it's not set,
272
- # it will do its darnedest to find the libtidy library. If you set
273
- # the LIBTIDYPATH environment variable to the libtidy library, it should
274
- # be able to find it.
275
- #
276
- # Once the library is located, this method will run much faster.
277
- def FeedTools.tidy_enabled?
278
- # This is an override variable to keep tidy from being used even if it
279
- # is available.
280
- if FeedTools.configurations[:tidy_enabled] == false
281
- return false
282
- end
283
- if @tidy_enabled.nil? || @tidy_enabled == false
284
- @tidy_enabled = false
285
- begin
286
- require 'tidy'
287
- if Tidy.path.nil?
288
- # *Shrug*, just brute force it, I guess. There's a lot of places
289
- # this thing might be hiding in, depending on platform and general
290
- # sanity of the person who installed the thing. Most of these are
291
- # probably unlikely, but it's not like checking unlikely locations
292
- # hurts. Much. Especially if you actually find it.
293
- libtidy_locations = [
294
- '/usr/local/lib/libtidy.dylib',
295
- '/opt/local/lib/libtidy.dylib',
296
- '/usr/lib/libtidy.dylib',
297
- '/usr/local/lib/tidylib.dylib',
298
- '/opt/local/lib/tidylib.dylib',
299
- '/usr/lib/tidylib.dylib',
300
- '/usr/local/lib/tidy.dylib',
301
- '/opt/local/lib/tidy.dylib',
302
- '/usr/lib/tidy.dylib',
303
- '/usr/local/lib/libtidy.so',
304
- '/opt/local/lib/libtidy.so',
305
- '/usr/lib/libtidy.so',
306
- '/usr/local/lib/tidylib.so',
307
- '/opt/local/lib/tidylib.so',
308
- '/usr/lib/tidylib.so',
309
- '/usr/local/lib/tidy.so',
310
- '/opt/local/lib/tidy.so',
311
- '/usr/lib/tidy.so',
312
- 'C:\Program Files\Tidy\tidy.dll',
313
- 'C:\Tidy\tidy.dll',
314
- 'C:\Ruby\bin\tidy.dll',
315
- 'C:\Ruby\tidy.dll',
316
- '/usr/local/lib',
317
- '/opt/local/lib',
318
- '/usr/lib'
319
- ]
320
- # We just made this thing up, but if someone sets it, we'll
321
- # go ahead and check it
322
- unless ENV['LIBTIDYPATH'].nil?
323
- libtidy_locations =
324
- libtidy_locations.reverse.push(ENV['LIBTIDYPATH'])
325
- end
326
- for path in libtidy_locations
327
- if File.exists? path
328
- if File.ftype(path) == "file"
329
- Tidy.path = path
330
- @tidy_enabled = true
331
- break
332
- elsif File.ftype(path) == "directory"
333
- # Ok, now perhaps we're getting a bit more desperate
334
- lib_paths =
335
- `find #{path} -name '*tidy*' | grep '\\.\\(so\\|dylib\\)$'`
336
- # If there's more than one, grab the first one and
337
- # hope for the best, and if it doesn't work, then blame the
338
- # user for not specifying more accurately.
339
- tidy_path = lib_paths.split("\n").first
340
- unless tidy_path.nil?
341
- Tidy.path = tidy_path
342
- @tidy_enabled = true
343
- break
344
- end
345
- end
346
- end
347
- end
348
- # Still couldn't find it.
349
- unless @tidy_enabled
350
- @tidy_enabled = false
351
- end
352
- else
353
- @tidy_enabled = true
354
- end
355
- rescue LoadError
356
- # Tidy not installed, disable features that rely on tidy.
357
- @tidy_enabled = false
358
- end
359
- end
360
- return @tidy_enabled
361
- end
362
-
363
- # Attempts to ensure that the passed url is valid and sane. Accepts very, very ugly urls
364
- # and makes every effort to figure out what it was supposed to be. Also translates from
365
- # the feed: and rss: pseudo-protocols to the http: protocol.
366
- def FeedTools.normalize_url(url)
367
- if url.nil? || url == ""
368
- return nil
369
- end
370
- normalized_url = url.strip
371
-
372
- # if a url begins with the '/' character, it only makes sense that they
373
- # meant to be using a file:// url. Fix it for them.
374
- if normalized_url.length > 0 && normalized_url[0..0] == "/"
375
- normalized_url = "file://" + normalized_url
376
- end
377
-
378
- # if a url begins with a drive letter followed by a colon, we're looking at
379
- # a file:// url. Fix it for them.
380
- if normalized_url.length > 0 &&
381
- normalized_url.scan(/^[a-zA-Z]:[\\\/]/).size > 0
382
- normalized_url = "file:///" + normalized_url
383
- end
384
-
385
- # if a url begins with javascript:, it's quite possibly an attempt at
386
- # doing something malicious. Let's keep that from getting anywhere,
387
- # shall we?
388
- if (normalized_url.downcase =~ /javascript:/) != nil
389
- return "#"
390
- end
391
-
392
- # deal with all of the many ugly possibilities involved in the rss:
393
- # and feed: pseudo-protocols (incidentally, whose crazy idea was this
394
- # mess?)
395
- normalized_url.gsub!(/^http:\/*(feed:\/*)?/, "http://")
396
- normalized_url.gsub!(/^http:\/*(rss:\/*)?/, "http://")
397
- normalized_url.gsub!(/^feed:\/*(http:\/*)?/, "http://")
398
- normalized_url.gsub!(/^rss:\/*(http:\/*)?/, "http://")
399
- normalized_url.gsub!(/^file:\/*/, "file:///")
400
- normalized_url.gsub!(/^https:\/*/, "https://")
401
- # fix (very) bad urls (usually of the user-entered sort)
402
- normalized_url.gsub!(/^http:\/*(http:\/*)*/, "http://")
403
-
404
- if (normalized_url =~ /^file:/) == 0
405
- # Adjust windows-style urls
406
- normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/, 'file:///\1:')
407
- normalized_url.gsub!(/\\/, '/')
408
- else
409
- if (normalized_url =~ /https?:\/\//) == nil
410
- normalized_url = "http://" + normalized_url
411
- end
412
- if normalized_url == "http://"
413
- return nil
414
- end
415
- begin
416
- feed_uri = URI.parse(normalized_url)
417
- if feed_uri.scheme == nil
418
- feed_uri.scheme = "http"
419
- end
420
- if feed_uri.path == nil || feed_uri.path == ""
421
- feed_uri.path = "/"
422
- end
423
- if (feed_uri.path =~ /^[\/]+/) == 0
424
- feed_uri.path.gsub!(/^[\/]+/, "/")
425
- end
426
- feed_uri.host.downcase!
427
- normalized_url = feed_uri.to_s
428
- rescue URI::InvalidURIError
429
- end
430
- end
431
-
432
- # We can't do a proper set of escaping, so this will
433
- # have to do.
434
- normalized_url.gsub!(/%20/, " ")
435
- normalized_url.gsub!(/ /, "%20")
436
-
437
- return normalized_url
438
- end
439
-
440
- # Converts a url into a tag uri
441
- def FeedTools.build_tag_uri(url, date)
442
- unless url.kind_of? String
443
- raise ArgumentError, "Expected String, got #{url.class.name}"
444
- end
445
- unless date.kind_of? Time
446
- raise ArgumentError, "Expected Time, got #{date.class.name}"
447
- end
448
- tag_uri = normalize_url(url)
449
- unless FeedTools.is_uri?(tag_uri)
450
- raise ArgumentError, "Must supply a valid URL."
451
- end
452
- host = URI.parse(tag_uri).host
453
- tag_uri.gsub!(/^(http|ftp|file):\/*/, "")
454
- tag_uri.gsub!(/#/, "/")
455
- tag_uri = "tag:#{host},#{date.strftime('%Y-%m-%d')}:" +
456
- "#{tag_uri[(tag_uri.index(host) + host.size)..-1]}"
457
- return tag_uri
458
- end
459
-
460
- # Converts a url into a urn:uuid: uri
461
- def FeedTools.build_urn_uri(url)
462
- unless url.kind_of? String
463
- raise ArgumentError, "Expected String, got #{url.class.name}"
464
- end
465
- normalized_url = normalize_url(url)
466
- require 'uuidtools'
467
- return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri_string
468
- end
469
-
470
- # Returns true if the parameter appears to be a valid uri
471
- def FeedTools.is_uri?(url)
472
- return false if url.nil?
473
- begin
474
- uri = URI.parse(url)
475
- if uri.scheme.nil? || uri.scheme == ""
476
- return false
477
- end
478
- rescue URI::InvalidURIError
479
- return false
480
- end
481
- return true
482
- end
483
-
484
- # Escapes all html entities
485
- def FeedTools.escape_entities(html)
486
- return nil if html.nil?
487
- escaped_html = CGI.escapeHTML(html)
488
- escaped_html.gsub!(/'/, "&apos;")
489
- escaped_html.gsub!(/"/, "&quot;")
490
- return escaped_html
491
- end
492
-
493
- # Unescapes all html entities
494
- def FeedTools.unescape_entities(html)
495
- return nil if html.nil?
496
- unescaped_html = html
497
- unescaped_html.gsub!(/&#x26;/, "&amp;")
498
- unescaped_html.gsub!(/&#38;/, "&amp;")
499
- unescaped_html = CGI.unescapeHTML(unescaped_html)
500
- unescaped_html.gsub!(/&apos;/, "'")
501
- unescaped_html.gsub!(/&quot;/, "\"")
502
- return unescaped_html
503
- end
504
-
505
- # Removes all html tags from the html formatted text.
506
- def FeedTools.strip_html(html)
507
- return nil if html.nil?
508
- # TODO: do this properly
509
- # ======================
510
- stripped_html = html.gsub(/<\/?[^>]+>/, "")
511
- return stripped_html
512
- end
513
-
514
- # Tidys up the html
515
- def FeedTools.tidy_html(html, options = {})
516
- return nil if html.nil?
517
- if FeedTools.tidy_enabled?
518
- is_fragment = true
519
- html.gsub!(/&lt;!'/, "&amp;lt;!'")
520
- if (html.strip =~ /<html>(.|\n)*<body>/) != nil ||
521
- (html.strip =~ /<\/body>(.|\n)*<\/html>$/) != nil
522
- is_fragment = false
523
- end
524
- if (html.strip =~ /<\?xml(.|\n)*\?>/) != nil
525
- is_fragment = false
526
- end
527
- tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
528
- tidy.options.output_xml = true
529
- tidy.options.numeric_entities = true
530
- tidy.options.markup = true
531
- tidy.options.indent = false
532
- tidy.options.wrap = 0
533
- tidy.options.logical_emphasis = true
534
- # TODO: Make this match the actual encoding of the feed
535
- # =====================================================
536
- tidy.options.input_encoding = "utf8"
537
- tidy.options.output_encoding = "ascii"
538
- tidy.options.ascii_chars = false
539
- tidy.options.doctype = "omit"
540
- xml = tidy.clean(html)
541
- xml
542
- end
543
- if is_fragment
544
- # Tidy sticks <html>...<body>[our html]</body>...</html> in.
545
- # We don't want this.
546
- tidy_html.strip!
547
- tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
548
- tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
549
- tidy_html.strip!
550
- end
551
- tidy_html.gsub!(/&#x26;/, "&amp;")
552
- tidy_html.gsub!(/&#38;/, "&amp;")
553
- tidy_html.gsub!(/\320\262\320\202\342\204\242/, "\342\200\231")
554
-
555
- else
556
- tidy_html = html
557
- end
558
- if tidy_html.blank? && !html.blank?
559
- tidy_html = html.strip
560
- end
561
- return tidy_html
562
- end
563
-
564
- # Removes all dangerous html tags from the html formatted text.
565
- # If mode is set to :escape, dangerous and unknown elements will
566
- # be escaped. If mode is set to :strip, dangerous and unknown
567
- # elements and all children will be removed entirely.
568
- # Dangerous or unknown attributes are always removed.
569
- def FeedTools.sanitize_html(html, mode=:strip)
570
- return nil if html.nil?
571
-
572
- # Lists borrowed from Mark Pilgrim's feedparser
573
- acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
574
- 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
575
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
576
- 'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
577
- 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
578
- 'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
579
- 'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
580
- 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
581
- 'u', 'ul', 'var']
582
-
583
- acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
584
- 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
585
- 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
586
- 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
587
- 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
588
- 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
589
- 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
590
- 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
591
- 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
592
- 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
593
- 'type', 'usemap', 'valign', 'value', 'vspace', 'width']
594
-
595
- # Replace with appropriate named entities
596
- html.gsub!(/&#x26;/, "&amp;")
597
- html.gsub!(/&#38;/, "&amp;")
598
- html.gsub!(/&lt;!'/, "&amp;lt;!'")
599
-
600
- # Hackity hack. But it works, and it seems plenty fast enough.
601
- html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
602
-
603
- sanitize_node = lambda do |html_node|
604
- if html_node.respond_to? :children
605
- for child in html_node.children
606
- if child.kind_of? REXML::Element
607
- unless acceptable_elements.include? child.name.downcase
608
- if mode == :strip
609
- html_node.delete_element(child)
610
- else
611
- new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
612
- html_node.insert_after(child, new_child)
613
- html_node.delete_element(child)
614
- end
615
- end
616
- for attribute in child.attributes.keys
617
- unless acceptable_attributes.include? attribute.downcase
618
- child.delete_attribute(attribute)
619
- end
620
- end
621
- end
622
- sanitize_node.call(child)
623
- end
624
- end
625
- html_node
626
- end
627
- sanitize_node.call(html_doc.root)
628
- html = html_doc.root.inner_xml
629
- return html
630
- end
295
+ end
631
296
 
632
297
  # Creates a merged "planet" feed from a set of urls.
633
298
  #
@@ -637,7 +302,7 @@ module FeedTools
637
302
  # in conjunction with the DatabaseFeedCache as it will
638
303
  # open multiple connections to the database.
639
304
  def FeedTools.build_merged_feed(url_array, options = {})
640
- validate_options([ :multi_threaded ],
305
+ FeedTools::GenericHelper.validate_options([ :multi_threaded ],
641
306
  options.keys)
642
307
  options = { :multi_threaded => false }.merge(options)
643
308
  return nil if url_array.nil?
@@ -930,19 +595,37 @@ module REXML # :nodoc:
930
595
  result << child.to_s
931
596
  end
932
597
  end
933
- return result
598
+ return result.strip
934
599
  end
600
+ else
601
+ warn("inner_xml method already exists.")
935
602
  end
936
603
 
937
- unless REXML::Element.public_instance_methods.include? :base_uri
938
- def base_uri # :nodoc:
939
- if not attribute('xml:base')
604
+ def base_uri # :nodoc:
605
+ begin
606
+ base_attribute = FeedTools::XmlHelper.try_xpaths(self, [
607
+ '@xml:base'
608
+ ])
609
+ if parent == nil || parent.kind_of?(REXML::Document)
610
+ return nil if base_attribute == nil
611
+ return base_attribute.value
612
+ end
613
+ if base_attribute != nil && parent == nil
614
+ return base_attribute.value
615
+ elsif parent != nil && base_attribute == nil
940
616
  return parent.base_uri
941
- elsif parent
942
- return URI.join(parent.base_uri, attribute('xml:base').value).to_s
943
- else
944
- return (attribute('xml:base').value or '')
617
+ elsif parent != nil && base_attribute != nil
618
+ parent_base_uri = parent.base_uri
619
+ if parent_base_uri != nil
620
+ uri = URI.parse(parent_base_uri)
621
+ return (uri + base_attribute.value).to_s
622
+ else
623
+ return base_attribute.value
624
+ end
945
625
  end
626
+ return nil
627
+ rescue
628
+ return nil
946
629
  end
947
630
  end
948
631
  end