feedtools 0.2.22 → 0.2.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,31 @@
+ == FeedTools 0.2.23
+ * autodiscovery implemented
+ * now knows a title from a hole in the ground
+ * now resolves relative urls when possible
+ * changed default table name to "cached_feeds" to avoid name collisions
+ * schema now uses "href" instead of "url"
+ * feed cache is set to nil by default now
+ * both summary and content elements are generated now
+ * now supports proxies
+ * now supports internationalized domain names if libidn is installed
+ * fixed bug with feed merging referencing a method that was refactored
+ * no longer dies if uuidtools gem is missing but the UUID class is defined
+ * updated timestamp handling and generation
+ * added support for entry sorting on any feed item field
+ * added support for disabling entry sorting entirely
+ * fixed issue with itunes categories
+ * fixed itunes subtitle/summary
+ * fixed entry assignment bug
+ * fixed issued/published variable name mix-up
+ * added support for the payload module
+ * added support for xhtml:div elements
+ * dc:date now preempts pubDate
+ * added better support for the scriptingNews format
+ * now correctly strips out wrapper div elements from text constructs
+ * fixed issue with some atom links being incorrectly identified as images
+ * reorganized some of the helper modules
+ * made some portions of url normalization case insensitive
+ * fixed issue with filename handling on Windows
  == FeedTools 0.2.22
  * fixed another atom generation error
  == FeedTools 0.2.21
data/README CHANGED
@@ -1,7 +1,9 @@
- FeedTools was designed to be a simple XML feed parser, generator, and translator with a built-in
- caching system.
+ FeedTools was designed to be a simple XML feed parser, generator, and
+ translator with a built-in caching system.

  == Example
+ require 'feed_tools'
+
  slashdot_feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
  slashdot_feed.title
  => "Slashdot"
@@ -11,3 +13,22 @@
  => "http://slashdot.org/"
  slashdot_feed.items.first.find_node("slash:hitparade/text()").to_s
  => "43,37,28,23,11,3,1"
+
+ == Installation
+ You can install FeedTools as a gem:
+ gem install feedtools
+
+ Or you can install it from the tarball or zip packages on the download page
+ and then extract it to your vendors directory as you would with any other
+ Ruby library.
+
+ After installation, you will either need to run in non-caching mode or set
+ up a caching mechanism. The database feed cache system currently included
+ with FeedTools is the most common caching method. To set up the database
+ feed cache, you will first need to create the appropriate database schema.
+ Schema files for MySQL, PostgreSQL, and SQLite have been included, but the
+ preferred method of creating the schema within the Rails environment is with
+ a migration file. A migration file has been supplied with FeedTools and can
+ be found in the db directory. Run
+ <tt>script/generate migration add_feed_tools_tables</tt> and then copy and
+ paste the contents of db/migration.rb into your new migration file.
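
The caching choice described above comes down to a single configuration value. As the changes to lib/feed_tools.rb later in this diff show, the default :feed_cache is now nil (no caching), and the database cache is selected by naming its class. A rough sketch, assuming the FeedTools.configurations hash can be adjusted at runtime the way other code in this release reads it; this is an illustration rather than documented API:

  require 'feed_tools'

  # Default in 0.2.23: no caching at all.
  FeedTools.configurations[:feed_cache] = nil

  # Or, once the cached_feeds table from db/migration.rb exists,
  # point FeedTools at the bundled database cache by class name.
  FeedTools.configurations[:feed_cache] = "FeedTools::DatabaseFeedCache"

  slashdot_feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
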
data/db/migration.rb ADDED
@@ -0,0 +1,19 @@
+ class AddFeedToolsTables < ActiveRecord::Migration
+   def self.up
+     puts "Adding cached feeds table..."
+     create_table :cached_feeds do |t|
+       t.column :href, :string
+       t.column :title, :string
+       t.column :link, :string
+       t.column :feed_data, :text
+       t.column :feed_data_type, :string
+       t.column :http_headers, :text
+       t.column :last_retrieved, :datetime
+     end
+   end
+
+   def self.down
+     puts "Dropping cached feeds table..."
+     drop_table :cached_feeds
+   end
+ end
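
Inside Rails this class is meant to be pasted into a generated migration and run with the usual rake tasks. Purely as a sketch, it can also be applied outside Rails with ActiveRecord directly; the adapter, database name, and require path below are placeholder assumptions:

  require 'rubygems'
  require 'active_record'
  require 'db/migration'           # adjust to wherever db/migration.rb was copied

  ActiveRecord::Base.establish_connection(
    :adapter  => 'sqlite3',        # placeholder; use your adapter
    :database => 'feeds.db'        # placeholder database
  )

  AddFeedToolsTables.up            # creates the cached_feeds table shown above
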
data/db/schema.mysql.sql CHANGED
@@ -1,7 +1,7 @@
  -- Example MySQL schema
  CREATE TABLE `feeds` (
  `id` int(10) unsigned NOT NULL auto_increment,
- `url` varchar(255) default NULL,
+ `href` varchar(255) default NULL,
  `title` varchar(255) default NULL,
  `link` varchar(255) default NULL,
  `feed_data` longtext default NULL,
data/db/schema.postgresql.sql CHANGED
@@ -1,7 +1,7 @@
  -- Example PostgreSQL schema
  CREATE TABLE feeds (
  id SERIAL PRIMARY KEY NOT NULL,
- url varchar(255) default NULL,
+ href varchar(255) default NULL,
  title varchar(255) default NULL,
  link varchar(255) default NULL,
  feed_data text default NULL,
data/db/schema.sqlite.sql CHANGED
@@ -1,7 +1,7 @@
  -- Example Sqlite schema
  CREATE TABLE feeds (
  id INTEGER PRIMARY KEY NOT NULL,
- url VARCHAR(255) DEFAULT NULL,
+ href VARCHAR(255) DEFAULT NULL,
  title VARCHAR(255) DEFAULT NULL,
  link VARCHAR(255) DEFAULT NULL,
  feed_data TEXT DEFAULT NULL,
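
Note that while the changelog says the default table name is now "cached_feeds" and the column is now "href", these example schema files still create a table named feeds; only db/migration.rb uses the new name. If you are upgrading a cache table created under 0.2.22, a hypothetical migration along these lines (not shipped with the gem, and rename support varies by database adapter) would bring it in line with what migration.rb now creates:

  class RenameFeedToolsCache < ActiveRecord::Migration   # hypothetical
    def self.up
      rename_table  :feeds, :cached_feeds
      rename_column :cached_feeds, :url, :href
    end

    def self.down
      rename_column :cached_feeds, :href, :url
      rename_table  :cached_feeds, :feeds
    end
  end
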
data/lib/feed_tools.rb CHANGED
@@ -32,7 +32,7 @@ FEED_TOOLS_ENV = ENV['FEED_TOOLS_ENV'] ||
  ENV['RAILS_ENV'] ||
  'development' # :nodoc:

- FEED_TOOLS_VERSION = "0.2.22"
+ FEED_TOOLS_VERSION = "0.2.23"

  FEED_TOOLS_NAMESPACES = {
  "admin" => "http://webns.net/mvcb/",
@@ -40,8 +40,9 @@ FEED_TOOLS_NAMESPACES = {
  "annotate" => "http://purl.org/rss/1.0/modules/annotate/",
  "atom10" => "http://www.w3.org/2005/Atom",
  "atom03" => "http://purl.org/atom/ns#",
- # "atom-blog" => "http://purl.org/atom-blog/ns#",
+ "atom-blog" => "http://purl.org/atom-blog/ns#",
  "audio" => "http://media.tangent.org/rss/1.0/",
+ "bitTorrent" =>"http://www.reallysimplesyndication.com/bitTorrentRssModule",
  "blogChannel" => "http://backend.userland.com/blogChannelModule",
  "blogger" => "http://www.blogger.com/atom/ns#",
  "cc" => "http://web.resource.org/cc/",
@@ -61,20 +62,24 @@ FEED_TOOLS_NAMESPACES = {
  "itunes" => "http://www.itunes.com/dtds/podcast-1.0.dtd",
  "l" => "http://purl.org/rss/1.0/modules/link/",
  "media" => "http://search.yahoo.com/mrss",
+ "p" => "http://purl.org/net/rss1.1/payload#",
  "pingback" => "http://madskills.com/public/xml/rss/module/pingback/",
  "prism" => "http://prismstandard.org/namespaces/1.2/basic/",
  "rdf" => "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
  "rdfs" => "http://www.w3.org/2000/01/rdf-schema#",
  "ref" => "http://purl.org/rss/1.0/modules/reference/",
  "reqv" => "http://purl.org/rss/1.0/modules/richequiv/",
+ "rss09" => "http://my.netscape.com/rdf/simple/0.9/",
  "rss10" => "http://purl.org/rss/1.0/",
+ "rss11" => "http://purl.org/net/rss1.1#",
+ "rss20" => "http://backend.userland.com/rss2",
  "search" => "http://purl.org/rss/1.0/modules/search/",
  "slash" => "http://purl.org/rss/1.0/modules/slash/",
  "soap" => "http://schemas.xmlsoap.org/soap/envelope/",
  "ss" => "http://purl.org/rss/1.0/modules/servicestatus/",
  "str" => "http://hacks.benhammersley.com/rss/streaming/",
  "sub" => "http://purl.org/rss/1.0/modules/subscription/",
- "sy" => "http://purl.org/rss/1.0/modules/syndication/",
+ "syn" => "http://purl.org/rss/1.0/modules/syndication/",
  "taxo" => "http://purl.org/rss/1.0/modules/taxonomy/",
  "thr" => "http://purl.org/rss/1.0/modules/threading/",
  "ti" => "http://purl.org/rss/1.0/modules/textinput/",
@@ -91,7 +96,7 @@ $:.unshift(File.dirname(__FILE__) + "/feed_tools/vendor")
  begin
  begin
  require 'iconv'
- rescue LoadError
+ rescue Object
  warn("The Iconv library does not appear to be installed properly. " +
  "FeedTools cannot function properly without it.")
  raise
@@ -101,11 +106,15 @@ begin

  require_gem('builder', '>= 1.2.4')

+ # Preload optional libraries.
  begin
  require 'tidy'
- rescue LoadError
- # Ignore the error for now.
+ rescue Object
  end
+ begin
+ require 'idn'
+ rescue Object
+ end

  require 'feed_tools/vendor/htree'

@@ -126,11 +135,25 @@ begin

  require_gem('activesupport', '>= 1.1.1')
  require_gem('activerecord', '>= 1.11.1')
- require_gem('uuidtools', '>= 0.1.2')

+ begin
+ require_gem('uuidtools', '>= 0.1.2')
+ rescue Gem::LoadError
+ begin
+ require 'uuidtools'
+ rescue Object
+ raise unless defined? UUID
+ end
+ end
+
  require 'feed_tools/feed'
  require 'feed_tools/feed_item'
+ require 'feed_tools/feed_structures'
  require 'feed_tools/database_feed_cache'
+
+ require 'feed_tools/helpers/html_helper'
+ require 'feed_tools/helpers/xml_helper'
+ require 'feed_tools/helpers/uri_helper'
  rescue LoadError
  # ActiveSupport will very likely mess this up. So drop a warn so that the
  # programmer can figure it out if things get wierd and unpredictable.
@@ -159,19 +182,27 @@ module FeedTools

  def FeedTools.load_configurations
  if @configurations.blank?
+ # TODO: Load this from a config file.
  config_hash = {}
  @configurations = {
- :feed_cache => "FeedTools::DatabaseFeedCache",
+ :feed_cache => nil,
+ :proxy_address => nil,
+ :proxy_port => nil,
  :user_agent => "FeedTools/#{FEED_TOOLS_VERSION} " +
  "+http://www.sporkmonger.com/projects/feedtools/",
  :generator_name => "FeedTools/#{FEED_TOOLS_VERSION}",
  :generator_href => "http://www.sporkmonger.com/projects/feedtools/",
- :tidy_enabled => false,
+ :tidy_enabled => true,
  :tidy_options => {},
+ :idn_enabled => true,
+ :sanitization_enabled => true,
  :sanitize_with_nofollow => true,
+ :always_strip_wrapper_elements => true,
  :timestamp_estimation_enabled => true,
  :url_normalization_enabled => true,
+ :entry_sorting_property => "time",
  :strip_comment_count => false,
+ :tab_spaces => 2,
  :max_ttl => 3.days.to_s,
  :output_encoding => "utf-8"
  }.merge(config_hash)
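
Most of the new defaults line up with changelog entries: proxy support, libidn-backed internationalized domain names, sanitization, wrapper-div stripping, and entry sorting, with tidy now on by default. A rough sketch of overriding a few of them at runtime; it assumes the configurations hash can simply be mutated (as other code in this file does when reading it), and the specific values are placeholders rather than documented API:

  require 'feed_tools'

  FeedTools.configurations[:proxy_address] = "proxy.example.com"   # placeholder host
  FeedTools.configurations[:proxy_port]    = 8080                  # placeholder port
  FeedTools.configurations[:tidy_enabled]  = false                 # skip the libtidy lookup

  # The changelog mentions sorting on any feed item field and disabling
  # sorting entirely; the exact values accepted here are an assumption.
  FeedTools.configurations[:entry_sorting_property] = "title"

  feed = FeedTools::Feed.open('http://www.slashdot.org/index.rss')
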
@@ -236,6 +267,9 @@ module FeedTools
  cache_class = eval(class_name)
  if cache_class.kind_of?(Class)
  @feed_cache = cache_class
+ if @feed_cache.respond_to? :initialize_cache
+ @feed_cache.initialize_cache
+ end
  return cache_class
  else
  return nil
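
Because the cache class is resolved from a configured class name and now gets an optional initialize_cache call, a custom cache can hook its own setup in here. A bare-bones, hypothetical sketch showing only the hook added in this hunk; the storage and lookup methods a real cache needs are not part of this diff:

  class InMemoryFeedCache             # hypothetical example class
    def self.initialize_cache
      @store ||= {}                   # invoked once when FeedTools resolves the cache
    end
    # ... retrieval/storage interface omitted; see FeedTools::DatabaseFeedCache
  end

  FeedTools.configurations[:feed_cache] = "InMemoryFeedCache"
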
@@ -258,376 +292,7 @@ module FeedTools
  rescue
  return false
  end
- end
-
- # Returns true if the html tidy module can be used.
- #
- # Obviously, you need the tidy gem installed in order to run with html
- # tidy features turned on.
- #
- # This method does a fairly complicated, and probably unnecessarily
- # desperate search for the libtidy library. If you want this thing to
- # execute fast, the best thing to do is to set Tidy.path ahead of time.
- # If Tidy.path is set, this method doesn't do much. If it's not set,
- # it will do it's darnedest to find the libtidy library. If you set
- # the LIBTIDYPATH environment variable to the libtidy library, it should
- # be able to find it.
- #
- # Once the library is located, this method will run much faster.
- def FeedTools.tidy_enabled?
- # This is an override variable to keep tidy from being used even if it
- # is available.
- if FeedTools.configurations[:tidy_enabled] == false
- return false
- end
- if @tidy_enabled.nil? || @tidy_enabled == false
- @tidy_enabled = false
- begin
- require 'tidy'
- if Tidy.path.nil?
- # *Shrug*, just brute force it, I guess. There's a lot of places
- # this thing might be hiding in, depending on platform and general
- # sanity of the person who installed the thing. Most of these are
- # probably unlikely, but it's not like checking unlikely locations
- # hurts. Much. Especially if you actually find it.
- libtidy_locations = [
- '/usr/local/lib/libtidy.dylib',
- '/opt/local/lib/libtidy.dylib',
- '/usr/lib/libtidy.dylib',
- '/usr/local/lib/tidylib.dylib',
- '/opt/local/lib/tidylib.dylib',
- '/usr/lib/tidylib.dylib',
- '/usr/local/lib/tidy.dylib',
- '/opt/local/lib/tidy.dylib',
- '/usr/lib/tidy.dylib',
- '/usr/local/lib/libtidy.so',
- '/opt/local/lib/libtidy.so',
- '/usr/lib/libtidy.so',
- '/usr/local/lib/tidylib.so',
- '/opt/local/lib/tidylib.so',
- '/usr/lib/tidylib.so',
- '/usr/local/lib/tidy.so',
- '/opt/local/lib/tidy.so',
- '/usr/lib/tidy.so',
- 'C:\Program Files\Tidy\tidy.dll',
- 'C:\Tidy\tidy.dll',
- 'C:\Ruby\bin\tidy.dll',
- 'C:\Ruby\tidy.dll',
- '/usr/local/lib',
- '/opt/local/lib',
- '/usr/lib'
- ]
- # We just made this thing up, but if someone sets it, we'll
- # go ahead and check it
- unless ENV['LIBTIDYPATH'].nil?
- libtidy_locations =
- libtidy_locations.reverse.push(ENV['LIBTIDYPATH'])
- end
- for path in libtidy_locations
- if File.exists? path
- if File.ftype(path) == "file"
- Tidy.path = path
- @tidy_enabled = true
- break
- elsif File.ftype(path) == "directory"
- # Ok, now perhaps we're getting a bit more desperate
- lib_paths =
- `find #{path} -name '*tidy*' | grep '\\.\\(so\\|dylib\\)$'`
- # If there's more than one, grab the first one and
- # hope for the best, and if it doesn't work, then blame the
- # user for not specifying more accurately.
- tidy_path = lib_paths.split("\n").first
- unless tidy_path.nil?
- Tidy.path = tidy_path
- @tidy_enabled = true
- break
- end
- end
- end
- end
- # Still couldn't find it.
- unless @tidy_enabled
- @tidy_enabled = false
- end
- else
- @tidy_enabled = true
- end
- rescue LoadError
- # Tidy not installed, disable features that rely on tidy.
- @tidy_enabled = false
- end
- end
- return @tidy_enabled
- end
-
- # Attempts to ensures that the passed url is valid and sane. Accepts very, very ugly urls
- # and makes every effort to figure out what it was supposed to be. Also translates from
- # the feed: and rss: pseudo-protocols to the http: protocol.
- def FeedTools.normalize_url(url)
- if url.nil? || url == ""
- return nil
- end
- normalized_url = url.strip
-
- # if a url begins with the '/' character, it only makes sense that they
- # meant to be using a file:// url. Fix it for them.
- if normalized_url.length > 0 && normalized_url[0..0] == "/"
- normalized_url = "file://" + normalized_url
- end
-
- # if a url begins with a drive letter followed by a colon, we're looking at
- # a file:// url. Fix it for them.
- if normalized_url.length > 0 &&
- normalized_url.scan(/^[a-zA-Z]:[\\\/]/).size > 0
- normalized_url = "file:///" + normalized_url
- end
-
- # if a url begins with javascript:, it's quite possibly an attempt at
- # doing something malicious. Let's keep that from getting anywhere,
- # shall we?
- if (normalized_url.downcase =~ /javascript:/) != nil
- return "#"
- end
-
- # deal with all of the many ugly possibilities involved in the rss:
- # and feed: pseudo-protocols (incidentally, whose crazy idea was this
- # mess?)
- normalized_url.gsub!(/^http:\/*(feed:\/*)?/, "http://")
- normalized_url.gsub!(/^http:\/*(rss:\/*)?/, "http://")
- normalized_url.gsub!(/^feed:\/*(http:\/*)?/, "http://")
- normalized_url.gsub!(/^rss:\/*(http:\/*)?/, "http://")
- normalized_url.gsub!(/^file:\/*/, "file:///")
- normalized_url.gsub!(/^https:\/*/, "https://")
- # fix (very) bad urls (usually of the user-entered sort)
- normalized_url.gsub!(/^http:\/*(http:\/*)*/, "http://")
-
- if (normalized_url =~ /^file:/) == 0
- # Adjust windows-style urls
- normalized_url.gsub!(/^file:\/\/\/([a-zA-Z])\|/, 'file:///\1:')
- normalized_url.gsub!(/\\/, '/')
- else
- if (normalized_url =~ /https?:\/\//) == nil
- normalized_url = "http://" + normalized_url
- end
- if normalized_url == "http://"
- return nil
- end
- begin
- feed_uri = URI.parse(normalized_url)
- if feed_uri.scheme == nil
- feed_uri.scheme = "http"
- end
- if feed_uri.path == nil || feed_uri.path == ""
- feed_uri.path = "/"
- end
- if (feed_uri.path =~ /^[\/]+/) == 0
- feed_uri.path.gsub!(/^[\/]+/, "/")
- end
- feed_uri.host.downcase!
- normalized_url = feed_uri.to_s
- rescue URI::InvalidURIError
- end
- end
-
- # We can't do a proper set of escaping, so this will
- # have to do.
- normalized_url.gsub!(/%20/, " ")
- normalized_url.gsub!(/ /, "%20")
-
- return normalized_url
- end
-
- # Converts a url into a tag uri
- def FeedTools.build_tag_uri(url, date)
- unless url.kind_of? String
- raise ArgumentError, "Expected String, got #{url.class.name}"
- end
- unless date.kind_of? Time
- raise ArgumentError, "Expected Time, got #{date.class.name}"
- end
- tag_uri = normalize_url(url)
- unless FeedTools.is_uri?(tag_uri)
- raise ArgumentError, "Must supply a valid URL."
- end
- host = URI.parse(tag_uri).host
- tag_uri.gsub!(/^(http|ftp|file):\/*/, "")
- tag_uri.gsub!(/#/, "/")
- tag_uri = "tag:#{host},#{date.strftime('%Y-%m-%d')}:" +
- "#{tag_uri[(tag_uri.index(host) + host.size)..-1]}"
- return tag_uri
- end
-
- # Converts a url into a urn:uuid: uri
- def FeedTools.build_urn_uri(url)
- unless url.kind_of? String
- raise ArgumentError, "Expected String, got #{url.class.name}"
- end
- normalized_url = normalize_url(url)
- require 'uuidtools'
- return UUID.sha1_create(UUID_URL_NAMESPACE, normalized_url).to_uri_string
- end
-
- # Returns true if the parameter appears to be a valid uri
- def FeedTools.is_uri?(url)
- return false if url.nil?
- begin
- uri = URI.parse(url)
- if uri.scheme.nil? || uri.scheme == ""
- return false
- end
- rescue URI::InvalidURIError
- return false
- end
- return true
- end
-
- # Escapes all html entities
- def FeedTools.escape_entities(html)
- return nil if html.nil?
- escaped_html = CGI.escapeHTML(html)
- escaped_html.gsub!(/'/, "&apos;")
- escaped_html.gsub!(/"/, "&quot;")
- return escaped_html
- end
-
- # Unescapes all html entities
- def FeedTools.unescape_entities(html)
- return nil if html.nil?
- unescaped_html = html
- unescaped_html.gsub!(/&#x26;/, "&amp;")
- unescaped_html.gsub!(/&#38;/, "&amp;")
- unescaped_html = CGI.unescapeHTML(unescaped_html)
- unescaped_html.gsub!(/&apos;/, "'")
- unescaped_html.gsub!(/&quot;/, "\"")
- return unescaped_html
- end
-
- # Removes all html tags from the html formatted text.
- def FeedTools.strip_html(html)
- return nil if html.nil?
- # TODO: do this properly
- # ======================
- stripped_html = html.gsub(/<\/?[^>]+>/, "")
- return stripped_html
- end
-
- # Tidys up the html
- def FeedTools.tidy_html(html, options = {})
- return nil if html.nil?
- if FeedTools.tidy_enabled?
- is_fragment = true
- html.gsub!(/&lt;!'/, "&amp;lt;!'")
- if (html.strip =~ /<html>(.|\n)*<body>/) != nil ||
- (html.strip =~ /<\/body>(.|\n)*<\/html>$/) != nil
- is_fragment = false
- end
- if (html.strip =~ /<\?xml(.|\n)*\?>/) != nil
- is_fragment = false
- end
- tidy_html = Tidy.open(:show_warnings=>false) do |tidy|
- tidy.options.output_xml = true
- tidy.options.numeric_entities = true
- tidy.options.markup = true
- tidy.options.indent = false
- tidy.options.wrap = 0
- tidy.options.logical_emphasis = true
- # TODO: Make this match the actual encoding of the feed
- # =====================================================
- tidy.options.input_encoding = "utf8"
- tidy.options.output_encoding = "ascii"
- tidy.options.ascii_chars = false
- tidy.options.doctype = "omit"
- xml = tidy.clean(html)
- xml
- end
- if is_fragment
- # Tidy sticks <html>...<body>[our html]</body>...</html> in.
- # We don't want this.
- tidy_html.strip!
- tidy_html.gsub!(/^<html>(.|\n)*<body>/, "")
- tidy_html.gsub!(/<\/body>(.|\n)*<\/html>$/, "")
- tidy_html.strip!
- end
- tidy_html.gsub!(/&#x26;/, "&amp;")
- tidy_html.gsub!(/&#38;/, "&amp;")
- tidy_html.gsub!(/\320\262\320\202\342\204\242/, "\342\200\231")
-
- else
- tidy_html = html
- end
- if tidy_html.blank? && !html.blank?
- tidy_html = html.strip
- end
- return tidy_html
- end
-
- # Removes all dangerous html tags from the html formatted text.
- # If mode is set to :escape, dangerous and unknown elements will
- # be escaped. If mode is set to :strip, dangerous and unknown
- # elements and all children will be removed entirely.
- # Dangerous or unknown attributes are always removed.
- def FeedTools.sanitize_html(html, mode=:strip)
- return nil if html.nil?
-
- # Lists borrowed from Mark Pilgrim's feedparser
- acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
- 'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
- 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl',
- 'dt', 'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4',
- 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend',
- 'li', 'map', 'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's',
- 'samp', 'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup',
- 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt',
- 'u', 'ul', 'var']
-
- acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
- 'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
- 'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
- 'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
- 'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
- 'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
- 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
- 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
- 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
- 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
- 'type', 'usemap', 'valign', 'value', 'vspace', 'width']
-
- # Replace with appropriate named entities
- html.gsub!(/&#x26;/, "&amp;")
- html.gsub!(/&#38;/, "&amp;")
- html.gsub!(/&lt;!'/, "&amp;lt;!'")
-
- # Hackity hack. But it works, and it seems plenty fast enough.
- html_doc = HTree.parse_xml("<root>" + html + "</root>").to_rexml
-
- sanitize_node = lambda do |html_node|
- if html_node.respond_to? :children
- for child in html_node.children
- if child.kind_of? REXML::Element
- unless acceptable_elements.include? child.name.downcase
- if mode == :strip
- html_node.delete_element(child)
- else
- new_child = REXML::Text.new(CGI.escapeHTML(child.to_s))
- html_node.insert_after(child, new_child)
- html_node.delete_element(child)
- end
- end
- for attribute in child.attributes.keys
- unless acceptable_attributes.include? attribute.downcase
- child.delete_attribute(attribute)
- end
- end
- end
- sanitize_node.call(child)
- end
- end
- html_node
- end
- sanitize_node.call(html_doc.root)
- html = html_doc.root.inner_xml
- return html
- end
+ end

  # Creates a merged "planet" feed from a set of urls.
  #
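
All of the URL, tag-URI, entity, tidy, and sanitization helpers above were removed from feed_tools.rb in this release; given the changelog's "reorganized some of the helper modules" and the new requires of feed_tools/helpers/html_helper, xml_helper, and uri_helper, that logic presumably now lives there. For reference, this is what the removed 0.2.22 methods did, traced directly from the code above:

  # 0.2.22 behaviour, traced from the removed implementation:
  FeedTools.normalize_url("feed://example.com/index.rss")
  # => "http://example.com/index.rss"   (feed:/rss: pseudo-protocols become http:)

  FeedTools.build_tag_uri("http://Example.COM/feed#latest", Time.utc(2005, 9, 1))
  # => "tag:example.com,2005-09-01:/feed/latest"
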
@@ -637,7 +302,7 @@ module FeedTools
  # in conjunction with the DatabaseFeedCache as it will
  # open multiple connections to the database.
  def FeedTools.build_merged_feed(url_array, options = {})
- validate_options([ :multi_threaded ],
+ FeedTools::GenericHelper.validate_options([ :multi_threaded ],
  options.keys)
  options = { :multi_threaded => false }.merge(options)
  return nil if url_array.nil?
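
The only change to build_merged_feed is that validate_options is now called through FeedTools::GenericHelper, matching the helper reorganization. For context, a usage sketch based on the signature shown here; the URLs are placeholders, and the comment above cautions against :multi_threaded together with the DatabaseFeedCache:

  merged = FeedTools.build_merged_feed([
    "http://www.slashdot.org/index.rss",
    "http://example.com/atom.xml"          # placeholder
  ], :multi_threaded => false)
  # The merged "planet" feed's exact return type isn't shown in this diff.
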
@@ -930,19 +595,37 @@ module REXML # :nodoc:
  result << child.to_s
  end
  end
- return result
+ return result.strip
  end
+ else
+ warn("inner_xml method already exists.")
  end

- unless REXML::Element.public_instance_methods.include? :base_uri
- def base_uri # :nodoc:
- if not attribute('xml:base')
+ def base_uri # :nodoc:
+ begin
+ base_attribute = FeedTools::XmlHelper.try_xpaths(self, [
+ '@xml:base'
+ ])
+ if parent == nil || parent.kind_of?(REXML::Document)
+ return nil if base_attribute == nil
+ return base_attribute.value
+ end
+ if base_attribute != nil && parent == nil
+ return base_attribute.value
+ elsif parent != nil && base_attribute == nil
  return parent.base_uri
- elsif parent
- return URI.join(parent.base_uri, attribute('xml:base').value).to_s
- else
- return (attribute('xml:base').value or '')
+ elsif parent != nil && base_attribute != nil
+ parent_base_uri = parent.base_uri
+ if parent_base_uri != nil
+ uri = URI.parse(parent_base_uri)
+ return (uri + base_attribute.value).to_s
+ else
+ return base_attribute.value
+ end
  end
+ return nil
+ rescue
+ return nil
  end
  end
  end
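
The rewritten base_uri walks up the element tree, joining each xml:base value against the parent's resolved base; this is what backs the changelog's "now resolves relative urls when possible". A small illustration of the resolution logic above, assuming feed_tools has been loaded so that FeedTools::XmlHelper is available; the URLs are placeholders:

  require 'feed_tools'
  require 'rexml/document'

  doc = REXML::Document.new(
    '<feed xml:base="http://example.com/feeds/">' +
    '<entry xml:base="2005/"/></feed>')

  entry = doc.root.elements[1]
  entry.base_uri
  # => "http://example.com/feeds/2005/"  (parent base joined with the relative value)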