hpricot_scrub 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,9 @@
1
+ 2007-04-05 Michael <michael@underpantsgnome.com>
2
+ Release 0.2.3
3
+ Add patches from Eric Wong
4
+ - Recursive scrubbing wasn't scrubbing child elements if the parent was in allow_tags
5
+ - Add optional use of HTMLEntities
6
+
1
7
  2007-03-04 Michael <michael@underpantsgnome.com>
2
8
  Release 0.2.2
3
9
  Add patches from Eric Wong
@@ -6,10 +12,10 @@
6
12
 
7
13
  2007-03-04 Michael <michael@underpantsgnome.com>
8
14
  Release 0.2.0
9
- Add String methods for scrub and scrub!
15
+ - Add String methods for scrub and scrub!
10
16
 
11
- Fixed a bug where nested elements were not being scrubbed when using a
12
- config hash
17
+ - Fixed a bug where nested elements were not being scrubbed when using a
18
+ config hash
13
19
 
14
20
  2007-03-03 Michael <michael@underpantsgnome.com>
15
- Release 0.1.0, Initial Gem version
21
+ - Release 0.1.0, Initial Gem version
@@ -1,47 +1,49 @@
1
-
2
1
  ---
3
- :allow_tags: # let these tags stay, but will strip attributes
4
- - 'b'
5
- - 'blockquote'
6
- - 'br'
7
- - 'div'
8
- - 'h1'
9
- - 'h2'
10
- - 'h3'
11
- - 'h4'
12
- - 'h5'
13
- - 'h6'
14
- - 'hr'
15
- - 'i'
16
- - 'em'
17
- - 'img'
18
- - 'li'
19
- - 'ol'
20
- - 'p'
21
- - 'pre'
22
- - 'small'
23
- - 'span'
24
- - 'span'
25
- - 'strike'
26
- - 'strong'
27
- - 'sub'
28
- - 'sup'
29
- - 'table'
30
- - 'tbody'
31
- - 'td'
32
- - 'tfoot'
33
- - 'thead'
34
- - 'tr'
35
- - 'u'
36
- - 'ul'
2
+ :allow_tags: # let these tags stay, but will strip attributes
3
+ - 'html'
4
+ - 'head'
5
+ - 'body'
6
+ - 'b'
7
+ - 'blockquote'
8
+ - 'br'
9
+ - 'div'
10
+ - 'h1'
11
+ - 'h2'
12
+ - 'h3'
13
+ - 'h4'
14
+ - 'h5'
15
+ - 'h6'
16
+ - 'hr'
17
+ - 'i'
18
+ - 'em'
19
+ - 'img'
20
+ - 'li'
21
+ - 'ol'
22
+ - 'p'
23
+ - 'pre'
24
+ - 'small'
25
+ - 'span'
26
+ - 'span'
27
+ - 'strike'
28
+ - 'strong'
29
+ - 'sub'
30
+ - 'sup'
31
+ - 'table'
32
+ - 'tbody'
33
+ - 'td'
34
+ - 'tfoot'
35
+ - 'thead'
36
+ - 'tr'
37
+ - 'u'
38
+ - 'ul'
37
39
 
38
- :remove_tags: # completely removes everything between open and close tag
39
- - 'form'
40
- - 'script'
41
-
42
- :allow_attributes: # let these attributes stay, strip all others
43
- - 'src'
44
- - 'font'
45
- - 'alt'
46
- - 'style'
47
- - 'align'
40
+ :remove_tags: # completely removes everything between open and close tag
41
+ - 'form'
42
+ - 'script'
43
+
44
+ :allow_attributes: # let these attributes stay, strip all others
45
+ - 'src'
46
+ - 'font'
47
+ - 'alt'
48
+ - 'style'
49
+ - 'align'
@@ -10,9 +10,12 @@ require 'hpricot'
10
10
 
11
11
  module Hpricot
12
12
  module Scrubable
13
+ # TODO: figure out how to handle HTML comments (Hpricot::Comment nodes are currently skipped, not scrubbed)
13
14
  def scrubable?
14
- ! [Hpricot::Text, Hpricot::BogusETag].include?(self.class) &&
15
- self.respond_to?(:scrub)
15
+ ! [ Hpricot::Text,
16
+ Hpricot::BogusETag,
17
+ Hpricot::Comment
18
+ ].include?(self.class) && self.respond_to?(:scrub)
16
19
  end
17
20
  end
18
21
 
@@ -34,9 +37,7 @@ module Hpricot
34
37
  include Scrubable
35
38
 
36
39
  def scrub(config)
37
- children.reverse.each { |c|
38
- c.scrub(config) if c.scrubable? && ! config[:allow_tags].include?(c.name)
39
- }
40
+ children.reverse.each { |c| c.scrub(config) if c.scrubable? }
40
41
  strip unless config[:allow_tags].include?(name)
41
42
  end
42
43
 
@@ -61,7 +62,7 @@ module Hpricot
61
62
  end
62
63
 
63
64
  def strip_removes?
64
- # I'm sure there are others that shuould be ripped instead of stripped
65
+ # TODO: find other elements that should be removed instead of stripped
65
66
  attributes && attributes['type'] =~ /script|css/
66
67
  end
67
68
  end
@@ -93,3 +94,26 @@ class String
93
94
  dup.scrub!
94
95
  end
95
96
  end
97
+
98
+ begin
99
+ require 'htmlentities'
100
+
101
+ module Hpricot
102
+ class Scrub
103
+ @coder = HTMLEntities.new
104
+ class << self
105
+ def entifier; @coder end
106
+ end
107
+ end
108
+ end
109
+
110
+ class String
111
+ def decode!
112
+ self.gsub!(/^(\n|.)*$/, Hpricot::Scrub.entifier.decode(self))
113
+ end
114
+
115
+ def decode
116
+ dup.decode!
117
+ end
118
+ end
119
+ rescue LoadError; end
@@ -2,7 +2,7 @@ module HpricotScrub #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 2
5
- TINY = 2
5
+ TINY = 3
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
@@ -6,6 +6,17 @@ class HpricotScrubTest < Test::Unit::TestCase
6
6
  def setup
7
7
  @clean = Hpricot(MARKUP).scrub.inner_html
8
8
  @config = YAML.load_file('examples/config.yml')
9
+
10
+ # add some tags that most users will probably want
11
+ @config_full = @config.dup
12
+ %w(body head html).each { |x| @config_full[:allow_tags].push(x) }
13
+ end
14
+
15
+ def test_full_markup_partial_scrub
16
+ full = Hpricot(MARKUP)
17
+ full_markup = '<html><head></head><body>' + MARKUP + '</body></html>'
18
+ doc = Hpricot(full_markup).scrub(@config_full)
19
+ partial_scrub_common(doc, full)
9
20
  end
10
21
 
11
22
  def test_full_scrub
@@ -22,12 +33,16 @@ class HpricotScrubTest < Test::Unit::TestCase
22
33
  def test_partial_scrub
23
34
  full = Hpricot(MARKUP)
24
35
  doc = Hpricot(MARKUP).scrub(@config)
25
- # using the divisor search throws warnings in test
36
+ partial_scrub_common(doc, full)
37
+ end
38
+
39
+ def test_full_doc
40
+ doc = Hpricot(GOOGLE).scrub
26
41
  assert_tag_count(doc, 'a', 0)
27
- assert_tag_count(doc, 'p', full.search('//p').size)
28
- assert_tag_count(doc, 'div', full.search('//div').size)
29
- assert_tag_count(doc, 'img', full.search('//img').size)
30
- assert_tag_count(doc, 'br', full.search('//br').size)
42
+ assert_tag_count(doc, 'p', 0)
43
+ assert_tag_count(doc, 'img', 0)
44
+ assert_tag_count(doc, 'br', 0)
45
+ assert_tag_count(doc, 'div', 0)
31
46
  assert_tag_count(doc, 'script', 0)
32
47
  end
33
48
 
@@ -36,10 +51,34 @@ class HpricotScrubTest < Test::Unit::TestCase
36
51
  assert formatted.scrub == @clean
37
52
  assert formatted == MARKUP
38
53
  end
39
-
54
+
40
55
  def test_string_scrub!
41
56
  formatted = MARKUP
42
57
  assert formatted.scrub! == @clean
43
58
  assert formatted == @clean
44
59
  end
60
+
61
+ def test_decoder
62
+ str = 'some <a href="http://example.com/">example&nbsp;link</a> to nowhere'
63
+ scrubbed_str = str.scrub
64
+ assert scrubbed_str.include?('&nbsp;')
65
+
66
+ if defined?(HTMLEntities)
67
+ assert ! scrubbed_str.decode.include?('&nbsp;')
68
+
69
+ scrubbed_str.decode!
70
+ assert ! scrubbed_str.include?('&nbsp;')
71
+ end
72
+ end
73
+
74
+ private
75
+ def partial_scrub_common(doc, full)
76
+ # using the divisor (/) search throws warnings in test, so call #search explicitly
77
+ assert_tag_count(doc, 'a', 0)
78
+ assert_tag_count(doc, 'p', full.search('//p').size)
79
+ assert_tag_count(doc, 'div', full.search('//div').size)
80
+ assert_tag_count(doc, 'img', full.search('//img').size)
81
+ assert_tag_count(doc, 'br', full.search('//br').size)
82
+ assert_tag_count(doc, 'script', 0)
83
+ end
45
84
  end
@@ -12,3 +12,23 @@ alert("gotcha");</script><img src="http://content.example.com/content/3587a2f6ee
12
12
  <span>some random unclosed span
13
13
  <style type="text/css">.foo {color:blue}</style>
14
14
  EOS
15
+
16
+ GOOGLE = <<-EOS
17
+ <html><head><meta http-equiv="content-type" content="text/html; charset=ISO-8859-1"><title>Google</title><style><!--
18
+ body,td,a,p,.h{font-family:arial,sans-serif}
19
+ .h{font-size:20px}
20
+ .h{color:#3366cc}
21
+ .q{color:#00c}
22
+ --></style>
23
+ <script>
24
+ <!--
25
+ function sf(){document.f.q.focus();}
26
+ // -->
27
+ </script>
28
+ </head><body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onload="sf();if(document.images){new Image().src='/images/nav_logo2.png'}" topmargin=3 marginheight=3><center><div align=right nowrap style="padding-bottom:4px" width=100%><font size=-1><a href="/url?sa=p&pref=ig&pval=3&q=http://www.google.com/ig%3Fhl%3Den&usg=__yvmOvIrk79QYmDkrJAeuYO8jTmo=">Personalize this page</a>&nbsp;|&nbsp;<a href="https://www.google.com/accounts/Login?continue=http://www.google.com/&hl=en">Sign in</a></font></div><img alt="Google" height=110 src="/intl/en_ALL/images/logo.gif" width=276><br><br><form action="/search" name=f><script defer><!--
29
+ function togDisp(e){stopB(e);var elems=document.getElementsByName('more');for(var i=0;i<elems.length;i++){var obj=elems[i],dp="";if(obj.style.display==""){dp="none";}obj.style.display=dp;}return false;}
30
+ function stopB(e){if(!e)e=window.event;e.cancelBubble=true;}
31
+ document.onclick=function(event){var elems=document.getElementsByName('more');if(elems[0].style.display==""){togDisp(event);}}
32
+ //-->
33
+ </script><table border=0 cellspacing=0 cellpadding=4><tr><td nowrap><font size=-1><b>Web</b>&nbsp;&nbsp;&nbsp;&nbsp;<a class=q href="http://images.google.com/imghp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wi">Images</a>&nbsp;&nbsp;&nbsp;&nbsp;<a class=q href="http://video.google.com/?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wv">Video</a>&nbsp;&nbsp;&nbsp;&nbsp;<a class=q href="http://news.google.com/nwshp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wn">News</a>&nbsp;&nbsp;&nbsp;&nbsp;<a class=q href="http://maps.google.com/maps?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wl">Maps</a>&nbsp;&nbsp;&nbsp;&nbsp;<b><a href="/intl/en/options/" class=q onclick="this.blur();return togDisp(event)">more&nbsp;&raquo;</a></b><span name=more id=more style="display:none;position:absolute;background:#fff;border:1px solid #369;margin:-.5ex 2ex;padding:0 0 .5ex .8ex;width:16ex;line-height:1.9;z-index:1000" onclick="stopB(event)"><a href=# onclick="return togDisp(event)"><img border=0 src=/images/x2.gif width=12 height=12 alt="Close menu" align=right hspace=4 vspace=4></a><a class=q href="http://blogsearch.google.com/?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wb">Blogs</a><br><a class=q href="http://books.google.com/bkshp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wp">Books</a><br><a class=q href="http://froogle.google.com/frghp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wf">Froogle</a><br><a class=q href="http://groups.google.com/grphp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wg">Groups</a><br><a class=q href="http://www.google.com/ptshp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wt">Patents</a><br><a href="/intl/en/options/" class=q><b>even more &raquo;</b></a></span></font></td></tr></table><table cellpadding=0 cellspacing=0><tr valign=top><td width=25%>&nbsp;</td><td align=center nowrap><input name=hl type=hidden value=en><input type=hidden name=ie value="ISO-8859-1"><input maxlength=2048 name=q size=55 title="Google Search" value=""><br><input name=btnG type=submit value="Google Search"><input name=btnI type=submit 
value="I'm Feeling Lucky"></td><td nowrap width=25%><font size=-2>&nbsp;&nbsp;<a href=/advanced_search?hl=en>Advanced Search</a><br>&nbsp;&nbsp;<a href=/preferences?hl=en>Preferences</a><br>&nbsp;&nbsp;<a href=/language_tools?hl=en>Language Tools</a></font></td></tr></table></form><br><br><font size=-1><a href="/intl/en/ads/">Advertising&nbsp;Programs</a> - <a href="/services/">Business Solutions</a> - <a href=/intl/en/about.html>About Google</a></font><p><font size=-2>&copy;2007 Google</font></p></center></body></html>
34
+ EOS
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.1
2
+ rubygems_version: 0.9.2.1
3
3
  specification_version: 1
4
4
  name: hpricot_scrub
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.2
7
- date: 2007-03-13 00:00:00 -07:00
6
+ version: 0.2.3
7
+ date: 2007-04-05 00:00:00 -07:00
8
8
  summary: Scrub HTML with Hpricot
9
9
  require_paths:
10
10
  - lib