hpricot_scrub 0.2.2 → 0.2.3

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,3 +1,9 @@
1
+ 2007-04-05 Michael <michael@underpantsgnome.com>
2
+ Release 0.2.3
3
+ Add patches from Eric Wong
4
+ - Recursive scrubbing wasn't scrubbing if parent was in allow
5
+ - Add optional use of HTMLEntities
6
+
1
7
  2007-03-04 Michael <michael@underpantsgnome.com>
2
8
  Release 0.2.2
3
9
  Add patches from Eric Wong
@@ -6,10 +12,10 @@
6
12
 
7
13
  2007-03-04 Michael <michael@underpantsgnome.com>
8
14
  Release 0.2.0
9
- Add String methods for scrub and scrub!
15
+ - Add String methods for scrub and scrub!
10
16
 
11
- Fixed a bug where nested elements were not being scrubbed when using a
12
- config hash
17
+ - Fixed a bug where nested elements were not being scrubbed when using a
18
+ config hash
13
19
 
14
20
  2007-03-03 Michael <michael@underpantsgnome.com>
15
- Release 0.1.0, Initial Gem version
21
+ - Release 0.1.0, Initial Gem version
@@ -1,47 +1,49 @@
1
-
2
1
  ---
3
- :allow_tags: # let these tags stay, but will strip attributes
4
- - 'b'
5
- - 'blockquote'
6
- - 'br'
7
- - 'div'
8
- - 'h1'
9
- - 'h2'
10
- - 'h3'
11
- - 'h4'
12
- - 'h5'
13
- - 'h6'
14
- - 'hr'
15
- - 'i'
16
- - 'em'
17
- - 'img'
18
- - 'li'
19
- - 'ol'
20
- - 'p'
21
- - 'pre'
22
- - 'small'
23
- - 'span'
24
- - 'span'
25
- - 'strike'
26
- - 'strong'
27
- - 'sub'
28
- - 'sup'
29
- - 'table'
30
- - 'tbody'
31
- - 'td'
32
- - 'tfoot'
33
- - 'thead'
34
- - 'tr'
35
- - 'u'
36
- - 'ul'
2
+ :allow_tags: # let these tags stay, but will strip attributes
3
+ - 'html'
4
+ - 'head'
5
+ - 'body'
6
+ - 'b'
7
+ - 'blockquote'
8
+ - 'br'
9
+ - 'div'
10
+ - 'h1'
11
+ - 'h2'
12
+ - 'h3'
13
+ - 'h4'
14
+ - 'h5'
15
+ - 'h6'
16
+ - 'hr'
17
+ - 'i'
18
+ - 'em'
19
+ - 'img'
20
+ - 'li'
21
+ - 'ol'
22
+ - 'p'
23
+ - 'pre'
24
+ - 'small'
25
+ - 'span'
26
+ - 'span'
27
+ - 'strike'
28
+ - 'strong'
29
+ - 'sub'
30
+ - 'sup'
31
+ - 'table'
32
+ - 'tbody'
33
+ - 'td'
34
+ - 'tfoot'
35
+ - 'thead'
36
+ - 'tr'
37
+ - 'u'
38
+ - 'ul'
37
39
 
38
- :remove_tags: # completely removes everything between open and close tag
39
- - 'form'
40
- - 'script'
41
-
42
- :allow_attributes: # let these attributes stay, strip all others
43
- - 'src'
44
- - 'font'
45
- - 'alt'
46
- - 'style'
47
- - 'align'
40
+ :remove_tags: # completely removes everything between open and close tag
41
+ - 'form'
42
+ - 'script'
43
+
44
+ :allow_attributes: # let these attributes stay, strip all others
45
+ - 'src'
46
+ - 'font'
47
+ - 'alt'
48
+ - 'style'
49
+ - 'align'
@@ -10,9 +10,12 @@ require 'hpricot'
10
10
 
11
11
  module Hpricot
12
12
  module Scrubable
13
+ # TODO: figure out how to handle comments
13
14
  def scrubable?
14
- ! [Hpricot::Text, Hpricot::BogusETag].include?(self.class) &&
15
- self.respond_to?(:scrub)
15
+ ! [ Hpricot::Text,
16
+ Hpricot::BogusETag,
17
+ Hpricot::Comment
18
+ ].include?(self.class) && self.respond_to?(:scrub)
16
19
  end
17
20
  end
18
21
 
@@ -34,9 +37,7 @@ module Hpricot
34
37
  include Scrubable
35
38
 
36
39
  def scrub(config)
37
- children.reverse.each { |c|
38
- c.scrub(config) if c.scrubable? && ! config[:allow_tags].include?(c.name)
39
- }
40
+ children.reverse.each { |c| c.scrub(config) if c.scrubable? }
40
41
  strip unless config[:allow_tags].include?(name)
41
42
  end
42
43
 
@@ -61,7 +62,7 @@ module Hpricot
61
62
  end
62
63
 
63
64
  def strip_removes?
64
- # I'm sure there are others that shuould be ripped instead of stripped
65
+ # TODO: find other elements that should be removed instead of stripped
65
66
  attributes && attributes['type'] =~ /script|css/
66
67
  end
67
68
  end
@@ -93,3 +94,26 @@ class String
93
94
  dup.scrub!
94
95
  end
95
96
  end
97
+
98
+ begin
99
+ require 'htmlentities'
100
+
101
+ module Hpricot
102
+ class Scrub
103
+ @coder = HTMLEntities.new
104
+ class << self
105
+ def entifier; @coder end
106
+ end
107
+ end
108
+ end
109
+
110
+ class String
111
+ def decode!
112
+ self.gsub!(/^(\n|.)*$/, Hpricot::Scrub.entifier.decode(self))
113
+ end
114
+
115
+ def decode
116
+ dup.decode!
117
+ end
118
+ end
119
+ rescue LoadError; end
@@ -2,7 +2,7 @@ module HpricotScrub #:nodoc:
2
2
  module VERSION #:nodoc:
3
3
  MAJOR = 0
4
4
  MINOR = 2
5
- TINY = 2
5
+ TINY = 3
6
6
 
7
7
  STRING = [MAJOR, MINOR, TINY].join('.')
8
8
  end
@@ -6,6 +6,17 @@ class HpricotScrubTest < Test::Unit::TestCase
6
6
  def setup
7
7
  @clean = Hpricot(MARKUP).scrub.inner_html
8
8
  @config = YAML.load_file('examples/config.yml')
9
+
10
+ # add some tags that most users will probably want
11
+ @config_full = @config.dup
12
+ %w(body head html).each { |x| @config_full[:allow_tags].push(x) }
13
+ end
14
+
15
+ def test_full_markup_partial_scrub
16
+ full = Hpricot(MARKUP)
17
+ full_markup = '<html><head></head><body>' + MARKUP + '</body></html>'
18
+ doc = Hpricot(full_markup).scrub(@config_full)
19
+ partial_scrub_common(doc, full)
9
20
  end
10
21
 
11
22
  def test_full_scrub
@@ -22,12 +33,16 @@ class HpricotScrubTest < Test::Unit::TestCase
22
33
  def test_partial_scrub
23
34
  full = Hpricot(MARKUP)
24
35
  doc = Hpricot(MARKUP).scrub(@config)
25
- # using the divisor search throws warnings in test
36
+ partial_scrub_common(doc, full)
37
+ end
38
+
39
+ def test_full_doc
40
+ doc = Hpricot(GOOGLE).scrub
26
41
  assert_tag_count(doc, 'a', 0)
27
- assert_tag_count(doc, 'p', full.search('//p').size)
28
- assert_tag_count(doc, 'div', full.search('//div').size)
29
- assert_tag_count(doc, 'img', full.search('//img').size)
30
- assert_tag_count(doc, 'br', full.search('//br').size)
42
+ assert_tag_count(doc, 'p', 0)
43
+ assert_tag_count(doc, 'img', 0)
44
+ assert_tag_count(doc, 'br', 0)
45
+ assert_tag_count(doc, 'div', 0)
31
46
  assert_tag_count(doc, 'script', 0)
32
47
  end
33
48
 
@@ -36,10 +51,34 @@ class HpricotScrubTest < Test::Unit::TestCase
36
51
  assert formatted.scrub == @clean
37
52
  assert formatted == MARKUP
38
53
  end
39
-
54
+
40
55
  def test_string_scrub!
41
56
  formatted = MARKUP
42
57
  assert formatted.scrub! == @clean
43
58
  assert formatted == @clean
44
59
  end
60
+
61
+ def test_decoder
62
+ str = 'some <a href="http://example.com/">example&nbsp;link</a> to nowhere'
63
+ scrubbed_str = str.scrub
64
+ assert scrubbed_str.include?('&nbsp;')
65
+
66
+ if defined?(HTMLEntities)
67
+ assert ! scrubbed_str.decode.include?('&nbsp;')
68
+
69
+ scrubbed_str.decode!
70
+ assert ! scrubbed_str.include?('&nbsp;')
71
+ end
72
+ end
73
+
74
+ private
75
+ def partial_scrub_common(doc, full)
76
+ # using the divisor search throws warnings in test
77
+ assert_tag_count(doc, 'a', 0)
78
+ assert_tag_count(doc, 'p', full.search('//p').size)
79
+ assert_tag_count(doc, 'div', full.search('//div').size)
80
+ assert_tag_count(doc, 'img', full.search('//img').size)
81
+ assert_tag_count(doc, 'br', full.search('//br').size)
82
+ assert_tag_count(doc, 'script', 0)
83
+ end
45
84
  end
@@ -12,3 +12,23 @@ alert("gotcha");</script><img src="http://content.example.com/content/3587a2f6ee
12
12
  <span>some random unclosed span
13
13
  <style type="text/css">.foo {color:blue}</style>
14
14
  EOS
15
+
16
+ GOOGLE = <<-EOS
17
+ <html><head><meta http-equiv="content-type" content="text/html; charset=ISO-8859-1"><title>Google</title><style><!--
18
+ body,td,a,p,.h{font-family:arial,sans-serif}
19
+ .h{font-size:20px}
20
+ .h{color:#3366cc}
21
+ .q{color:#00c}
22
+ --></style>
23
+ <script>
24
+ <!--
25
+ function sf(){document.f.q.focus();}
26
+ // -->
27
+ </script>
28
+ </head><body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onload="sf();if(document.images){new Image().src='/images/nav_logo2.png'}" topmargin=3 marginheight=3><center><div align=right nowrap style="padding-bottom:4px" width=100%><font size=-1><a href="/url?sa=p&pref=ig&pval=3&q=http://www.google.com/ig%3Fhl%3Den&usg=__yvmOvIrk79QYmDkrJAeuYO8jTmo=">Personalize this page</a>&nbsp;|&nbsp;<a href="https://www.google.com/accounts/Login?continue=http://www.google.com/&hl=en">Sign in</a></font></div><img alt="Google" height=110 src="/intl/en_ALL/images/logo.gif" width=276><br><br><form action="/search" name=f><script defer><!--
29
+ function togDisp(e){stopB(e);var elems=document.getElementsByName('more');for(var i=0;i<elems.length;i++){var obj=elems[i],dp="";if(obj.style.display==""){dp="none";}obj.style.display=dp;}return false;}
30
+ function stopB(e){if(!e)e=window.event;e.cancelBubble=true;}
31
+ document.onclick=function(event){var elems=document.getElementsByName('more');if(elems[0].style.display==""){togDisp(event);}}
32
+ //-->
33
+ </script><table border=0 cellspacing=0 cellpadding=4><tr><td nowrap><font size=-1><b>Web</b>&nbsp;&nbsp;&nbsp;&nbsp;<a class=q href="http://images.google.com/imghp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wi">Images</a>&nbsp;&nbsp;&nbsp;&nbsp;<a class=q href="http://video.google.com/?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wv">Video</a>&nbsp;&nbsp;&nbsp;&nbsp;<a class=q href="http://news.google.com/nwshp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wn">News</a>&nbsp;&nbsp;&nbsp;&nbsp;<a class=q href="http://maps.google.com/maps?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wl">Maps</a>&nbsp;&nbsp;&nbsp;&nbsp;<b><a href="/intl/en/options/" class=q onclick="this.blur();return togDisp(event)">more&nbsp;&raquo;</a></b><span name=more id=more style="display:none;position:absolute;background:#fff;border:1px solid #369;margin:-.5ex 2ex;padding:0 0 .5ex .8ex;width:16ex;line-height:1.9;z-index:1000" onclick="stopB(event)"><a href=# onclick="return togDisp(event)"><img border=0 src=/images/x2.gif width=12 height=12 alt="Close menu" align=right hspace=4 vspace=4></a><a class=q href="http://blogsearch.google.com/?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wb">Blogs</a><br><a class=q href="http://books.google.com/bkshp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wp">Books</a><br><a class=q href="http://froogle.google.com/frghp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wf">Froogle</a><br><a class=q href="http://groups.google.com/grphp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wg">Groups</a><br><a class=q href="http://www.google.com/ptshp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wt">Patents</a><br><a href="/intl/en/options/" class=q><b>even more &raquo;</b></a></span></font></td></tr></table><table cellpadding=0 cellspacing=0><tr valign=top><td width=25%>&nbsp;</td><td align=center nowrap><input name=hl type=hidden value=en><input type=hidden name=ie value="ISO-8859-1"><input maxlength=2048 name=q size=55 title="Google Search" value=""><br><input name=btnG type=submit value="Google Search"><input name=btnI type=submit 
value="I'm Feeling Lucky"></td><td nowrap width=25%><font size=-2>&nbsp;&nbsp;<a href=/advanced_search?hl=en>Advanced Search</a><br>&nbsp;&nbsp;<a href=/preferences?hl=en>Preferences</a><br>&nbsp;&nbsp;<a href=/language_tools?hl=en>Language Tools</a></font></td></tr></table></form><br><br><font size=-1><a href="/intl/en/ads/">Advertising&nbsp;Programs</a> - <a href="/services/">Business Solutions</a> - <a href=/intl/en/about.html>About Google</a></font><p><font size=-2>&copy;2007 Google</font></p></center></body></html>
34
+ EOS
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.9.1
2
+ rubygems_version: 0.9.2.1
3
3
  specification_version: 1
4
4
  name: hpricot_scrub
5
5
  version: !ruby/object:Gem::Version
6
- version: 0.2.2
7
- date: 2007-03-13 00:00:00 -07:00
6
+ version: 0.2.3
7
+ date: 2007-04-05 00:00:00 -07:00
8
8
  summary: Scrub HTML with Hpricot
9
9
  require_paths:
10
10
  - lib