hpricot_scrub 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.txt +10 -4
- data/examples/config.yml +47 -45
- data/lib/hpricot_scrub/hpricot_scrub.rb +30 -6
- data/lib/hpricot_scrub/version.rb +1 -1
- data/test/hpricot_scrub_test.rb +45 -6
- data/test/scrubber_data.rb +20 -0
- metadata +3 -3
data/CHANGELOG.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
2007-04-05 Michael <michael@underpantsgnome.com>
|
2
|
+
Release 0.2.3
|
3
|
+
Add patches from Eric Wong
|
4
|
+
- Fix recursive scrubbing: nested elements were not scrubbed when their parent tag was listed in :allow_tags
|
5
|
+
- Add optional use of HTMLEntities
|
6
|
+
|
1
7
|
2007-03-04 Michael <michael@underpantsgnome.com>
|
2
8
|
Release 0.2.2
|
3
9
|
Add patches from Eric Wong
|
@@ -6,10 +12,10 @@
|
|
6
12
|
|
7
13
|
2007-03-04 Michael <michael@underpantsgnome.com>
|
8
14
|
Release 0.2.0
|
9
|
-
|
15
|
+
- Add String methods for scrub and scrub!
|
10
16
|
|
11
|
-
|
12
|
-
|
17
|
+
- Fixed a bug where nested elements were not being scrubbed when using a
|
18
|
+
config hash
|
13
19
|
|
14
20
|
2007-03-03 Michael <michael@underpantsgnome.com>
|
15
|
-
|
21
|
+
- Release 0.1.0, Initial Gem version
|
data/examples/config.yml
CHANGED
@@ -1,47 +1,49 @@
|
|
1
|
-
|
2
1
|
---
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
2
|
+
:allow_tags: # let these tags stay, but will strip attributes
|
3
|
+
- 'html'
|
4
|
+
- 'head'
|
5
|
+
- 'body'
|
6
|
+
- 'b'
|
7
|
+
- 'blockquote'
|
8
|
+
- 'br'
|
9
|
+
- 'div'
|
10
|
+
- 'h1'
|
11
|
+
- 'h2'
|
12
|
+
- 'h3'
|
13
|
+
- 'h4'
|
14
|
+
- 'h5'
|
15
|
+
- 'h6'
|
16
|
+
- 'hr'
|
17
|
+
- 'i'
|
18
|
+
- 'em'
|
19
|
+
- 'img'
|
20
|
+
- 'li'
|
21
|
+
- 'ol'
|
22
|
+
- 'p'
|
23
|
+
- 'pre'
|
24
|
+
- 'small'
|
25
|
+
- 'span'
|
26
|
+
- 'span'
|
27
|
+
- 'strike'
|
28
|
+
- 'strong'
|
29
|
+
- 'sub'
|
30
|
+
- 'sup'
|
31
|
+
- 'table'
|
32
|
+
- 'tbody'
|
33
|
+
- 'td'
|
34
|
+
- 'tfoot'
|
35
|
+
- 'thead'
|
36
|
+
- 'tr'
|
37
|
+
- 'u'
|
38
|
+
- 'ul'
|
37
39
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
40
|
+
:remove_tags: # completely removes everything between open and close tag
|
41
|
+
- 'form'
|
42
|
+
- 'script'
|
43
|
+
|
44
|
+
:allow_attributes: # let these attributes stay, strip all others
|
45
|
+
- 'src'
|
46
|
+
- 'font'
|
47
|
+
- 'alt'
|
48
|
+
- 'style'
|
49
|
+
- 'align'
|
@@ -10,9 +10,12 @@ require 'hpricot'
|
|
10
10
|
|
11
11
|
module Hpricot
|
12
12
|
module Scrubable
|
13
|
+
# TODO: figure out how to handle comments
|
13
14
|
def scrubable?
|
14
|
-
! [Hpricot::Text,
|
15
|
-
|
15
|
+
! [ Hpricot::Text,
|
16
|
+
Hpricot::BogusETag,
|
17
|
+
Hpricot::Comment
|
18
|
+
].include?(self.class) && self.respond_to?(:scrub)
|
16
19
|
end
|
17
20
|
end
|
18
21
|
|
@@ -34,9 +37,7 @@ module Hpricot
|
|
34
37
|
include Scrubable
|
35
38
|
|
36
39
|
def scrub(config)
|
37
|
-
children.reverse.each { |c|
|
38
|
-
c.scrub(config) if c.scrubable? && ! config[:allow_tags].include?(c.name)
|
39
|
-
}
|
40
|
+
children.reverse.each { |c| c.scrub(config) if c.scrubable? }
|
40
41
|
strip unless config[:allow_tags].include?(name)
|
41
42
|
end
|
42
43
|
|
@@ -61,7 +62,7 @@ module Hpricot
|
|
61
62
|
end
|
62
63
|
|
63
64
|
def strip_removes?
|
64
|
-
#
|
65
|
+
# TODO: find other elements that should be removed instead of stripped
|
65
66
|
attributes && attributes['type'] =~ /script|css/
|
66
67
|
end
|
67
68
|
end
|
@@ -93,3 +94,26 @@ class String
|
|
93
94
|
dup.scrub!
|
94
95
|
end
|
95
96
|
end
|
97
|
+
|
98
|
+
begin
|
99
|
+
require 'htmlentities'
|
100
|
+
|
101
|
+
module Hpricot
|
102
|
+
class Scrub
|
103
|
+
@coder = HTMLEntities.new
|
104
|
+
class << self
|
105
|
+
def entifier; @coder end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
class String
|
111
|
+
def decode!
|
112
|
+
self.gsub!(/^(\n|.)*$/, Hpricot::Scrub.entifier.decode(self))
|
113
|
+
end
|
114
|
+
|
115
|
+
def decode
|
116
|
+
dup.decode!
|
117
|
+
end
|
118
|
+
end
|
119
|
+
rescue LoadError; end
|
data/test/hpricot_scrub_test.rb
CHANGED
@@ -6,6 +6,17 @@ class HpricotScrubTest < Test::Unit::TestCase
|
|
6
6
|
def setup
|
7
7
|
@clean = Hpricot(MARKUP).scrub.inner_html
|
8
8
|
@config = YAML.load_file('examples/config.yml')
|
9
|
+
|
10
|
+
# add some tags that most users will probably want
|
11
|
+
@config_full = @config.dup
|
12
|
+
%w(body head html).each { |x| @config_full[:allow_tags].push(x) }
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_full_markup_partial_scrub
|
16
|
+
full = Hpricot(MARKUP)
|
17
|
+
full_markup = '<html><head></head><body>' + MARKUP + '</body></html>'
|
18
|
+
doc = Hpricot(full_markup).scrub(@config_full)
|
19
|
+
partial_scrub_common(doc, full)
|
9
20
|
end
|
10
21
|
|
11
22
|
def test_full_scrub
|
@@ -22,12 +33,16 @@ class HpricotScrubTest < Test::Unit::TestCase
|
|
22
33
|
def test_partial_scrub
|
23
34
|
full = Hpricot(MARKUP)
|
24
35
|
doc = Hpricot(MARKUP).scrub(@config)
|
25
|
-
|
36
|
+
partial_scrub_common(doc, full)
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_full_doc
|
40
|
+
doc = Hpricot(GOOGLE).scrub
|
26
41
|
assert_tag_count(doc, 'a', 0)
|
27
|
-
assert_tag_count(doc, 'p',
|
28
|
-
assert_tag_count(doc, '
|
29
|
-
assert_tag_count(doc, '
|
30
|
-
assert_tag_count(doc, '
|
42
|
+
assert_tag_count(doc, 'p', 0)
|
43
|
+
assert_tag_count(doc, 'img', 0)
|
44
|
+
assert_tag_count(doc, 'br', 0)
|
45
|
+
assert_tag_count(doc, 'div', 0)
|
31
46
|
assert_tag_count(doc, 'script', 0)
|
32
47
|
end
|
33
48
|
|
@@ -36,10 +51,34 @@ class HpricotScrubTest < Test::Unit::TestCase
|
|
36
51
|
assert formatted.scrub == @clean
|
37
52
|
assert formatted == MARKUP
|
38
53
|
end
|
39
|
-
|
54
|
+
|
40
55
|
def test_string_scrub!
|
41
56
|
formatted = MARKUP
|
42
57
|
assert formatted.scrub! == @clean
|
43
58
|
assert formatted == @clean
|
44
59
|
end
|
60
|
+
|
61
|
+
def test_decoder
|
62
|
+
str = 'some <a href="http://example.com/">example link</a> to nowhere'
|
63
|
+
scrubbed_str = str.scrub
|
64
|
+
assert scrubbed_str.include?(' ')
|
65
|
+
|
66
|
+
if defined?(HTMLEntities)
|
67
|
+
assert ! scrubbed_str.decode.include?(' ')
|
68
|
+
|
69
|
+
scrubbed_str.decode!
|
70
|
+
assert ! scrubbed_str.include?(' ')
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
private
|
75
|
+
def partial_scrub_common(doc, full)
|
76
|
+
# using the divisor (/) search operator throws warnings in the tests, so call search() explicitly
|
77
|
+
assert_tag_count(doc, 'a', 0)
|
78
|
+
assert_tag_count(doc, 'p', full.search('//p').size)
|
79
|
+
assert_tag_count(doc, 'div', full.search('//div').size)
|
80
|
+
assert_tag_count(doc, 'img', full.search('//img').size)
|
81
|
+
assert_tag_count(doc, 'br', full.search('//br').size)
|
82
|
+
assert_tag_count(doc, 'script', 0)
|
83
|
+
end
|
45
84
|
end
|
data/test/scrubber_data.rb
CHANGED
@@ -12,3 +12,23 @@ alert("gotcha");</script><img src="http://content.example.com/content/3587a2f6ee
|
|
12
12
|
<span>some random unclosed span
|
13
13
|
<style type="text/css">.foo {color:blue}</style>
|
14
14
|
EOS
|
15
|
+
|
16
|
+
GOOGLE = <<-EOS
|
17
|
+
<html><head><meta http-equiv="content-type" content="text/html; charset=ISO-8859-1"><title>Google</title><style><!--
|
18
|
+
body,td,a,p,.h{font-family:arial,sans-serif}
|
19
|
+
.h{font-size:20px}
|
20
|
+
.h{color:#3366cc}
|
21
|
+
.q{color:#00c}
|
22
|
+
--></style>
|
23
|
+
<script>
|
24
|
+
<!--
|
25
|
+
function sf(){document.f.q.focus();}
|
26
|
+
// -->
|
27
|
+
</script>
|
28
|
+
</head><body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onload="sf();if(document.images){new Image().src='/images/nav_logo2.png'}" topmargin=3 marginheight=3><center><div align=right nowrap style="padding-bottom:4px" width=100%><font size=-1><a href="/url?sa=p&pref=ig&pval=3&q=http://www.google.com/ig%3Fhl%3Den&usg=__yvmOvIrk79QYmDkrJAeuYO8jTmo=">Personalize this page</a> | <a href="https://www.google.com/accounts/Login?continue=http://www.google.com/&hl=en">Sign in</a></font></div><img alt="Google" height=110 src="/intl/en_ALL/images/logo.gif" width=276><br><br><form action="/search" name=f><script defer><!--
|
29
|
+
function togDisp(e){stopB(e);var elems=document.getElementsByName('more');for(var i=0;i<elems.length;i++){var obj=elems[i],dp="";if(obj.style.display==""){dp="none";}obj.style.display=dp;}return false;}
|
30
|
+
function stopB(e){if(!e)e=window.event;e.cancelBubble=true;}
|
31
|
+
document.onclick=function(event){var elems=document.getElementsByName('more');if(elems[0].style.display==""){togDisp(event);}}
|
32
|
+
//-->
|
33
|
+
</script><table border=0 cellspacing=0 cellpadding=4><tr><td nowrap><font size=-1><b>Web</b> <a class=q href="http://images.google.com/imghp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wi">Images</a> <a class=q href="http://video.google.com/?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wv">Video</a> <a class=q href="http://news.google.com/nwshp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wn">News</a> <a class=q href="http://maps.google.com/maps?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wl">Maps</a> <b><a href="/intl/en/options/" class=q onclick="this.blur();return togDisp(event)">more »</a></b><span name=more id=more style="display:none;position:absolute;background:#fff;border:1px solid #369;margin:-.5ex 2ex;padding:0 0 .5ex .8ex;width:16ex;line-height:1.9;z-index:1000" onclick="stopB(event)"><a href=# onclick="return togDisp(event)"><img border=0 src=/images/x2.gif width=12 height=12 alt="Close menu" align=right hspace=4 vspace=4></a><a class=q href="http://blogsearch.google.com/?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wb">Blogs</a><br><a class=q href="http://books.google.com/bkshp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wp">Books</a><br><a class=q href="http://froogle.google.com/frghp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wf">Froogle</a><br><a class=q href="http://groups.google.com/grphp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wg">Groups</a><br><a class=q href="http://www.google.com/ptshp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wt">Patents</a><br><a href="/intl/en/options/" class=q><b>even more »</b></a></span></font></td></tr></table><table cellpadding=0 cellspacing=0><tr valign=top><td width=25%> </td><td align=center nowrap><input name=hl type=hidden value=en><input type=hidden name=ie value="ISO-8859-1"><input maxlength=2048 name=q size=55 title="Google Search" value=""><br><input name=btnG type=submit value="Google Search"><input name=btnI type=submit value="I'm Feeling Lucky"></td><td nowrap width=25%><font size=-2> <a href=/advanced_search?hl=en>Advanced Search</a><br> <a 
href=/preferences?hl=en>Preferences</a><br> <a href=/language_tools?hl=en>Language Tools</a></font></td></tr></table></form><br><br><font size=-1><a href="/intl/en/ads/">Advertising Programs</a> - <a href="/services/">Business Solutions</a> - <a href=/intl/en/about.html>About Google</a></font><p><font size=-2>©2007 Google</font></p></center></body></html>
|
34
|
+
EOS
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.1
|
2
|
+
rubygems_version: 0.9.2.1
|
3
3
|
specification_version: 1
|
4
4
|
name: hpricot_scrub
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.
|
7
|
-
date: 2007-
|
6
|
+
version: 0.2.3
|
7
|
+
date: 2007-04-05 00:00:00 -07:00
|
8
8
|
summary: Scrub HTML with Hpricot
|
9
9
|
require_paths:
|
10
10
|
- lib
|