hpricot_scrub 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.txt +10 -4
- data/examples/config.yml +47 -45
- data/lib/hpricot_scrub/hpricot_scrub.rb +30 -6
- data/lib/hpricot_scrub/version.rb +1 -1
- data/test/hpricot_scrub_test.rb +45 -6
- data/test/scrubber_data.rb +20 -0
- metadata +3 -3
data/CHANGELOG.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
2007-04-05 Michael <michael@underpantsgnome.com>
|
2
|
+
Release 0.2.3
|
3
|
+
Add patches from Eric Wong
|
4
|
+
- Recursive scrubbing wasn't scrubbing if parent was in allow
|
5
|
+
- Add optional use of HTMLEntities
|
6
|
+
|
1
7
|
2007-03-04 Michael <michael@underpantsgnome.com>
|
2
8
|
Release 0.2.2
|
3
9
|
Add patches from Eric Wong
|
@@ -6,10 +12,10 @@
|
|
6
12
|
|
7
13
|
2007-03-04 Michael <michael@underpantsgnome.com>
|
8
14
|
Release 0.2.0
|
9
|
-
|
15
|
+
- Add String methods for scrub and scrub!
|
10
16
|
|
11
|
-
|
12
|
-
|
17
|
+
- Fixed a bug where nested elements were not being scrubbed when using a
|
18
|
+
config hash
|
13
19
|
|
14
20
|
2007-03-03 Michael <michael@underpantsgnome.com>
|
15
|
-
|
21
|
+
- Release 0.1.0, Initial Gem version
|
data/examples/config.yml
CHANGED
@@ -1,47 +1,49 @@
|
|
1
|
-
|
2
1
|
---
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
2
|
+
:allow_tags: # let these tags stay, but will strip attributes
|
3
|
+
- 'html'
|
4
|
+
- 'head'
|
5
|
+
- 'body'
|
6
|
+
- 'b'
|
7
|
+
- 'blockquote'
|
8
|
+
- 'br'
|
9
|
+
- 'div'
|
10
|
+
- 'h1'
|
11
|
+
- 'h2'
|
12
|
+
- 'h3'
|
13
|
+
- 'h4'
|
14
|
+
- 'h5'
|
15
|
+
- 'h6'
|
16
|
+
- 'hr'
|
17
|
+
- 'i'
|
18
|
+
- 'em'
|
19
|
+
- 'img'
|
20
|
+
- 'li'
|
21
|
+
- 'ol'
|
22
|
+
- 'p'
|
23
|
+
- 'pre'
|
24
|
+
- 'small'
|
25
|
+
- 'span'
|
26
|
+
- 'span'
|
27
|
+
- 'strike'
|
28
|
+
- 'strong'
|
29
|
+
- 'sub'
|
30
|
+
- 'sup'
|
31
|
+
- 'table'
|
32
|
+
- 'tbody'
|
33
|
+
- 'td'
|
34
|
+
- 'tfoot'
|
35
|
+
- 'thead'
|
36
|
+
- 'tr'
|
37
|
+
- 'u'
|
38
|
+
- 'ul'
|
37
39
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
40
|
+
:remove_tags: # completely removes everything between open and close tag
|
41
|
+
- 'form'
|
42
|
+
- 'script'
|
43
|
+
|
44
|
+
:allow_attributes: # let these attributes stay, strip all others
|
45
|
+
- 'src'
|
46
|
+
- 'font'
|
47
|
+
- 'alt'
|
48
|
+
- 'style'
|
49
|
+
- 'align'
|
data/lib/hpricot_scrub/hpricot_scrub.rb
CHANGED
@@ -10,9 +10,12 @@ require 'hpricot'
|
|
10
10
|
|
11
11
|
module Hpricot
|
12
12
|
module Scrubable
|
13
|
+
# TODO: figure out how to handle comments
|
13
14
|
def scrubable?
|
14
|
-
! [Hpricot::Text,
|
15
|
-
|
15
|
+
! [ Hpricot::Text,
|
16
|
+
Hpricot::BogusETag,
|
17
|
+
Hpricot::Comment
|
18
|
+
].include?(self.class) && self.respond_to?(:scrub)
|
16
19
|
end
|
17
20
|
end
|
18
21
|
|
@@ -34,9 +37,7 @@ module Hpricot
|
|
34
37
|
include Scrubable
|
35
38
|
|
36
39
|
def scrub(config)
|
37
|
-
children.reverse.each { |c|
|
38
|
-
c.scrub(config) if c.scrubable? && ! config[:allow_tags].include?(c.name)
|
39
|
-
}
|
40
|
+
children.reverse.each { |c| c.scrub(config) if c.scrubable? }
|
40
41
|
strip unless config[:allow_tags].include?(name)
|
41
42
|
end
|
42
43
|
|
@@ -61,7 +62,7 @@ module Hpricot
|
|
61
62
|
end
|
62
63
|
|
63
64
|
def strip_removes?
|
64
|
-
#
|
65
|
+
# TODO: find other elements that should be removed instead of stripped
|
65
66
|
attributes && attributes['type'] =~ /script|css/
|
66
67
|
end
|
67
68
|
end
|
@@ -93,3 +94,26 @@ class String
|
|
93
94
|
dup.scrub!
|
94
95
|
end
|
95
96
|
end
|
97
|
+
|
98
|
+
begin
|
99
|
+
require 'htmlentities'
|
100
|
+
|
101
|
+
module Hpricot
|
102
|
+
class Scrub
|
103
|
+
@coder = HTMLEntities.new
|
104
|
+
class << self
|
105
|
+
def entifier; @coder end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
|
110
|
+
class String
|
111
|
+
def decode!
|
112
|
+
self.gsub!(/^(\n|.)*$/, Hpricot::Scrub.entifier.decode(self))
|
113
|
+
end
|
114
|
+
|
115
|
+
def decode
|
116
|
+
dup.decode!
|
117
|
+
end
|
118
|
+
end
|
119
|
+
rescue LoadError; end
|
data/test/hpricot_scrub_test.rb
CHANGED
@@ -6,6 +6,17 @@ class HpricotScrubTest < Test::Unit::TestCase
|
|
6
6
|
def setup
|
7
7
|
@clean = Hpricot(MARKUP).scrub.inner_html
|
8
8
|
@config = YAML.load_file('examples/config.yml')
|
9
|
+
|
10
|
+
# add some tags that most users will probably want
|
11
|
+
@config_full = @config.dup
|
12
|
+
%w(body head html).each { |x| @config_full[:allow_tags].push(x) }
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_full_markup_partial_scrub
|
16
|
+
full = Hpricot(MARKUP)
|
17
|
+
full_markup = '<html><head></head><body>' + MARKUP + '</body></html>'
|
18
|
+
doc = Hpricot(full_markup).scrub(@config_full)
|
19
|
+
partial_scrub_common(doc, full)
|
9
20
|
end
|
10
21
|
|
11
22
|
def test_full_scrub
|
@@ -22,12 +33,16 @@ class HpricotScrubTest < Test::Unit::TestCase
|
|
22
33
|
def test_partial_scrub
|
23
34
|
full = Hpricot(MARKUP)
|
24
35
|
doc = Hpricot(MARKUP).scrub(@config)
|
25
|
-
|
36
|
+
partial_scrub_common(doc, full)
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_full_doc
|
40
|
+
doc = Hpricot(GOOGLE).scrub
|
26
41
|
assert_tag_count(doc, 'a', 0)
|
27
|
-
assert_tag_count(doc, 'p',
|
28
|
-
assert_tag_count(doc, '
|
29
|
-
assert_tag_count(doc, '
|
30
|
-
assert_tag_count(doc, '
|
42
|
+
assert_tag_count(doc, 'p', 0)
|
43
|
+
assert_tag_count(doc, 'img', 0)
|
44
|
+
assert_tag_count(doc, 'br', 0)
|
45
|
+
assert_tag_count(doc, 'div', 0)
|
31
46
|
assert_tag_count(doc, 'script', 0)
|
32
47
|
end
|
33
48
|
|
@@ -36,10 +51,34 @@ class HpricotScrubTest < Test::Unit::TestCase
|
|
36
51
|
assert formatted.scrub == @clean
|
37
52
|
assert formatted == MARKUP
|
38
53
|
end
|
39
|
-
|
54
|
+
|
40
55
|
def test_string_scrub!
|
41
56
|
formatted = MARKUP
|
42
57
|
assert formatted.scrub! == @clean
|
43
58
|
assert formatted == @clean
|
44
59
|
end
|
60
|
+
|
61
|
+
def test_decoder
|
62
|
+
str = 'some <a href="http://example.com/">example link</a> to nowhere'
|
63
|
+
scrubbed_str = str.scrub
|
64
|
+
assert scrubbed_str.include?(' ')
|
65
|
+
|
66
|
+
if defined?(HTMLEntities)
|
67
|
+
assert ! scrubbed_str.decode.include?(' ')
|
68
|
+
|
69
|
+
scrubbed_str.decode!
|
70
|
+
assert ! scrubbed_str.include?(' ')
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
private
|
75
|
+
def partial_scrub_common(doc, full)
|
76
|
+
# using the divisor search throws warnings in test
|
77
|
+
assert_tag_count(doc, 'a', 0)
|
78
|
+
assert_tag_count(doc, 'p', full.search('//p').size)
|
79
|
+
assert_tag_count(doc, 'div', full.search('//div').size)
|
80
|
+
assert_tag_count(doc, 'img', full.search('//img').size)
|
81
|
+
assert_tag_count(doc, 'br', full.search('//br').size)
|
82
|
+
assert_tag_count(doc, 'script', 0)
|
83
|
+
end
|
45
84
|
end
|
data/test/scrubber_data.rb
CHANGED
@@ -12,3 +12,23 @@ alert("gotcha");</script><img src="http://content.example.com/content/3587a2f6ee
|
|
12
12
|
<span>some random unclosed span
|
13
13
|
<style type="text/css">.foo {color:blue}</style>
|
14
14
|
EOS
|
15
|
+
|
16
|
+
GOOGLE = <<-EOS
|
17
|
+
<html><head><meta http-equiv="content-type" content="text/html; charset=ISO-8859-1"><title>Google</title><style><!--
|
18
|
+
body,td,a,p,.h{font-family:arial,sans-serif}
|
19
|
+
.h{font-size:20px}
|
20
|
+
.h{color:#3366cc}
|
21
|
+
.q{color:#00c}
|
22
|
+
--></style>
|
23
|
+
<script>
|
24
|
+
<!--
|
25
|
+
function sf(){document.f.q.focus();}
|
26
|
+
// -->
|
27
|
+
</script>
|
28
|
+
</head><body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onload="sf();if(document.images){new Image().src='/images/nav_logo2.png'}" topmargin=3 marginheight=3><center><div align=right nowrap style="padding-bottom:4px" width=100%><font size=-1><a href="/url?sa=p&pref=ig&pval=3&q=http://www.google.com/ig%3Fhl%3Den&usg=__yvmOvIrk79QYmDkrJAeuYO8jTmo=">Personalize this page</a> | <a href="https://www.google.com/accounts/Login?continue=http://www.google.com/&hl=en">Sign in</a></font></div><img alt="Google" height=110 src="/intl/en_ALL/images/logo.gif" width=276><br><br><form action="/search" name=f><script defer><!--
|
29
|
+
function togDisp(e){stopB(e);var elems=document.getElementsByName('more');for(var i=0;i<elems.length;i++){var obj=elems[i],dp="";if(obj.style.display==""){dp="none";}obj.style.display=dp;}return false;}
|
30
|
+
function stopB(e){if(!e)e=window.event;e.cancelBubble=true;}
|
31
|
+
document.onclick=function(event){var elems=document.getElementsByName('more');if(elems[0].style.display==""){togDisp(event);}}
|
32
|
+
//-->
|
33
|
+
</script><table border=0 cellspacing=0 cellpadding=4><tr><td nowrap><font size=-1><b>Web</b> <a class=q href="http://images.google.com/imghp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wi">Images</a> <a class=q href="http://video.google.com/?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wv">Video</a> <a class=q href="http://news.google.com/nwshp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wn">News</a> <a class=q href="http://maps.google.com/maps?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wl">Maps</a> <b><a href="/intl/en/options/" class=q onclick="this.blur();return togDisp(event)">more »</a></b><span name=more id=more style="display:none;position:absolute;background:#fff;border:1px solid #369;margin:-.5ex 2ex;padding:0 0 .5ex .8ex;width:16ex;line-height:1.9;z-index:1000" onclick="stopB(event)"><a href=# onclick="return togDisp(event)"><img border=0 src=/images/x2.gif width=12 height=12 alt="Close menu" align=right hspace=4 vspace=4></a><a class=q href="http://blogsearch.google.com/?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wb">Blogs</a><br><a class=q href="http://books.google.com/bkshp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wp">Books</a><br><a class=q href="http://froogle.google.com/frghp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wf">Froogle</a><br><a class=q href="http://groups.google.com/grphp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wg">Groups</a><br><a class=q href="http://www.google.com/ptshp?ie=ISO-8859-1&oe=ISO-8859-1&hl=en&tab=wt">Patents</a><br><a href="/intl/en/options/" class=q><b>even more »</b></a></span></font></td></tr></table><table cellpadding=0 cellspacing=0><tr valign=top><td width=25%> </td><td align=center nowrap><input name=hl type=hidden value=en><input type=hidden name=ie value="ISO-8859-1"><input maxlength=2048 name=q size=55 title="Google Search" value=""><br><input name=btnG type=submit value="Google Search"><input name=btnI type=submit value="I'm Feeling Lucky"></td><td nowrap width=25%><font size=-2> <a href=/advanced_search?hl=en>Advanced Search</a><br> <a 
href=/preferences?hl=en>Preferences</a><br> <a href=/language_tools?hl=en>Language Tools</a></font></td></tr></table></form><br><br><font size=-1><a href="/intl/en/ads/">Advertising Programs</a> - <a href="/services/">Business Solutions</a> - <a href=/intl/en/about.html>About Google</a></font><p><font size=-2>©2007 Google</font></p></center></body></html>
|
34
|
+
EOS
|
metadata
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
|
-
rubygems_version: 0.9.1
|
2
|
+
rubygems_version: 0.9.2.1
|
3
3
|
specification_version: 1
|
4
4
|
name: hpricot_scrub
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 0.2.2
|
7
|
-
date: 2007-
|
6
|
+
version: 0.2.3
|
7
|
+
date: 2007-04-05 00:00:00 -07:00
|
8
8
|
summary: Scrub HTML with Hpricot
|
9
9
|
require_paths:
|
10
10
|
- lib
|