rgrove-sanitize 1.0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HISTORY +46 -0
- data/LICENSE +18 -0
- data/README.rdoc +168 -0
- data/lib/sanitize/config/basic.rb +49 -0
- data/lib/sanitize/config/relaxed.rb +56 -0
- data/lib/sanitize/config/restricted.rb +29 -0
- data/lib/sanitize/config.rb +49 -0
- data/lib/sanitize.rb +182 -0
- metadata +69 -0
data/HISTORY
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
Sanitize History
|
2
|
+
================================================================================
|
3
|
+
|
4
|
+
Version ? (?)
|
5
|
+
* Removed htmlentities gem dependency.
|
6
|
+
* Existing well-formed character entity references in the input string are now
|
7
|
+
preserved rather than being decoded and re-encoded.
|
8
|
+
* The ' character is now encoded as &#39; instead of &apos; to prevent
|
9
|
+
problems in IE6.
|
10
|
+
* You can now specify the symbol :all in place of an element name in the
|
11
|
+
attributes config hash to allow certain attributes on all elements. [Thanks
|
12
|
+
to Mutwin Kraus]
|
13
|
+
|
14
|
+
Version 1.0.5 (2009-02-05)
|
15
|
+
* Fixed a bug introduced in version 1.0.3 that prevented non-whitelisted
|
16
|
+
protocols from being cleaned when relative URLs were allowed. [Reported by
|
17
|
+
Dev Purkayastha]
|
18
|
+
* Fixed "undefined method `parent='" exceptions caused by parser changes in
|
19
|
+
edge Hpricot.
|
20
|
+
|
21
|
+
Version 1.0.4 (2009-01-16)
|
22
|
+
* Fixed a bug that made it possible to sneak a non-whitelisted element through
|
23
|
+
by repeating it several times in a row. All versions of Sanitize prior to
|
24
|
+
1.0.4 are vulnerable. [Reported by Cristobal]
|
25
|
+
|
26
|
+
Version 1.0.3 (2009-01-15)
|
27
|
+
* Fixed a bug whereby incomplete Unicode or hex entities could be used to
|
28
|
+
prevent non-whitelisted protocols from being cleaned. Since IE6 and Opera
|
29
|
+
still decode the incomplete entities, users of those browsers may be
|
30
|
+
vulnerable to malicious script injection on websites using versions of
|
31
|
+
Sanitize prior to 1.0.3.
|
32
|
+
|
33
|
+
Version 1.0.2 (2009-01-04)
|
34
|
+
* Fixed a bug that caused an exception to be thrown when parsing a valueless
|
35
|
+
attribute that's expected to contain a URL.
|
36
|
+
|
37
|
+
Version 1.0.1 (2009-01-01)
|
38
|
+
* You can now specify :relative in a protocol config array to allow attributes
|
39
|
+
containing relative URLs with no protocol. The Basic and Relaxed configs
|
40
|
+
have been updated to allow relative URLs.
|
41
|
+
* Added a workaround for an Hpricot bug that causes HTML entities for
|
42
|
+
non-ASCII characters to be replaced by question marks, and all other
|
43
|
+
entities to be destructively decoded.
|
44
|
+
|
45
|
+
Version 1.0.0 (2008-12-25)
|
46
|
+
* First release.
|
data/LICENSE
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
4
|
+
this software and associated documentation files (the 'Software'), to deal in
|
5
|
+
the Software without restriction, including without limitation the rights to
|
6
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
7
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
8
|
+
subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
11
|
+
copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
15
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
16
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
17
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
18
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
= Sanitize
|
2
|
+
|
3
|
+
Sanitize is a whitelist-based HTML sanitizer. Given a list of acceptable
|
4
|
+
elements and attributes, Sanitize will remove all unacceptable HTML from a
|
5
|
+
string.
|
6
|
+
|
7
|
+
Using a simple configuration syntax, you can tell Sanitize to allow certain
|
8
|
+
elements, certain attributes within those elements, and even certain URL
|
9
|
+
protocols within attributes that contain URLs. Any HTML elements or attributes
|
10
|
+
that you don't explicitly allow will be removed.
|
11
|
+
|
12
|
+
Because it's based on Hpricot, a full-fledged HTML parser, rather than a bunch
|
13
|
+
of fragile regular expressions, Sanitize has no trouble dealing with malformed
|
14
|
+
or maliciously-formed HTML. When in doubt, Sanitize always errs on the side of
|
15
|
+
caution.
|
16
|
+
|
17
|
+
*Author*:: Ryan Grove (mailto:ryan@wonko.com)
|
18
|
+
*Version*:: ? (?)
|
19
|
+
*Copyright*:: Copyright (c) 2009 Ryan Grove. All rights reserved.
|
20
|
+
*License*:: MIT License (http://opensource.org/licenses/mit-license.php)
|
21
|
+
*Website*:: http://github.com/rgrove/sanitize
|
22
|
+
|
23
|
+
== Requires
|
24
|
+
|
25
|
+
* RubyGems
|
26
|
+
* Hpricot 0.6+
|
27
|
+
|
28
|
+
== Usage
|
29
|
+
|
30
|
+
If you don't specify any configuration options, Sanitize will use its strictest
|
31
|
+
settings by default, which means it will strip all HTML.
|
32
|
+
|
33
|
+
require 'rubygems'
|
34
|
+
require 'sanitize'
|
35
|
+
|
36
|
+
html = '<b><a href="http://foo.com/">foo</a></b><img src="http://foo.com/bar.jpg" />'
|
37
|
+
|
38
|
+
Sanitize.clean(html) # => 'foo'
|
39
|
+
|
40
|
+
== Configuration
|
41
|
+
|
42
|
+
In addition to the ultra-safe default settings, Sanitize comes with three other
|
43
|
+
built-in modes.
|
44
|
+
|
45
|
+
=== Sanitize::Config::RESTRICTED
|
46
|
+
|
47
|
+
Allows only very simple inline formatting markup. No links, images, or block
|
48
|
+
elements.
|
49
|
+
|
50
|
+
Sanitize.clean(html, Sanitize::Config::RESTRICTED) # => '<b>foo</b>'
|
51
|
+
|
52
|
+
=== Sanitize::Config::BASIC
|
53
|
+
|
54
|
+
Allows a variety of markup including formatting tags, links, and lists. Images
|
55
|
+
and tables are not allowed, links are limited to FTP, HTTP, HTTPS, and mailto
|
56
|
+
protocols, and a <code>rel="nofollow"</code> attribute is added to all links to
|
57
|
+
mitigate SEO spam.
|
58
|
+
|
59
|
+
Sanitize.clean(html, Sanitize::Config::BASIC)
|
60
|
+
# => '<b><a href="http://foo.com/" rel="nofollow">foo</a></b>'
|
61
|
+
|
62
|
+
=== Sanitize::Config::RELAXED
|
63
|
+
|
64
|
+
Allows an even wider variety of markup than BASIC, including images and tables.
|
65
|
+
Links are still limited to FTP, HTTP, HTTPS, and mailto protocols, while images
|
66
|
+
are limited to HTTP and HTTPS. In this mode, <code>rel="nofollow"</code> is not
|
67
|
+
added to links.
|
68
|
+
|
69
|
+
Sanitize.clean(html, Sanitize::Config::RELAXED)
|
70
|
+
# => '<b><a href="http://foo.com/">foo</a></b><img src="http://foo.com/bar.jpg" />'
|
71
|
+
|
72
|
+
=== Custom Configuration
|
73
|
+
|
74
|
+
If the built-in modes don't meet your needs, you can easily specify a custom
|
75
|
+
configuration:
|
76
|
+
|
77
|
+
Sanitize.clean(html, :elements => ['a', 'span'],
|
78
|
+
:attributes => {'a' => ['href', 'title'], 'span' => ['class']},
|
79
|
+
:protocols => {'a' => {'href' => ['http', 'https', 'mailto']}})
|
80
|
+
|
81
|
+
==== :elements
|
82
|
+
|
83
|
+
Array of element names to allow. Specify all names in lowercase.
|
84
|
+
|
85
|
+
:elements => [
|
86
|
+
'a', 'b', 'blockquote', 'br', 'cite', 'code', 'dd', 'dl', 'dt', 'em',
|
87
|
+
'i', 'li', 'ol', 'p', 'pre', 'q', 'small', 'strike', 'strong', 'sub',
|
88
|
+
'sup', 'u', 'ul'
|
89
|
+
]
|
90
|
+
|
91
|
+
==== :attributes
|
92
|
+
|
93
|
+
Attributes to allow for specific elements. Specify all element names and
|
94
|
+
attributes in lowercase.
|
95
|
+
|
96
|
+
:attributes => {
|
97
|
+
'a' => ['href', 'title'],
|
98
|
+
'blockquote' => ['cite'],
|
99
|
+
'img' => ['alt', 'src', 'title']
|
100
|
+
}
|
101
|
+
|
102
|
+
If you'd like to allow certain attributes on all elements, use the symbol
|
103
|
+
<code>:all</code> instead of an element name.
|
104
|
+
|
105
|
+
:attributes => {
|
106
|
+
:all => ['class'],
|
107
|
+
'a' => ['href', 'title'],
|
108
|
+
}
|
109
|
+
|
110
|
+
==== :add_attributes
|
111
|
+
|
112
|
+
Attributes to add to specific elements. If the attribute already exists, it will
|
113
|
+
be replaced with the value specified here. Specify all element names and
|
114
|
+
attributes in lowercase.
|
115
|
+
|
116
|
+
:add_attributes => {
|
117
|
+
'a' => {'rel' => 'nofollow'}
|
118
|
+
}
|
119
|
+
|
120
|
+
==== :protocols
|
121
|
+
|
122
|
+
URL protocols to allow in specific attributes. If an attribute is listed here
|
123
|
+
and contains a protocol other than those specified (or if it contains no
|
124
|
+
protocol at all), it will be removed.
|
125
|
+
|
126
|
+
:protocols => {
|
127
|
+
'a' => {'href' => ['ftp', 'http', 'https', 'mailto']},
|
128
|
+
'img' => {'src' => ['http', 'https']}
|
129
|
+
}
|
130
|
+
|
131
|
+
If you'd like to allow the use of relative URLs which don't have a protocol,
|
132
|
+
include the symbol <code>:relative</code> in the protocol array:
|
133
|
+
|
134
|
+
:protocols => {
|
135
|
+
'a' => {'href' => ['http', 'https', :relative]}
|
136
|
+
}
|
137
|
+
|
138
|
+
|
139
|
+
== Contributors
|
140
|
+
|
141
|
+
The following lovely people have contributed to Sanitize in the form of patches
|
142
|
+
or ideas that later became code:
|
143
|
+
|
144
|
+
* Ryan Grove <ryan@wonko.com>
|
145
|
+
* Adam Hooper <adam@adamhooper.com>
|
146
|
+
* Mutwin Kraus <mutle@blogage.de>
|
147
|
+
* Dev Purkayastha <dev.purkayastha@gmail.com>
|
148
|
+
|
149
|
+
== License
|
150
|
+
|
151
|
+
Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
|
152
|
+
|
153
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
154
|
+
this software and associated documentation files (the 'Software'), to deal in
|
155
|
+
the Software without restriction, including without limitation the rights to
|
156
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
157
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
158
|
+
subject to the following conditions:
|
159
|
+
|
160
|
+
The above copyright notice and this permission notice shall be included in all
|
161
|
+
copies or substantial portions of the Software.
|
162
|
+
|
163
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
164
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
165
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
166
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
167
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
168
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
@@ -0,0 +1,49 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
5
|
+
# of this software and associated documentation files (the 'Software'), to deal
|
6
|
+
# in the Software without restriction, including without limitation the rights
|
7
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
+
# copies of the Software, and to permit persons to whom the Software is
|
9
|
+
# furnished to do so, subject to the following conditions:
|
10
|
+
#
|
11
|
+
# The above copyright notice and this permission notice shall be included in all
|
12
|
+
# copies or substantial portions of the Software.
|
13
|
+
#
|
14
|
+
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
15
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
16
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
17
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
18
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
19
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
20
|
+
# SOFTWARE.
|
21
|
+
#++
|
22
|
+
|
23
|
+
class Sanitize
  module Config
    # Whitelist for basic markup: inline formatting, links, quotes and lists.
    # The element list contains no image or table elements. Every <a> gets
    # rel="nofollow" added, and URL-bearing attributes are limited to the
    # protocols listed under :protocols (:relative permits URLs that carry no
    # protocol at all).
    BASIC = {
      # Element names that survive sanitizing (all lowercase).
      :elements => [
        'a', 'b', 'blockquote', 'br', 'cite', 'code', 'dd', 'dl', 'dt', 'em',
        'i', 'li', 'ol', 'p', 'pre', 'q', 'small', 'strike', 'strong', 'sub',
        'sup', 'u', 'ul'],

      # Attributes allowed per element.
      :attributes => {
        'a'          => ['href'],
        'blockquote' => ['cite'],
        'q'          => ['cite']
      },

      # Attributes forced onto specific elements.
      :add_attributes => {
        'a' => {'rel' => 'nofollow'}
      },

      # Protocols allowed in specific attributes of specific elements.
      :protocols => {
        'a'          => {'href' => ['ftp', 'http', 'https', 'mailto',
                                    :relative]},
        'blockquote' => {'cite' => ['http', 'https', :relative]},
        'q'          => {'cite' => ['http', 'https', :relative]}
      }
    }
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
5
|
+
# of this software and associated documentation files (the 'Software'), to deal
|
6
|
+
# in the Software without restriction, including without limitation the rights
|
7
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
+
# copies of the Software, and to permit persons to whom the Software is
|
9
|
+
# furnished to do so, subject to the following conditions:
|
10
|
+
#
|
11
|
+
# The above copyright notice and this permission notice shall be included in all
|
12
|
+
# copies or substantial portions of the Software.
|
13
|
+
#
|
14
|
+
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
15
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
16
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
17
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
18
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
19
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
20
|
+
# SOFTWARE.
|
21
|
+
#++
|
22
|
+
|
23
|
+
class Sanitize
  module Config
    # Widest built-in whitelist: everything in the basic set plus images and
    # table markup, with a handful of presentational attributes. No
    # :add_attributes entry is defined here, so nothing is forced onto links.
    RELAXED = {
      # Element names that survive sanitizing (all lowercase).
      :elements => [
        'a', 'b', 'blockquote', 'br', 'caption', 'cite', 'code', 'col',
        'colgroup', 'dd', 'dl', 'dt', 'em', 'i', 'img', 'li', 'ol', 'p', 'pre',
        'q', 'small', 'strike', 'strong', 'sub', 'sup', 'table', 'tbody', 'td',
        'tfoot', 'th', 'thead', 'tr', 'u', 'ul'],

      # Attributes allowed per element.
      :attributes => {
        'a'          => ['href', 'title'],
        'blockquote' => ['cite'],
        'col'        => ['span', 'width'],
        'colgroup'   => ['span', 'width'],
        'img'        => ['align', 'alt', 'height', 'src', 'title', 'width'],
        'ol'         => ['start', 'type'],
        'q'          => ['cite'],
        'table'      => ['summary', 'width'],
        'td'         => ['abbr', 'axis', 'colspan', 'rowspan', 'width'],
        'th'         => ['abbr', 'axis', 'colspan', 'rowspan', 'scope',
                         'width'],
        'ul'         => ['type']
      },

      # Protocols allowed in specific attributes of specific elements
      # (:relative permits URLs that carry no protocol).
      :protocols => {
        'a'          => {'href' => ['ftp', 'http', 'https', 'mailto',
                                    :relative]},
        'blockquote' => {'cite' => ['http', 'https', :relative]},
        'img'        => {'src'  => ['http', 'https', :relative]},
        'q'          => {'cite' => ['http', 'https', :relative]}
      }
    }
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
5
|
+
# of this software and associated documentation files (the 'Software'), to deal
|
6
|
+
# in the Software without restriction, including without limitation the rights
|
7
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
+
# copies of the Software, and to permit persons to whom the Software is
|
9
|
+
# furnished to do so, subject to the following conditions:
|
10
|
+
#
|
11
|
+
# The above copyright notice and this permission notice shall be included in all
|
12
|
+
# copies or substantial portions of the Software.
|
13
|
+
#
|
14
|
+
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
15
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
16
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
17
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
18
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
19
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
20
|
+
# SOFTWARE.
|
21
|
+
#++
|
22
|
+
|
23
|
+
class Sanitize
  module Config
    # Narrowest built-in whitelist: a few inline formatting elements only.
    # No :attributes, :add_attributes or :protocols keys are defined here.
    RESTRICTED = {
      :elements => ['b', 'em', 'i', 'strong', 'u']
    }
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
5
|
+
# of this software and associated documentation files (the 'Software'), to deal
|
6
|
+
# in the Software without restriction, including without limitation the rights
|
7
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
+
# copies of the Software, and to permit persons to whom the Software is
|
9
|
+
# furnished to do so, subject to the following conditions:
|
10
|
+
#
|
11
|
+
# The above copyright notice and this permission notice shall be included in all
|
12
|
+
# copies or substantial portions of the Software.
|
13
|
+
#
|
14
|
+
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
15
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
16
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
17
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
18
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
19
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
20
|
+
# SOFTWARE.
|
21
|
+
#++
|
22
|
+
|
23
|
+
class Sanitize
  module Config
    # Baseline settings with every whitelist empty.
    DEFAULT = {
      # Whether or not to allow HTML comments. Allowing comments is strongly
      # discouraged, since IE allows script execution within conditional
      # comments.
      :allow_comments => false,

      # HTML elements to allow. By default, no elements are allowed (which means
      # that all HTML will be stripped).
      :elements => [],

      # HTML attributes to allow in specific elements. By default, no attributes
      # are allowed.
      :attributes => {},

      # HTML attributes to add to specific elements. By default, no attributes
      # are added.
      :add_attributes => {},

      # URL handling protocols to allow in specific attributes. By default, no
      # protocols are allowed. Use :relative in place of a protocol if you want
      # to allow relative URLs sans protocol.
      :protocols => {}
    }
  end
end
|
data/lib/sanitize.rb
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
#--
|
2
|
+
# Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
5
|
+
# of this software and associated documentation files (the 'Software'), to deal
|
6
|
+
# in the Software without restriction, including without limitation the rights
|
7
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
+
# copies of the Software, and to permit persons to whom the Software is
|
9
|
+
# furnished to do so, subject to the following conditions:
|
10
|
+
#
|
11
|
+
# The above copyright notice and this permission notice shall be included in all
|
12
|
+
# copies or substantial portions of the Software.
|
13
|
+
#
|
14
|
+
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
15
|
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
16
|
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
17
|
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
18
|
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
19
|
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
20
|
+
# SOFTWARE.
|
21
|
+
#++
|
22
|
+
|
23
|
+
# Append this file's directory to the include path if it's not there already.
|
24
|
+
$:.unshift(File.dirname(File.expand_path(__FILE__)))
|
25
|
+
$:.uniq!
|
26
|
+
|
27
|
+
require 'rubygems'
|
28
|
+
|
29
|
+
gem 'hpricot', '~> 0.6'
|
30
|
+
|
31
|
+
require 'hpricot'
|
32
|
+
require 'sanitize/config'
|
33
|
+
require 'sanitize/config/restricted'
|
34
|
+
require 'sanitize/config/basic'
|
35
|
+
require 'sanitize/config/relaxed'
|
36
|
+
|
37
|
+
# Whitelist-based HTML sanitizer. An instance holds a configuration hash
# (Config::DEFAULT merged with the caller's overrides) and strips every
# element, attribute and URL protocol that the configuration does not allow.
class Sanitize

  # Characters that should be replaced with entities in text nodes.
  ENTITY_MAP = {
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&quot;',
    "'" => '&#39;'
  }

  # Matches an unencoded ampersand that is not part of a valid character entity
  # reference.
  REGEX_AMPERSAND = /&(?!(?:[a-z]+|#[0-9]+|#x[0-9a-f]+);)/i

  # Matches an attribute value that could be treated by a browser as a URL
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
  # or more characters followed by a colon is considered a match, even if the
  # colon is encoded as an entity and even if it's an incomplete entity (which
  # IE6 and Opera will still parse).
  REGEX_PROTOCOL = /^([^:]*)(?:\:|&#0*58|&#x0*3a)/i

  #--
  # Instance Methods
  #++

  # Returns a new Sanitize object initialized with the settings in _config_.
  # Keys missing from _config_ fall back to the values in Config::DEFAULT.
  def initialize(config = {})
    @config = Config::DEFAULT.merge(config)
  end

  # Returns a sanitized copy of _html_. The argument itself is left untouched.
  def clean(html)
    dupe = html.dup
    # clean! returns nil when nothing changed; fall back to the plain copy.
    clean!(dupe) || dupe
  end

  # Performs clean in place, returning _html_, or +nil+ if no changes were
  # made.
  def clean!(html)
    fragment = Hpricot(html)

    fragment.search('*') do |node|
      # Bogus end tags, doctypes, processing instructions and XML declarations
      # are always dropped.
      if node.bogusetag? || node.doctype? || node.procins? || node.xmldecl?
        node.parent.replace_child(node, '')
        next
      end

      if node.comment?
        node.parent.replace_child(node, '') unless @config[:allow_comments]
      elsif node.elem?
        name = node.name.to_s.downcase

        # Delete any element that isn't in the whitelist. Its children are
        # kept in its place so they get sanitized in turn.
        unless @config[:elements].include?(name)
          node.parent.replace_child(node, node.children || '')
          next
        end

        node.raw_attributes ||= {}

        # Per-element whitelist plus the attributes allowed on all elements.
        attr_whitelist = ((@config[:attributes][name] || []) +
            (@config[:attributes][:all] || [])).uniq

        if attr_whitelist.empty?
          # Delete all attributes from elements with no whitelisted
          # attributes.
          node.raw_attributes = {}
        else
          # Delete any attribute that isn't in the whitelist for this element.
          node.raw_attributes.delete_if do |key, value|
            !attr_whitelist.include?(key.to_s.downcase)
          end

          # Delete remaining attributes that use unacceptable protocols.
          if @config[:protocols].has_key?(name)
            protocol = @config[:protocols][name]

            node.raw_attributes.delete_if do |key, value|
              # Look the attribute up by its lowercase name, matching the
              # whitelist check above, so a mixed-case attribute name cannot
              # slip past the protocol filter.
              attr_name = key.to_s.downcase

              next false unless protocol.has_key?(attr_name)
              next true if value.nil?

              if value.to_s.downcase =~ REGEX_PROTOCOL
                !protocol[attr_name].include?($1.downcase)
              else
                !protocol[attr_name].include?(:relative)
              end
            end
          end
        end

        # Add required attributes.
        if @config[:add_attributes].has_key?(name)
          node.raw_attributes.merge!(@config[:add_attributes][name])
        end

        # Escape special chars in attribute values. Valueless attributes
        # (nil values) have nothing to escape and are left as they are.
        node.raw_attributes.each do |key, value|
          next if value.nil?
          node.raw_attributes[key] = Sanitize.encode_html(value)
        end
      end
    end

    # Make one last pass through the fragment and encode all special HTML chars
    # as entities. This eliminates certain types of maliciously-malformed nested
    # tags.
    fragment.search('*') do |node|
      node.swap(Sanitize.encode_html(node.to_original_html)) if node.text?
    end

    result = fragment.to_s
    # Overwrite the caller's string in place only when something changed.
    return result == html ? nil : html[0, html.length] = result
  end

  #--
  # Class Methods
  #++

  class << self
    # Returns a sanitized copy of _html_, using the settings in _config_ if
    # specified.
    def clean(html, config = {})
      sanitize = Sanitize.new(config)
      sanitize.clean(html)
    end

    # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
    # were made.
    def clean!(html, config = {})
      sanitize = Sanitize.new(config)
      sanitize.clean!(html)
    end

    # Encodes special HTML characters (<, >, ", ', and &) in _html_ as entity
    # references and returns the encoded string. Ampersands that already begin
    # a well-formed entity reference are left alone.
    def encode_html(html)
      str = html.dup

      # Encode special chars.
      ENTITY_MAP.each {|char, entity| str.gsub!(char, entity) }

      # Convert unencoded ampersands to entity references.
      str.gsub(REGEX_AMPERSAND, '&amp;')
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rgrove-sanitize
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.5.3
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ryan Grove
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-02-13 00:00:00 -08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0.6"
|
24
|
+
version:
|
25
|
+
description:
|
26
|
+
email: ryan@wonko.com
|
27
|
+
executables: []
|
28
|
+
|
29
|
+
extensions: []
|
30
|
+
|
31
|
+
extra_rdoc_files: []
|
32
|
+
|
33
|
+
files:
|
34
|
+
- HISTORY
|
35
|
+
- LICENSE
|
36
|
+
- README.rdoc
|
37
|
+
- lib/sanitize.rb
|
38
|
+
- lib/sanitize/config.rb
|
39
|
+
- lib/sanitize/config/basic.rb
|
40
|
+
- lib/sanitize/config/relaxed.rb
|
41
|
+
- lib/sanitize/config/restricted.rb
|
42
|
+
has_rdoc: false
|
43
|
+
homepage: http://github.com/rgrove/sanitize/
|
44
|
+
post_install_message:
|
45
|
+
rdoc_options: []
|
46
|
+
|
47
|
+
require_paths:
|
48
|
+
- lib
|
49
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.8.6
|
54
|
+
version:
|
55
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: "0"
|
60
|
+
version:
|
61
|
+
requirements: []
|
62
|
+
|
63
|
+
rubyforge_project:
|
64
|
+
rubygems_version: 1.2.0
|
65
|
+
signing_key:
|
66
|
+
specification_version: 2
|
67
|
+
summary: Whitelist-based HTML sanitizer.
|
68
|
+
test_files: []
|
69
|
+
|