sanitize 1.0.8 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of sanitize might be problematic. Click here for more details.
- data/HISTORY +7 -0
- data/README.rdoc +15 -4
- data/lib/sanitize.rb +42 -68
- data/lib/sanitize/config.rb +10 -6
- data/lib/sanitize/version.rb +3 -0
- metadata +33 -10
data/HISTORY
CHANGED
@@ -1,6 +1,13 @@
|
|
1
1
|
Sanitize History
|
2
2
|
================================================================================
|
3
3
|
|
4
|
+
Version 1.1.0 (2009-10-11)
|
5
|
+
* Migrated from Hpricot to Nokogiri. Requires libxml2 >= 2.7.2 [Adam Hooper]
|
6
|
+
* Added an :output config setting to allow the output format to be specified.
|
7
|
+
Supported formats are :xhtml (the default) and :html (which outputs HTML4).
|
8
|
+
* Changed protocol regex to ensure Sanitize doesn't kill URLs with colons in
|
9
|
+
path segments. [Peter Cooper]
|
10
|
+
|
4
11
|
Version 1.0.8 (2009-04-23)
|
5
12
|
* Added a workaround for an Hpricot bug that prevents attribute names from
|
6
13
|
being downcased in recent versions of Hpricot. This was exploitable to
|
data/README.rdoc
CHANGED
@@ -9,21 +9,31 @@ elements, certain attributes within those elements, and even certain URL
|
|
9
9
|
protocols within attributes that contain URLs. Any HTML elements or attributes
|
10
10
|
that you don't explicitly allow will be removed.
|
11
11
|
|
12
|
-
Because it's based on Hpricot, a full-fledged HTML parser, rather than a bunch
|
12
|
+
Because it's based on Nokogiri, a full-fledged HTML parser, rather than a bunch
|
13
13
|
of fragile regular expressions, Sanitize has no trouble dealing with malformed
|
14
14
|
or maliciously-formed HTML. When in doubt, Sanitize always errs on the side of
|
15
15
|
caution.
|
16
16
|
|
17
17
|
*Author*:: Ryan Grove (mailto:ryan@wonko.com)
|
18
|
-
*Version*:: 1.0
|
18
|
+
*Version*:: 1.1.0 (2009-10-11)
|
19
19
|
*Copyright*:: Copyright (c) 2009 Ryan Grove. All rights reserved.
|
20
20
|
*License*:: MIT License (http://opensource.org/licenses/mit-license.php)
|
21
21
|
*Website*:: http://github.com/rgrove/sanitize
|
22
22
|
|
23
23
|
== Requires
|
24
24
|
|
25
|
-
*
|
26
|
-
*
|
25
|
+
* Nokogiri
|
26
|
+
* libxml2 >= 2.7.2
|
27
|
+
|
28
|
+
== Installation
|
29
|
+
|
30
|
+
Latest stable release:
|
31
|
+
|
32
|
+
gem install sanitize
|
33
|
+
|
34
|
+
Latest development version:
|
35
|
+
|
36
|
+
gem install sanitize -s http://gemcutter.org --prerelease
|
27
37
|
|
28
38
|
== Usage
|
29
39
|
|
@@ -141,6 +151,7 @@ include the symbol <code>:relative</code> in the protocol array:
|
|
141
151
|
The following lovely people have contributed to Sanitize in the form of patches
|
142
152
|
or ideas that later became code:
|
143
153
|
|
154
|
+
* Peter Cooper <git@peterc.org>
|
144
155
|
* Ryan Grove <ryan@wonko.com>
|
145
156
|
* Adam Hooper <adam@adamhooper.com>
|
146
157
|
* Mutwin Kraus <mutle@blogage.de>
|
data/lib/sanitize.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
#--
|
2
3
|
# Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
|
3
4
|
#
|
@@ -20,15 +21,8 @@
|
|
20
21
|
# SOFTWARE.
|
21
22
|
#++
|
22
23
|
|
23
|
-
|
24
|
-
|
25
|
-
$:.uniq!
|
26
|
-
|
27
|
-
require 'rubygems'
|
28
|
-
|
29
|
-
gem 'hpricot', '~> 0.8.1'
|
30
|
-
|
31
|
-
require 'hpricot'
|
24
|
+
require 'nokogiri'
|
25
|
+
require 'sanitize/version'
|
32
26
|
require 'sanitize/config'
|
33
27
|
require 'sanitize/config/restricted'
|
34
28
|
require 'sanitize/config/basic'
|
@@ -36,24 +30,12 @@ require 'sanitize/config/relaxed'
|
|
36
30
|
|
37
31
|
class Sanitize
|
38
32
|
|
39
|
-
# Characters that should be replaced with entities in text nodes.
|
40
|
-
ENTITY_MAP = {
|
41
|
-
'<' => '&lt;',
|
42
|
-
'>' => '&gt;',
|
43
|
-
'"' => '&quot;',
|
44
|
-
"'" => '&#39;'
|
45
|
-
}
|
46
|
-
|
47
|
-
# Matches an unencoded ampersand that is not part of a valid character entity
|
48
|
-
# reference.
|
49
|
-
REGEX_AMPERSAND = /&(?!(?:[a-z]+[0-9]{0,2}|#[0-9]+|#x[0-9a-f]+);)/i
|
50
|
-
|
51
33
|
# Matches an attribute value that could be treated by a browser as a URL
|
52
34
|
# with a protocol prefix, such as "http:" or "javascript:". Any string of zero
|
53
35
|
# or more characters followed by a colon is considered a match, even if the
|
54
36
|
# colon is encoded as an entity and even if it's an incomplete entity (which
|
55
37
|
# IE6 and Opera will still parse).
|
56
|
-
REGEX_PROTOCOL = /^([
|
38
|
+
REGEX_PROTOCOL = /^([A-Za-z0-9\+\-\.\&\;\#\s]*?)(?:\:|&#0*58|&#x0*3a)/i
|
57
39
|
|
58
40
|
#--
|
59
41
|
# Instance Methods
|
@@ -73,78 +55,82 @@ class Sanitize
|
|
73
55
|
# Performs clean in place, returning _html_, or +nil+ if no changes were
|
74
56
|
# made.
|
75
57
|
def clean!(html)
|
76
|
-
fragment =
|
77
|
-
|
78
|
-
fragment.search('*') do |node|
|
79
|
-
if node.bogusetag? || node.doctype? || node.procins? || node.xmldecl?
|
80
|
-
node.parent.replace_child(node, '')
|
81
|
-
next
|
82
|
-
end
|
58
|
+
fragment = Nokogiri::HTML::DocumentFragment.parse(html)
|
83
59
|
|
60
|
+
fragment.traverse do |node|
|
84
61
|
if node.comment?
|
85
|
-
node.
|
86
|
-
elsif node.
|
62
|
+
node.unlink unless @config[:allow_comments]
|
63
|
+
elsif node.element?
|
87
64
|
name = node.name.to_s.downcase
|
88
65
|
|
89
66
|
# Delete any element that isn't in the whitelist.
|
90
67
|
unless @config[:elements].include?(name)
|
91
|
-
node.
|
68
|
+
node.children.each { |n| node.add_previous_sibling(n) }
|
69
|
+
node.unlink
|
92
70
|
next
|
93
71
|
end
|
94
72
|
|
95
|
-
node.raw_attributes ||= {}
|
96
|
-
|
97
73
|
attr_whitelist = ((@config[:attributes][name] || []) +
|
98
74
|
(@config[:attributes][:all] || [])).uniq
|
99
75
|
|
100
76
|
if attr_whitelist.empty?
|
101
77
|
# Delete all attributes from elements with no whitelisted
|
102
78
|
# attributes.
|
103
|
-
node.
|
79
|
+
node.attribute_nodes.each { |attr| attr.remove }
|
104
80
|
else
|
105
81
|
# Delete any attribute that isn't in the whitelist for this element.
|
106
|
-
node.
|
107
|
-
|
82
|
+
node.attribute_nodes.each do |attr|
|
83
|
+
attr.unlink unless attr_whitelist.include?(attr.name.downcase)
|
108
84
|
end
|
109
85
|
|
110
86
|
# Delete remaining attributes that use unacceptable protocols.
|
111
87
|
if @config[:protocols].has_key?(name)
|
112
88
|
protocol = @config[:protocols][name]
|
113
89
|
|
114
|
-
node.
|
115
|
-
|
116
|
-
next false unless protocol.has_key?(
|
117
|
-
next true if value.nil?
|
90
|
+
node.attribute_nodes.each do |attr|
|
91
|
+
attr_name = attr.name.downcase
|
92
|
+
next false unless protocol.has_key?(attr_name)
|
118
93
|
|
119
|
-
if value.to_s.downcase =~ REGEX_PROTOCOL
|
120
|
-
!protocol[
|
94
|
+
del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
|
95
|
+
!protocol[attr_name].include?($1.downcase)
|
121
96
|
else
|
122
|
-
!protocol[
|
97
|
+
!protocol[attr_name].include?(:relative)
|
123
98
|
end
|
99
|
+
|
100
|
+
attr.unlink if del
|
124
101
|
end
|
125
102
|
end
|
126
103
|
end
|
127
104
|
|
128
105
|
# Add required attributes.
|
129
106
|
if @config[:add_attributes].has_key?(name)
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
# Escape special chars in attribute values.
|
134
|
-
node.raw_attributes.each do |key, value|
|
135
|
-
node.raw_attributes[key] = Sanitize.encode_html(value)
|
107
|
+
@config[:add_attributes][name].each do |key, val|
|
108
|
+
node[key] = val
|
109
|
+
end
|
136
110
|
end
|
111
|
+
elsif node.cdata?
|
112
|
+
node.replace(Nokogiri::XML::Text.new(node.text, node.document))
|
137
113
|
end
|
138
114
|
end
|
139
115
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
116
|
+
if @config[:output] == :xhtml
|
117
|
+
output_method = fragment.method(:to_xhtml)
|
118
|
+
elsif @config[:output] == :html
|
119
|
+
output_method = fragment.method(:to_html)
|
120
|
+
else
|
121
|
+
raise Error, "unsupported output format: #{@config[:output]}"
|
122
|
+
end
|
123
|
+
|
124
|
+
if RUBY_VERSION >= '1.9'
|
125
|
+
# Nokogiri 1.3.3 (and possibly earlier versions) always returns a US-ASCII
|
126
|
+
# string no matter what we ask for. This will be fixed in 1.4.0, but for
|
127
|
+
# now we have to hack around it to prevent errors.
|
128
|
+
result = output_method.call(:encoding => 'utf-8', :indent => 0).force_encoding('utf-8')
|
129
|
+
result.gsub!(">\n", '>')
|
130
|
+
else
|
131
|
+
result = output_method.call(:encoding => 'utf-8', :indent => 0).gsub(">\n", '>')
|
145
132
|
end
|
146
133
|
|
147
|
-
result = fragment.to_s
|
148
134
|
return result == html ? nil : html[0, html.length] = result
|
149
135
|
end
|
150
136
|
|
@@ -166,18 +152,6 @@ class Sanitize
|
|
166
152
|
sanitize = Sanitize.new(config)
|
167
153
|
sanitize.clean!(html)
|
168
154
|
end
|
169
|
-
|
170
|
-
# Encodes special HTML characters (<, >, ", ', and &) in _html_ as entity
|
171
|
-
# references and returns the encoded string.
|
172
|
-
def encode_html(html)
|
173
|
-
str = html.dup
|
174
|
-
|
175
|
-
# Encode special chars.
|
176
|
-
ENTITY_MAP.each {|char, entity| str.gsub!(char, entity) }
|
177
|
-
|
178
|
-
# Convert unencoded ampersands to entity references.
|
179
|
-
str.gsub(REGEX_AMPERSAND, '&amp;')
|
180
|
-
end
|
181
155
|
end
|
182
156
|
|
183
157
|
end
|
data/lib/sanitize/config.rb
CHANGED
@@ -28,17 +28,21 @@ class Sanitize
|
|
28
28
|
# comments.
|
29
29
|
:allow_comments => false,
|
30
30
|
|
31
|
-
# HTML
|
32
|
-
#
|
33
|
-
:
|
31
|
+
# HTML attributes to add to specific elements. By default, no attributes
|
32
|
+
# are added.
|
33
|
+
:add_attributes => {},
|
34
34
|
|
35
35
|
# HTML attributes to allow in specific elements. By default, no attributes
|
36
36
|
# are allowed.
|
37
37
|
:attributes => {},
|
38
38
|
|
39
|
-
# HTML
|
40
|
-
#
|
41
|
-
:
|
39
|
+
# HTML elements to allow. By default, no elements are allowed (which means
|
40
|
+
# that all HTML will be stripped).
|
41
|
+
:elements => [],
|
42
|
+
|
43
|
+
# Output format. Supported formats are :html and :xhtml (which is the
|
44
|
+
# default).
|
45
|
+
:output => :xhtml,
|
42
46
|
|
43
47
|
# URL handling protocols to allow in specific attributes. By default, no
|
44
48
|
# protocols are allowed. Use :relative in place of a protocol if you want
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sanitize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Grove
|
@@ -9,18 +9,38 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-10-11 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
16
|
+
name: nokogiri
|
17
17
|
type: :runtime
|
18
18
|
version_requirement:
|
19
19
|
version_requirements: !ruby/object:Gem::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ~>
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
23
|
+
version: 1.3.3
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: bacon
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.1.0
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: rake
|
37
|
+
type: :development
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.8.0
|
24
44
|
version:
|
25
45
|
description:
|
26
46
|
email: ryan@wonko.com
|
@@ -34,13 +54,16 @@ files:
|
|
34
54
|
- HISTORY
|
35
55
|
- LICENSE
|
36
56
|
- README.rdoc
|
37
|
-
- lib/sanitize.rb
|
38
|
-
- lib/sanitize/config.rb
|
39
57
|
- lib/sanitize/config/basic.rb
|
40
58
|
- lib/sanitize/config/relaxed.rb
|
41
59
|
- lib/sanitize/config/restricted.rb
|
42
|
-
|
60
|
+
- lib/sanitize/config.rb
|
61
|
+
- lib/sanitize/version.rb
|
62
|
+
- lib/sanitize.rb
|
63
|
+
has_rdoc: true
|
43
64
|
homepage: http://github.com/rgrove/sanitize/
|
65
|
+
licenses: []
|
66
|
+
|
44
67
|
post_install_message:
|
45
68
|
rdoc_options: []
|
46
69
|
|
@@ -60,10 +83,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
60
83
|
version:
|
61
84
|
requirements: []
|
62
85
|
|
63
|
-
rubyforge_project:
|
64
|
-
rubygems_version: 1.
|
86
|
+
rubyforge_project: riposte
|
87
|
+
rubygems_version: 1.3.5
|
65
88
|
signing_key:
|
66
|
-
specification_version:
|
89
|
+
specification_version: 3
|
67
90
|
summary: Whitelist-based HTML sanitizer.
|
68
91
|
test_files: []
|
69
92
|
|