sanitize 1.2.1.dev.20100124 → 1.2.1.dev.20100329
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of sanitize might be problematic. Click here for more details.
- data/HISTORY +7 -4
- data/README.rdoc +11 -8
- data/lib/sanitize.rb +17 -26
- data/lib/sanitize/config.rb +6 -6
- data/lib/sanitize/version.rb +1 -1
- metadata +2 -2
data/HISTORY
CHANGED
@@ -2,11 +2,14 @@ Sanitize History
|
|
2
2
|
================================================================================
|
3
3
|
|
4
4
|
Version 1.2.1 (git)
|
5
|
-
* Added an :escape_only config setting. If set to true, Sanitize will escape
|
6
|
-
non-whitelisted elements and their contents instead of removing them.
|
7
5
|
* Added a :remove_contents config setting. If set to true, Sanitize will
|
8
|
-
remove the contents of non-whitelisted elements in addition to the
|
9
|
-
elements themselves.
|
6
|
+
remove the contents of all non-whitelisted elements in addition to the
|
7
|
+
elements themselves. If set to an Array of element names, Sanitize will
|
8
|
+
remove the contents of only those elements (when filtered), and leave the
|
9
|
+
contents of other filtered elements. [Thanks to Rafael Souza for the Array
|
10
|
+
option]
|
11
|
+
* Added an :output_encoding config setting to allow the character encoding for
|
12
|
+
HTML output to be specified. The default is 'utf-8'.
|
10
13
|
* The environment hash passed into transformers now includes a :node_name item
|
11
14
|
containing the lowercase name of the current HTML node (e.g. "div").
|
12
15
|
* Returning anything other than a Hash or nil from a transformer will now
|
data/README.rdoc
CHANGED
@@ -133,16 +133,15 @@ Array of element names to allow. Specify all names in lowercase.
|
|
133
133
|
'sup', 'u', 'ul'
|
134
134
|
]
|
135
135
|
|
136
|
-
==== :escape_only (boolean)
|
137
|
-
|
138
|
-
If set to +true+, Sanitize will escape non-whitelisted elements and their
|
139
|
-
contents rather than removing them.
|
140
|
-
|
141
136
|
==== :output (Symbol)
|
142
137
|
|
143
138
|
Output format. Supported formats are <code>:html</code> and <code>:xhtml</code>,
|
144
139
|
defaulting to <code>:xhtml</code>.
|
145
140
|
|
141
|
+
==== :output_encoding (String)
|
142
|
+
|
143
|
+
Character encoding to use for HTML output. Default is <code>'utf-8'</code>.
|
144
|
+
|
146
145
|
==== :protocols (Hash)
|
147
146
|
|
148
147
|
URL protocols to allow in specific attributes. If an attribute is listed here
|
@@ -161,14 +160,17 @@ include the symbol <code>:relative</code> in the protocol array:
|
|
161
160
|
'a' => {'href' => ['http', 'https', :relative]}
|
162
161
|
}
|
163
162
|
|
164
|
-
==== :remove_contents (boolean)
|
163
|
+
==== :remove_contents (boolean or Array)
|
165
164
|
|
166
165
|
If set to +true+, Sanitize will remove the contents of any non-whitelisted
|
167
166
|
elements in addition to the elements themselves. By default, Sanitize leaves the
|
168
167
|
safe parts of an element's contents behind when the element is removed.
|
169
168
|
|
170
|
-
If
|
171
|
-
|
169
|
+
If set to an Array of element names, then only the contents of the specified
|
170
|
+
elements (when filtered) will be removed, and the contents of all other filtered
|
171
|
+
elements will be left behind.
|
172
|
+
|
173
|
+
The default value is <code>false</code>.
|
172
174
|
|
173
175
|
==== :transformers
|
174
176
|
|
@@ -306,6 +308,7 @@ or ideas that later became code:
|
|
306
308
|
* Mutwin Kraus <mutle@blogage.de>
|
307
309
|
* Dev Purkayastha <dev.purkayastha@gmail.com>
|
308
310
|
* David Reese <work@whatcould.com>
|
311
|
+
* Rafael Souza <me@rafaelss.com>
|
309
312
|
* Ben Wanicur <bwanicur@verticalresponse.com>
|
310
313
|
|
311
314
|
== License
|
data/lib/sanitize.rb
CHANGED
@@ -70,17 +70,22 @@ class Sanitize
|
|
70
70
|
def initialize(config = {})
|
71
71
|
# Sanitize configuration.
|
72
72
|
@config = Config::DEFAULT.merge(config)
|
73
|
-
@config[:transformers] = Array(@config[:transformers])
|
74
|
-
|
75
|
-
# :remove_contents takes precedence over :escape_only.
|
76
|
-
if @config[:remove_contents] && @config[:escape_only]
|
77
|
-
@config[:escape_only] = false
|
78
|
-
end
|
73
|
+
@config[:transformers] = Array(@config[:transformers].dup)
|
79
74
|
|
80
75
|
# Convert the list of allowed elements to a Hash for faster lookup.
|
81
76
|
@allowed_elements = {}
|
82
77
|
@config[:elements].each {|el| @allowed_elements[el] = true }
|
83
78
|
|
79
|
+
# Convert the list of :remove_contents elements to a Hash for faster lookup.
|
80
|
+
@remove_all_contents = false
|
81
|
+
@remove_element_contents = {}
|
82
|
+
|
83
|
+
if @config[:remove_contents].is_a?(Array)
|
84
|
+
@config[:remove_contents].each {|el| @remove_element_contents[el] = true }
|
85
|
+
else
|
86
|
+
@remove_all_contents = !!@config[:remove_contents]
|
87
|
+
end
|
88
|
+
|
84
89
|
# Specific nodes to whitelist (along with all their attributes). This array
|
85
90
|
# is generated at runtime by transformers, and is cleared before and after
|
86
91
|
# a fragment is cleaned (so it applies only to a specific fragment).
|
@@ -99,7 +104,7 @@ class Sanitize
|
|
99
104
|
fragment = Nokogiri::HTML::DocumentFragment.parse(html)
|
100
105
|
clean_node!(fragment)
|
101
106
|
|
102
|
-
output_method_params = {:encoding => 'utf-8', :indent => 0}
|
107
|
+
output_method_params = {:encoding => @config[:output_encoding], :indent => 0}
|
103
108
|
|
104
109
|
if @config[:output] == :xhtml
|
105
110
|
output_method = fragment.method(:to_xhtml)
|
@@ -112,10 +117,6 @@ class Sanitize
|
|
112
117
|
|
113
118
|
result = output_method.call(output_method_params)
|
114
119
|
|
115
|
-
# Ensure that the result is always a UTF-8 string in Ruby 1.9, no matter
|
116
|
-
# what. Nokogiri seems to return empty strings as ASCII for some reason.
|
117
|
-
result.force_encoding('utf-8') if RUBY_VERSION >= '1.9'
|
118
|
-
|
119
120
|
return result == html ? nil : html[0, html.length] = result
|
120
121
|
end
|
121
122
|
|
@@ -129,13 +130,7 @@ class Sanitize
|
|
129
130
|
if child.element?
|
130
131
|
clean_element!(child)
|
131
132
|
elsif child.comment?
|
132
|
-
unless @config[:allow_comments]
|
133
|
-
if @config[:escape_only]
|
134
|
-
child.replace(Nokogiri::XML::Text.new(child.to_s, child.document))
|
135
|
-
else
|
136
|
-
child.unlink
|
137
|
-
end
|
138
|
-
end
|
133
|
+
child.unlink unless @config[:allow_comments]
|
139
134
|
elsif child.cdata?
|
140
135
|
child.replace(Nokogiri::XML::Text.new(child.text, child.document))
|
141
136
|
end
|
@@ -160,16 +155,12 @@ class Sanitize
|
|
160
155
|
|
161
156
|
# Delete any element that isn't in the whitelist.
|
162
157
|
unless transform[:whitelist] || @allowed_elements[name]
|
163
|
-
if @config[:escape_only]
|
164
|
-
node.replace(Nokogiri::XML::Text.new(node.to_s, node.document))
|
165
|
-
else
|
166
|
-
unless @config[:remove_contents]
|
167
|
-
node.children.each { |n| node.add_previous_sibling(n) }
|
168
|
-
end
|
169
|
-
|
170
|
-
node.unlink
|
158
|
+
unless @remove_all_contents || @remove_element_contents[name]
|
159
|
+
node.children.each { |n| node.add_previous_sibling(n) }
|
171
160
|
end
|
172
161
|
|
162
|
+
node.unlink
|
163
|
+
|
173
164
|
return
|
174
165
|
end
|
175
166
|
|
data/lib/sanitize/config.rb
CHANGED
@@ -40,14 +40,13 @@ class Sanitize
|
|
40
40
|
# that all HTML will be stripped).
|
41
41
|
:elements => [],
|
42
42
|
|
43
|
-
# If this is true, Sanitize will escape non-whitelisted elements and their
|
44
|
-
# contents rather than removing them.
|
45
|
-
:escape_only => false,
|
46
|
-
|
47
43
|
# Output format. Supported formats are :html and :xhtml (which is the
|
48
44
|
# default).
|
49
45
|
:output => :xhtml,
|
50
46
|
|
47
|
+
# Character encoding to use for HTML output. Default is 'utf-8'.
|
48
|
+
:output_encoding => 'utf-8',
|
49
|
+
|
51
50
|
# URL handling protocols to allow in specific attributes. By default, no
|
52
51
|
# protocols are allowed. Use :relative in place of a protocol if you want
|
53
52
|
# to allow relative URLs sans protocol.
|
@@ -58,8 +57,9 @@ class Sanitize
|
|
58
57
|
# leaves the safe parts of an element's contents behind when the element
|
59
58
|
# is removed.
|
60
59
|
#
|
61
|
-
# If
|
62
|
-
# will
|
60
|
+
# If this is an Array of element names, then only the contents of the
|
61
|
+
# specified elements (when filtered) will be removed, and the contents of
|
62
|
+
# all other filtered elements will be left behind.
|
63
63
|
:remove_contents => false,
|
64
64
|
|
65
65
|
# Transformers allow you to filter or alter nodes using custom logic. See
|
data/lib/sanitize/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sanitize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.1.dev.20100124
|
4
|
+
version: 1.2.1.dev.20100329
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Grove
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-01-24 00:00:00 -08:00
|
12
|
+
date: 2010-03-29 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|