sanitize 1.0.5 → 1.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/HISTORY +10 -0
- data/README.rdoc +21 -3
- data/lib/sanitize.rb +66 -37
- metadata +2 -13
- data/lib/sanitize/monkeypatch/hpricot.rb +0 -33
data/HISTORY
CHANGED
@@ -1,6 +1,16 @@
|
|
1
1
|
Sanitize History
|
2
2
|
================================================================================
|
3
3
|
|
4
|
+
Version 1.0.6 (2009-02-23)
|
5
|
+
* Removed htmlentities gem dependency.
|
6
|
+
* Existing well-formed character entity references in the input string are now
|
7
|
+
preserved rather than being decoded and re-encoded.
|
8
|
+
* The ' character is now encoded as &#39; instead of &apos; to prevent
|
9
|
+
problems in IE6.
|
10
|
+
* You can now specify the symbol :all in place of an element name in the
|
11
|
+
attributes config hash to allow certain attributes on all elements. [Thanks
|
12
|
+
to Mutwin Kraus]
|
13
|
+
|
4
14
|
Version 1.0.5 (2009-02-05)
|
5
15
|
* Fixed a bug introduced in version 1.0.3 that prevented non-whitelisted
|
6
16
|
protocols from being cleaned when relative URLs were allowed. [Reported by
|
data/README.rdoc
CHANGED
@@ -15,7 +15,7 @@ or maliciously-formed HTML. When in doubt, Sanitize always errs on the side of
|
|
15
15
|
caution.
|
16
16
|
|
17
17
|
*Author*:: Ryan Grove (mailto:ryan@wonko.com)
|
18
|
-
*Version*:: 1.0.5 (2009-02-05)
|
18
|
+
*Version*:: 1.0.6 (2009-02-23)
|
19
19
|
*Copyright*:: Copyright (c) 2009 Ryan Grove. All rights reserved.
|
20
20
|
*License*:: MIT License (http://opensource.org/licenses/mit-license.php)
|
21
21
|
*Website*:: http://github.com/rgrove/sanitize
|
@@ -24,7 +24,6 @@ caution.
|
|
24
24
|
|
25
25
|
* RubyGems
|
26
26
|
* Hpricot 0.6+
|
27
|
-
* HTMLEntities 4.0.0+
|
28
27
|
|
29
28
|
== Usage
|
30
29
|
|
@@ -100,6 +99,14 @@ attributes in lowercase.
|
|
100
99
|
'img' => ['alt', 'src', 'title']
|
101
100
|
}
|
102
101
|
|
102
|
+
If you'd like to allow certain attributes on all elements, use the symbol
|
103
|
+
<code>:all</code> instead of an element name.
|
104
|
+
|
105
|
+
:attributes => {
|
106
|
+
:all => ['class'],
|
107
|
+
'a' => ['href', 'title']
|
108
|
+
}
|
109
|
+
|
103
110
|
==== :add_attributes
|
104
111
|
|
105
112
|
Attributes to add to specific elements. If the attribute already exists, it will
|
@@ -122,12 +129,23 @@ protocol at all), it will be removed.
|
|
122
129
|
}
|
123
130
|
|
124
131
|
If you'd like to allow the use of relative URLs which don't have a protocol,
|
125
|
-
include the
|
132
|
+
include the symbol <code>:relative</code> in the protocol array:
|
126
133
|
|
127
134
|
:protocols => {
|
128
135
|
'a' => {'href' => ['http', 'https', :relative]}
|
129
136
|
}
|
130
137
|
|
138
|
+
|
139
|
+
== Contributors
|
140
|
+
|
141
|
+
The following lovely people have contributed to Sanitize in the form of patches
|
142
|
+
or ideas that later became code:
|
143
|
+
|
144
|
+
* Ryan Grove <ryan@wonko.com>
|
145
|
+
* Adam Hooper <adam@adamhooper.com>
|
146
|
+
* Mutwin Kraus <mutle@blogage.de>
|
147
|
+
* Dev Purkayastha <dev.purkayastha@gmail.com>
|
148
|
+
|
131
149
|
== License
|
132
150
|
|
133
151
|
Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
|
data/lib/sanitize.rb
CHANGED
@@ -26,19 +26,28 @@ $:.uniq!
|
|
26
26
|
|
27
27
|
require 'rubygems'
|
28
28
|
|
29
|
-
gem 'hpricot',
|
30
|
-
gem 'htmlentities', '~> 4.0.0'
|
29
|
+
gem 'hpricot', '~> 0.6'
|
31
30
|
|
32
31
|
require 'hpricot'
|
33
|
-
require 'htmlentities'
|
34
32
|
require 'sanitize/config'
|
35
33
|
require 'sanitize/config/restricted'
|
36
34
|
require 'sanitize/config/basic'
|
37
35
|
require 'sanitize/config/relaxed'
|
38
|
-
require 'sanitize/monkeypatch/hpricot'
|
39
36
|
|
40
37
|
class Sanitize
|
41
38
|
|
39
|
+
# Characters that should be replaced with entities in text nodes.
|
40
|
+
ENTITY_MAP = {
|
41
|
+
'<' => '&lt;',
|
42
|
+
'>' => '&gt;',
|
43
|
+
'"' => '&quot;',
|
44
|
+
"'" => '&#39;'
|
45
|
+
}
|
46
|
+
|
47
|
+
# Matches an unencoded ampersand that is not part of a valid character entity
|
48
|
+
# reference.
|
49
|
+
REGEX_AMPERSAND = /&(?!(?:[a-z]+|#[0-9]+|#x[0-9a-f]+);)/i
|
50
|
+
|
42
51
|
# Matches an attribute value that could be treated by a browser as a URL
|
43
52
|
# with a protocol prefix, such as "http:" or "javascript:". Any string of zero
|
44
53
|
# or more characters followed by a colon is considered a match, even if the
|
@@ -46,24 +55,6 @@ class Sanitize
|
|
46
55
|
# IE6 and Opera will still parse).
|
47
56
|
REGEX_PROTOCOL = /^([^:]*)(?:\:|&#0*58|&#x0*3a)/i
|
48
57
|
|
49
|
-
#--
|
50
|
-
# Class Methods
|
51
|
-
#++
|
52
|
-
|
53
|
-
# Returns a sanitized copy of _html_, using the settings in _config_ if
|
54
|
-
# specified.
|
55
|
-
def self.clean(html, config = {})
|
56
|
-
sanitize = Sanitize.new(config)
|
57
|
-
sanitize.clean(html)
|
58
|
-
end
|
59
|
-
|
60
|
-
# Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
|
61
|
-
# were made.
|
62
|
-
def self.clean!(html, config = {})
|
63
|
-
sanitize = Sanitize.new(config)
|
64
|
-
sanitize.clean!(html)
|
65
|
-
end
|
66
|
-
|
67
58
|
#--
|
68
59
|
# Instance Methods
|
69
60
|
#++
|
@@ -101,10 +92,19 @@ class Sanitize
|
|
101
92
|
next
|
102
93
|
end
|
103
94
|
|
104
|
-
|
95
|
+
node.raw_attributes ||= {}
|
96
|
+
|
97
|
+
attr_whitelist = ((@config[:attributes][name] || []) +
|
98
|
+
(@config[:attributes][:all] || [])).uniq
|
99
|
+
|
100
|
+
if attr_whitelist.empty?
|
101
|
+
# Delete all attributes from elements with no whitelisted
|
102
|
+
# attributes.
|
103
|
+
node.raw_attributes = {}
|
104
|
+
else
|
105
105
|
# Delete any attribute that isn't in the whitelist for this element.
|
106
106
|
node.raw_attributes.delete_if do |key, value|
|
107
|
-
|
107
|
+
!attr_whitelist.include?(key.to_s.downcase)
|
108
108
|
end
|
109
109
|
|
110
110
|
# Delete remaining attributes that use unacceptable protocols.
|
@@ -122,32 +122,61 @@ class Sanitize
|
|
122
122
|
end
|
123
123
|
end
|
124
124
|
end
|
125
|
-
else
|
126
|
-
# Delete all attributes from elements with no whitelisted
|
127
|
-
# attributes.
|
128
|
-
node.raw_attributes = {}
|
129
125
|
end
|
130
126
|
|
131
127
|
# Add required attributes.
|
132
128
|
if @config[:add_attributes].has_key?(name)
|
133
129
|
node.raw_attributes.merge!(@config[:add_attributes][name])
|
134
130
|
end
|
131
|
+
|
132
|
+
# Escape special chars in attribute values.
|
133
|
+
node.raw_attributes.each do |key, value|
|
134
|
+
node.raw_attributes[key] = Sanitize.encode_html(value)
|
135
|
+
end
|
135
136
|
end
|
136
137
|
end
|
137
138
|
|
138
139
|
# Make one last pass through the fragment and encode all special HTML chars
|
139
|
-
#
|
140
|
-
#
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
fragment.traverse_element do |node|
|
145
|
-
if node.text?
|
146
|
-
node.swap(coder.encode(node.inner_text, :named))
|
147
|
-
end
|
140
|
+
# as entities. This eliminates certain types of maliciously-malformed nested
|
141
|
+
# tags.
|
142
|
+
fragment.search('*') do |node|
|
143
|
+
node.swap(Sanitize.encode_html(node.to_original_html)) if node.text?
|
148
144
|
end
|
149
145
|
|
150
146
|
result = fragment.to_s
|
151
147
|
return result == html ? nil : html[0, html.length] = result
|
152
148
|
end
|
149
|
+
|
150
|
+
#--
|
151
|
+
# Class Methods
|
152
|
+
#++
|
153
|
+
|
154
|
+
class << self
|
155
|
+
# Returns a sanitized copy of _html_, using the settings in _config_ if
|
156
|
+
# specified.
|
157
|
+
def clean(html, config = {})
|
158
|
+
sanitize = Sanitize.new(config)
|
159
|
+
sanitize.clean(html)
|
160
|
+
end
|
161
|
+
|
162
|
+
# Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
|
163
|
+
# were made.
|
164
|
+
def clean!(html, config = {})
|
165
|
+
sanitize = Sanitize.new(config)
|
166
|
+
sanitize.clean!(html)
|
167
|
+
end
|
168
|
+
|
169
|
+
# Encodes special HTML characters (<, >, ", ', and &) in _html_ as entity
|
170
|
+
# references and returns the encoded string.
|
171
|
+
def encode_html(html)
|
172
|
+
str = html.dup
|
173
|
+
|
174
|
+
# Encode special chars.
|
175
|
+
ENTITY_MAP.each {|char, entity| str.gsub!(char, entity) }
|
176
|
+
|
177
|
+
# Convert unencoded ampersands to entity references.
|
178
|
+
str.gsub(REGEX_AMPERSAND, '&amp;')
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
153
182
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sanitize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.5
|
4
|
+
version: 1.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Grove
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-02-05 00:00:00 -08:00
|
12
|
+
date: 2009-02-23 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -22,16 +22,6 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: "0.6"
|
24
24
|
version:
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: htmlentities
|
27
|
-
type: :runtime
|
28
|
-
version_requirement:
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ~>
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: 4.0.0
|
34
|
-
version:
|
35
25
|
description:
|
36
26
|
email: ryan@wonko.com
|
37
27
|
executables: []
|
@@ -49,7 +39,6 @@ files:
|
|
49
39
|
- lib/sanitize/config/basic.rb
|
50
40
|
- lib/sanitize/config/relaxed.rb
|
51
41
|
- lib/sanitize/config/restricted.rb
|
52
|
-
- lib/sanitize/monkeypatch/hpricot.rb
|
53
42
|
has_rdoc: false
|
54
43
|
homepage: http://github.com/rgrove/sanitize/
|
55
44
|
post_install_message:
|
@@ -1,33 +0,0 @@
|
|
1
|
-
#--
|
2
|
-
# Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
|
3
|
-
#
|
4
|
-
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
5
|
-
# of this software and associated documentation files (the 'Software'), to deal
|
6
|
-
# in the Software without restriction, including without limitation the rights
|
7
|
-
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
-
# copies of the Software, and to permit persons to whom the Software is
|
9
|
-
# furnished to do so, subject to the following conditions:
|
10
|
-
#
|
11
|
-
# The above copyright notice and this permission notice shall be included in all
|
12
|
-
# copies or substantial portions of the Software.
|
13
|
-
#
|
14
|
-
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
15
|
-
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
16
|
-
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
17
|
-
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
18
|
-
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
19
|
-
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
20
|
-
# SOFTWARE.
|
21
|
-
#++
|
22
|
-
|
23
|
-
module Hpricot
|
24
|
-
|
25
|
-
# Monkeypatch to fix an Hpricot bug that causes HTML entities to be decoded
|
26
|
-
# incorrectly.
|
27
|
-
def self.uxs(str)
|
28
|
-
str.to_s.
|
29
|
-
gsub(/&(\w+);/) { [Hpricot::NamedCharacters[$1] || ??].pack("U*") }.
|
30
|
-
gsub(/\&\#(\d+);/) { [$1.to_i].pack("U*") }
|
31
|
-
end
|
32
|
-
|
33
|
-
end
|