sanitize 1.0.5 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HISTORY +10 -0
- data/README.rdoc +21 -3
- data/lib/sanitize.rb +66 -37
- metadata +2 -13
- data/lib/sanitize/monkeypatch/hpricot.rb +0 -33
data/HISTORY
CHANGED
@@ -1,6 +1,16 @@
|
|
1
1
|
Sanitize History
|
2
2
|
================================================================================
|
3
3
|
|
4
|
+
Version 1.0.6 (2009-02-23)
|
5
|
+
* Removed htmlentities gem dependency.
|
6
|
+
* Existing well-formed character entity references in the input string are now
|
7
|
+
preserved rather than being decoded and re-encoded.
|
8
|
+
* The ' character is now encoded as &#39; instead of &apos; to prevent
|
9
|
+
problems in IE6.
|
10
|
+
* You can now specify the symbol :all in place of an element name in the
|
11
|
+
attributes config hash to allow certain attributes on all elements. [Thanks
|
12
|
+
to Mutwin Kraus]
|
13
|
+
|
4
14
|
Version 1.0.5 (2009-02-05)
|
5
15
|
* Fixed a bug introduced in version 1.0.3 that prevented non-whitelisted
|
6
16
|
protocols from being cleaned when relative URLs were allowed. [Reported by
|
data/README.rdoc
CHANGED
@@ -15,7 +15,7 @@ or maliciously-formed HTML. When in doubt, Sanitize always errs on the side of
|
|
15
15
|
caution.
|
16
16
|
|
17
17
|
*Author*:: Ryan Grove (mailto:ryan@wonko.com)
|
18
|
-
*Version*:: 1.0.5 (2009-02-05)
|
18
|
+
*Version*:: 1.0.6 (2009-02-23)
|
19
19
|
*Copyright*:: Copyright (c) 2009 Ryan Grove. All rights reserved.
|
20
20
|
*License*:: MIT License (http://opensource.org/licenses/mit-license.php)
|
21
21
|
*Website*:: http://github.com/rgrove/sanitize
|
@@ -24,7 +24,6 @@ caution.
|
|
24
24
|
|
25
25
|
* RubyGems
|
26
26
|
* Hpricot 0.6+
|
27
|
-
* HTMLEntities 4.0.0+
|
28
27
|
|
29
28
|
== Usage
|
30
29
|
|
@@ -100,6 +99,14 @@ attributes in lowercase.
|
|
100
99
|
'img' => ['alt', 'src', 'title']
|
101
100
|
}
|
102
101
|
|
102
|
+
If you'd like to allow certain attributes on all elements, use the symbol
|
103
|
+
<code>:all</code> instead of an element name.
|
104
|
+
|
105
|
+
:attributes => {
|
106
|
+
:all => ['class'],
|
107
|
+
'a' => ['href', 'title']
|
108
|
+
}
|
109
|
+
|
103
110
|
==== :add_attributes
|
104
111
|
|
105
112
|
Attributes to add to specific elements. If the attribute already exists, it will
|
@@ -122,12 +129,23 @@ protocol at all), it will be removed.
|
|
122
129
|
}
|
123
130
|
|
124
131
|
If you'd like to allow the use of relative URLs which don't have a protocol,
|
125
|
-
include the
|
132
|
+
include the symbol <code>:relative</code> in the protocol array:
|
126
133
|
|
127
134
|
:protocols => {
|
128
135
|
'a' => {'href' => ['http', 'https', :relative]}
|
129
136
|
}
|
130
137
|
|
138
|
+
|
139
|
+
== Contributors
|
140
|
+
|
141
|
+
The following lovely people have contributed to Sanitize in the form of patches
|
142
|
+
or ideas that later became code:
|
143
|
+
|
144
|
+
* Ryan Grove <ryan@wonko.com>
|
145
|
+
* Adam Hooper <adam@adamhooper.com>
|
146
|
+
* Mutwin Kraus <mutle@blogage.de>
|
147
|
+
* Dev Purkayastha <dev.purkayastha@gmail.com>
|
148
|
+
|
131
149
|
== License
|
132
150
|
|
133
151
|
Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
|
data/lib/sanitize.rb
CHANGED
@@ -26,19 +26,28 @@ $:.uniq!
|
|
26
26
|
|
27
27
|
require 'rubygems'
|
28
28
|
|
29
|
-
gem 'hpricot',
|
30
|
-
gem 'htmlentities', '~> 4.0.0'
|
29
|
+
gem 'hpricot', '~> 0.6'
|
31
30
|
|
32
31
|
require 'hpricot'
|
33
|
-
require 'htmlentities'
|
34
32
|
require 'sanitize/config'
|
35
33
|
require 'sanitize/config/restricted'
|
36
34
|
require 'sanitize/config/basic'
|
37
35
|
require 'sanitize/config/relaxed'
|
38
|
-
require 'sanitize/monkeypatch/hpricot'
|
39
36
|
|
40
37
|
class Sanitize
|
41
38
|
|
39
|
+
# Characters that should be replaced with entities in text nodes.
|
40
|
+
ENTITY_MAP = {
|
41
|
+
'<' => '&lt;',
|
42
|
+
'>' => '&gt;',
|
43
|
+
'"' => '&quot;',
|
44
|
+
"'" => '&#39;'
|
45
|
+
}
|
46
|
+
|
47
|
+
# Matches an unencoded ampersand that is not part of a valid character entity
|
48
|
+
# reference.
|
49
|
+
REGEX_AMPERSAND = /&(?!(?:[a-z]+|#[0-9]+|#x[0-9a-f]+);)/i
|
50
|
+
|
42
51
|
# Matches an attribute value that could be treated by a browser as a URL
|
43
52
|
# with a protocol prefix, such as "http:" or "javascript:". Any string of zero
|
44
53
|
# or more characters followed by a colon is considered a match, even if the
|
@@ -46,24 +55,6 @@ class Sanitize
|
|
46
55
|
# IE6 and Opera will still parse).
|
47
56
|
REGEX_PROTOCOL = /^([^:]*)(?:\:|&#0*58|&#x0*3a)/i
|
48
57
|
|
49
|
-
#--
|
50
|
-
# Class Methods
|
51
|
-
#++
|
52
|
-
|
53
|
-
# Returns a sanitized copy of _html_, using the settings in _config_ if
|
54
|
-
# specified.
|
55
|
-
def self.clean(html, config = {})
|
56
|
-
sanitize = Sanitize.new(config)
|
57
|
-
sanitize.clean(html)
|
58
|
-
end
|
59
|
-
|
60
|
-
# Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
|
61
|
-
# were made.
|
62
|
-
def self.clean!(html, config = {})
|
63
|
-
sanitize = Sanitize.new(config)
|
64
|
-
sanitize.clean!(html)
|
65
|
-
end
|
66
|
-
|
67
58
|
#--
|
68
59
|
# Instance Methods
|
69
60
|
#++
|
@@ -101,10 +92,19 @@ class Sanitize
|
|
101
92
|
next
|
102
93
|
end
|
103
94
|
|
104
|
-
|
95
|
+
node.raw_attributes ||= {}
|
96
|
+
|
97
|
+
attr_whitelist = ((@config[:attributes][name] || []) +
|
98
|
+
(@config[:attributes][:all] || [])).uniq
|
99
|
+
|
100
|
+
if attr_whitelist.empty?
|
101
|
+
# Delete all attributes from elements with no whitelisted
|
102
|
+
# attributes.
|
103
|
+
node.raw_attributes = {}
|
104
|
+
else
|
105
105
|
# Delete any attribute that isn't in the whitelist for this element.
|
106
106
|
node.raw_attributes.delete_if do |key, value|
|
107
|
-
|
107
|
+
!attr_whitelist.include?(key.to_s.downcase)
|
108
108
|
end
|
109
109
|
|
110
110
|
# Delete remaining attributes that use unacceptable protocols.
|
@@ -122,32 +122,61 @@ class Sanitize
|
|
122
122
|
end
|
123
123
|
end
|
124
124
|
end
|
125
|
-
else
|
126
|
-
# Delete all attributes from elements with no whitelisted
|
127
|
-
# attributes.
|
128
|
-
node.raw_attributes = {}
|
129
125
|
end
|
130
126
|
|
131
127
|
# Add required attributes.
|
132
128
|
if @config[:add_attributes].has_key?(name)
|
133
129
|
node.raw_attributes.merge!(@config[:add_attributes][name])
|
134
130
|
end
|
131
|
+
|
132
|
+
# Escape special chars in attribute values.
|
133
|
+
node.raw_attributes.each do |key, value|
|
134
|
+
node.raw_attributes[key] = Sanitize.encode_html(value)
|
135
|
+
end
|
135
136
|
end
|
136
137
|
end
|
137
138
|
|
138
139
|
# Make one last pass through the fragment and encode all special HTML chars
|
139
|
-
#
|
140
|
-
#
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
fragment.traverse_element do |node|
|
145
|
-
if node.text?
|
146
|
-
node.swap(coder.encode(node.inner_text, :named))
|
147
|
-
end
|
140
|
+
# as entities. This eliminates certain types of maliciously-malformed nested
|
141
|
+
# tags.
|
142
|
+
fragment.search('*') do |node|
|
143
|
+
node.swap(Sanitize.encode_html(node.to_original_html)) if node.text?
|
148
144
|
end
|
149
145
|
|
150
146
|
result = fragment.to_s
|
151
147
|
return result == html ? nil : html[0, html.length] = result
|
152
148
|
end
|
149
|
+
|
150
|
+
#--
|
151
|
+
# Class Methods
|
152
|
+
#++
|
153
|
+
|
154
|
+
class << self
|
155
|
+
# Returns a sanitized copy of _html_, using the settings in _config_ if
|
156
|
+
# specified.
|
157
|
+
def clean(html, config = {})
|
158
|
+
sanitize = Sanitize.new(config)
|
159
|
+
sanitize.clean(html)
|
160
|
+
end
|
161
|
+
|
162
|
+
# Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
|
163
|
+
# were made.
|
164
|
+
def clean!(html, config = {})
|
165
|
+
sanitize = Sanitize.new(config)
|
166
|
+
sanitize.clean!(html)
|
167
|
+
end
|
168
|
+
|
169
|
+
# Encodes special HTML characters (<, >, ", ', and &) in _html_ as entity
|
170
|
+
# references and returns the encoded string.
|
171
|
+
def encode_html(html)
|
172
|
+
str = html.dup
|
173
|
+
|
174
|
+
# Encode special chars.
|
175
|
+
ENTITY_MAP.each {|char, entity| str.gsub!(char, entity) }
|
176
|
+
|
177
|
+
# Convert unencoded ampersands to entity references.
|
178
|
+
str.gsub(REGEX_AMPERSAND, '&amp;')
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
153
182
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sanitize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.5
|
4
|
+
version: 1.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Grove
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-02-
|
12
|
+
date: 2009-02-23 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -22,16 +22,6 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: "0.6"
|
24
24
|
version:
|
25
|
-
- !ruby/object:Gem::Dependency
|
26
|
-
name: htmlentities
|
27
|
-
type: :runtime
|
28
|
-
version_requirement:
|
29
|
-
version_requirements: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ~>
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: 4.0.0
|
34
|
-
version:
|
35
25
|
description:
|
36
26
|
email: ryan@wonko.com
|
37
27
|
executables: []
|
@@ -49,7 +39,6 @@ files:
|
|
49
39
|
- lib/sanitize/config/basic.rb
|
50
40
|
- lib/sanitize/config/relaxed.rb
|
51
41
|
- lib/sanitize/config/restricted.rb
|
52
|
-
- lib/sanitize/monkeypatch/hpricot.rb
|
53
42
|
has_rdoc: false
|
54
43
|
homepage: http://github.com/rgrove/sanitize/
|
55
44
|
post_install_message:
|
@@ -1,33 +0,0 @@
|
|
1
|
-
#--
|
2
|
-
# Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
|
3
|
-
#
|
4
|
-
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
5
|
-
# of this software and associated documentation files (the 'Software'), to deal
|
6
|
-
# in the Software without restriction, including without limitation the rights
|
7
|
-
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
8
|
-
# copies of the Software, and to permit persons to whom the Software is
|
9
|
-
# furnished to do so, subject to the following conditions:
|
10
|
-
#
|
11
|
-
# The above copyright notice and this permission notice shall be included in all
|
12
|
-
# copies or substantial portions of the Software.
|
13
|
-
#
|
14
|
-
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
15
|
-
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
16
|
-
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
17
|
-
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
18
|
-
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
19
|
-
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
20
|
-
# SOFTWARE.
|
21
|
-
#++
|
22
|
-
|
23
|
-
module Hpricot
|
24
|
-
|
25
|
-
# Monkeypatch to fix an Hpricot bug that causes HTML entities to be decoded
|
26
|
-
# incorrectly.
|
27
|
-
def self.uxs(str)
|
28
|
-
str.to_s.
|
29
|
-
gsub(/&(\w+);/) { [Hpricot::NamedCharacters[$1] || ??].pack("U*") }.
|
30
|
-
gsub(/\&\#(\d+);/) { [$1.to_i].pack("U*") }
|
31
|
-
end
|
32
|
-
|
33
|
-
end
|