rgrove-sanitize 1.0.8 → 1.0.8.2
Sign up to get free protection for your applications and to get access to all the features.
- data/HISTORY +5 -0
- data/README.rdoc +5 -3
- data/lib/sanitize.rb +26 -39
- metadata +4 -4
data/HISTORY
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
Sanitize History
|
2
2
|
================================================================================
|
3
3
|
|
4
|
+
Version 1.0.8.2 (git)
|
5
|
+
* Migrated from Hpricot to Nokogiri. Requires libxml2 >= 2.7.2 [Adam Hooper]
|
6
|
+
* Changed protocol regex to ensure Sanitize doesn't kill URLs with colons in
|
7
|
+
path segments. [Peter Cooper]
|
8
|
+
|
4
9
|
Version 1.0.8 (2009-04-23)
|
5
10
|
* Added a workaround for an Hpricot bug that prevents attribute names from
|
6
11
|
being downcased in recent versions of Hpricot. This was exploitable to
|
data/README.rdoc
CHANGED
@@ -9,13 +9,13 @@ elements, certain attributes within those elements, and even certain URL
|
|
9
9
|
protocols within attributes that contain URLs. Any HTML elements or attributes
|
10
10
|
that you don't explicitly allow will be removed.
|
11
11
|
|
12
|
-
Because it's based on Hpricot, a full-fledged HTML parser, rather than a bunch
|
12
|
+
Because it's based on Nokogiri, a full-fledged HTML parser, rather than a bunch
|
13
13
|
of fragile regular expressions, Sanitize has no trouble dealing with malformed
|
14
14
|
or maliciously-formed HTML. When in doubt, Sanitize always errs on the side of
|
15
15
|
caution.
|
16
16
|
|
17
17
|
*Author*:: Ryan Grove (mailto:ryan@wonko.com)
|
18
|
-
*Version*:: 1.0.8 (2009-04-23)
|
18
|
+
*Version*:: 1.0.8.2 (git)
|
19
19
|
*Copyright*:: Copyright (c) 2009 Ryan Grove. All rights reserved.
|
20
20
|
*License*:: MIT License (http://opensource.org/licenses/mit-license.php)
|
21
21
|
*Website*:: http://github.com/rgrove/sanitize
|
@@ -23,7 +23,8 @@ caution.
|
|
23
23
|
== Requires
|
24
24
|
|
25
25
|
* RubyGems
|
26
|
-
*
|
26
|
+
* Nokogiri
|
27
|
+
* libxml2 >= 2.7.2
|
27
28
|
|
28
29
|
== Usage
|
29
30
|
|
@@ -141,6 +142,7 @@ include the symbol <code>:relative</code> in the protocol array:
|
|
141
142
|
The following lovely people have contributed to Sanitize in the form of patches
|
142
143
|
or ideas that later became code:
|
143
144
|
|
145
|
+
* Peter Cooper <git@peterc.org>
|
144
146
|
* Ryan Grove <ryan@wonko.com>
|
145
147
|
* Adam Hooper <adam@adamhooper.com>
|
146
148
|
* Mutwin Kraus <mutle@blogage.de>
|
data/lib/sanitize.rb
CHANGED
@@ -26,9 +26,9 @@ $:.uniq!
|
|
26
26
|
|
27
27
|
require 'rubygems'
|
28
28
|
|
29
|
-
gem '
|
29
|
+
gem 'nokogiri', '~> 1.3.3'
|
30
30
|
|
31
|
-
require '
|
31
|
+
require 'nokogiri'
|
32
32
|
require 'sanitize/config'
|
33
33
|
require 'sanitize/config/restricted'
|
34
34
|
require 'sanitize/config/basic'
|
@@ -53,7 +53,7 @@ class Sanitize
|
|
53
53
|
# or more characters followed by a colon is considered a match, even if the
|
54
54
|
# colon is encoded as an entity and even if it's an incomplete entity (which
|
55
55
|
# IE6 and Opera will still parse).
|
56
|
-
REGEX_PROTOCOL = /^([
|
56
|
+
REGEX_PROTOCOL = /^([A-Za-z0-9\+\-\.\&\;\#\s]*?)(?:\:|&#0*58|&#x0*3a)/i
|
57
57
|
|
58
58
|
#--
|
59
59
|
# Instance Methods
|
@@ -73,78 +73,65 @@ class Sanitize
|
|
73
73
|
# Performs clean in place, returning _html_, or +nil+ if no changes were
|
74
74
|
# made.
|
75
75
|
def clean!(html)
|
76
|
-
fragment =
|
77
|
-
|
78
|
-
fragment.search('*') do |node|
|
79
|
-
if node.bogusetag? || node.doctype? || node.procins? || node.xmldecl?
|
80
|
-
node.parent.replace_child(node, '')
|
81
|
-
next
|
82
|
-
end
|
76
|
+
fragment = Nokogiri::HTML::DocumentFragment.parse(html)
|
83
77
|
|
78
|
+
fragment.traverse do |node|
|
84
79
|
if node.comment?
|
85
|
-
node.
|
86
|
-
elsif node.
|
80
|
+
node.unlink unless @config[:allow_comments]
|
81
|
+
elsif node.element?
|
87
82
|
name = node.name.to_s.downcase
|
88
83
|
|
89
84
|
# Delete any element that isn't in the whitelist.
|
90
85
|
unless @config[:elements].include?(name)
|
91
|
-
node.
|
86
|
+
node.children.each { |n| node.add_previous_sibling(n) }
|
87
|
+
node.unlink
|
92
88
|
next
|
93
89
|
end
|
94
90
|
|
95
|
-
node.raw_attributes ||= {}
|
96
|
-
|
97
91
|
attr_whitelist = ((@config[:attributes][name] || []) +
|
98
92
|
(@config[:attributes][:all] || [])).uniq
|
99
93
|
|
100
94
|
if attr_whitelist.empty?
|
101
95
|
# Delete all attributes from elements with no whitelisted
|
102
96
|
# attributes.
|
103
|
-
node.
|
97
|
+
node.attribute_nodes.each { |attr| attr.remove }
|
104
98
|
else
|
105
99
|
# Delete any attribute that isn't in the whitelist for this element.
|
106
|
-
node.
|
107
|
-
|
100
|
+
node.attribute_nodes.each do |attr|
|
101
|
+
attr.unlink unless attr_whitelist.include?(attr.name.downcase)
|
108
102
|
end
|
109
103
|
|
110
104
|
# Delete remaining attributes that use unacceptable protocols.
|
111
105
|
if @config[:protocols].has_key?(name)
|
112
106
|
protocol = @config[:protocols][name]
|
113
107
|
|
114
|
-
node.
|
115
|
-
|
116
|
-
next false unless protocol.has_key?(
|
117
|
-
next true if value.nil?
|
108
|
+
node.attribute_nodes.each do |attr|
|
109
|
+
attr_name = attr.name.downcase
|
110
|
+
next false unless protocol.has_key?(attr_name)
|
118
111
|
|
119
|
-
if value.to_s.downcase =~ REGEX_PROTOCOL
|
120
|
-
!protocol[
|
112
|
+
del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
|
113
|
+
!protocol[attr_name].include?($1.downcase)
|
121
114
|
else
|
122
|
-
!protocol[
|
115
|
+
!protocol[attr_name].include?(:relative)
|
123
116
|
end
|
117
|
+
|
118
|
+
attr.unlink if del
|
124
119
|
end
|
125
120
|
end
|
126
121
|
end
|
127
122
|
|
128
123
|
# Add required attributes.
|
129
124
|
if @config[:add_attributes].has_key?(name)
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
# Escape special chars in attribute values.
|
134
|
-
node.raw_attributes.each do |key, value|
|
135
|
-
node.raw_attributes[key] = Sanitize.encode_html(value)
|
125
|
+
@config[:add_attributes][name].each do |key, val|
|
126
|
+
node[key] = val
|
127
|
+
end
|
136
128
|
end
|
129
|
+
elsif node.cdata?
|
130
|
+
node.replace(Nokogiri::XML::Text.new(node.text, node.document))
|
137
131
|
end
|
138
132
|
end
|
139
133
|
|
140
|
-
|
141
|
-
# as entities. This eliminates certain types of maliciously-malformed nested
|
142
|
-
# tags.
|
143
|
-
fragment.search('*') do |node|
|
144
|
-
node.swap(Sanitize.encode_html(node.to_original_html)) if node.text?
|
145
|
-
end
|
146
|
-
|
147
|
-
result = fragment.to_s
|
134
|
+
result = fragment.to_xhtml(:encoding => 'UTF-8', :indent => 0).gsub(/>\n/, '>')
|
148
135
|
return result == html ? nil : html[0, html.length] = result
|
149
136
|
end
|
150
137
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rgrove-sanitize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.8
|
4
|
+
version: 1.0.8.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Grove
|
@@ -9,18 +9,18 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-05-16 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
16
|
+
name: nokogiri
|
17
17
|
type: :runtime
|
18
18
|
version_requirement:
|
19
19
|
version_requirements: !ruby/object:Gem::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ~>
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
23
|
+
version: 1.3.3
|
24
24
|
version:
|
25
25
|
description:
|
26
26
|
email: ryan@wonko.com
|