rgrove-sanitize 1.0.8 → 1.0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/HISTORY +5 -0
- data/README.rdoc +5 -3
- data/lib/sanitize.rb +26 -39
- metadata +4 -4
data/HISTORY
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
Sanitize History
|
2
2
|
================================================================================
|
3
3
|
|
4
|
+
Version 1.0.8.2 (git)
|
5
|
+
* Migrated from Hpricot to Nokogiri. Requires libxml2 >= 2.7.2 [Adam Hooper]
|
6
|
+
* Changed protocol regex to ensure Sanitize doesn't kill URLs with colons in
|
7
|
+
path segments. [Peter Cooper]
|
8
|
+
|
4
9
|
Version 1.0.8 (2009-04-23)
|
5
10
|
* Added a workaround for an Hpricot bug that prevents attribute names from
|
6
11
|
being downcased in recent versions of Hpricot. This was exploitable to
|
data/README.rdoc
CHANGED
@@ -9,13 +9,13 @@ elements, certain attributes within those elements, and even certain URL
|
|
9
9
|
protocols within attributes that contain URLs. Any HTML elements or attributes
|
10
10
|
that you don't explicitly allow will be removed.
|
11
11
|
|
12
|
-
Because it's based on Hpricot, a full-fledged HTML parser, rather than a bunch
|
12
|
+
Because it's based on Nokogiri, a full-fledged HTML parser, rather than a bunch
|
13
13
|
of fragile regular expressions, Sanitize has no trouble dealing with malformed
|
14
14
|
or maliciously-formed HTML. When in doubt, Sanitize always errs on the side of
|
15
15
|
caution.
|
16
16
|
|
17
17
|
*Author*:: Ryan Grove (mailto:ryan@wonko.com)
|
18
|
-
*Version*:: 1.0.8 (2009-04-23)
|
18
|
+
*Version*:: 1.0.8.2 (git)
|
19
19
|
*Copyright*:: Copyright (c) 2009 Ryan Grove. All rights reserved.
|
20
20
|
*License*:: MIT License (http://opensource.org/licenses/mit-license.php)
|
21
21
|
*Website*:: http://github.com/rgrove/sanitize
|
@@ -23,7 +23,8 @@ caution.
|
|
23
23
|
== Requires
|
24
24
|
|
25
25
|
* RubyGems
|
26
|
-
*
|
26
|
+
* Nokogiri
|
27
|
+
* libxml2 >= 2.7.2
|
27
28
|
|
28
29
|
== Usage
|
29
30
|
|
@@ -141,6 +142,7 @@ include the symbol <code>:relative</code> in the protocol array:
|
|
141
142
|
The following lovely people have contributed to Sanitize in the form of patches
|
142
143
|
or ideas that later became code:
|
143
144
|
|
145
|
+
* Peter Cooper <git@peterc.org>
|
144
146
|
* Ryan Grove <ryan@wonko.com>
|
145
147
|
* Adam Hooper <adam@adamhooper.com>
|
146
148
|
* Mutwin Kraus <mutle@blogage.de>
|
data/lib/sanitize.rb
CHANGED
@@ -26,9 +26,9 @@ $:.uniq!
|
|
26
26
|
|
27
27
|
require 'rubygems'
|
28
28
|
|
29
|
-
gem '
|
29
|
+
gem 'nokogiri', '~> 1.3.3'
|
30
30
|
|
31
|
-
require '
|
31
|
+
require 'nokogiri'
|
32
32
|
require 'sanitize/config'
|
33
33
|
require 'sanitize/config/restricted'
|
34
34
|
require 'sanitize/config/basic'
|
@@ -53,7 +53,7 @@ class Sanitize
|
|
53
53
|
# or more characters followed by a colon is considered a match, even if the
|
54
54
|
# colon is encoded as an entity and even if it's an incomplete entity (which
|
55
55
|
# IE6 and Opera will still parse).
|
56
|
-
REGEX_PROTOCOL = /^([
|
56
|
+
REGEX_PROTOCOL = /^([A-Za-z0-9\+\-\.\&\;\#\s]*?)(?:\:|&#0*58|&#x0*3a)/i
|
57
57
|
|
58
58
|
#--
|
59
59
|
# Instance Methods
|
@@ -73,78 +73,65 @@ class Sanitize
|
|
73
73
|
# Performs clean in place, returning _html_, or +nil+ if no changes were
|
74
74
|
# made.
|
75
75
|
def clean!(html)
|
76
|
-
fragment =
|
77
|
-
|
78
|
-
fragment.search('*') do |node|
|
79
|
-
if node.bogusetag? || node.doctype? || node.procins? || node.xmldecl?
|
80
|
-
node.parent.replace_child(node, '')
|
81
|
-
next
|
82
|
-
end
|
76
|
+
fragment = Nokogiri::HTML::DocumentFragment.parse(html)
|
83
77
|
|
78
|
+
fragment.traverse do |node|
|
84
79
|
if node.comment?
|
85
|
-
node.
|
86
|
-
elsif node.
|
80
|
+
node.unlink unless @config[:allow_comments]
|
81
|
+
elsif node.element?
|
87
82
|
name = node.name.to_s.downcase
|
88
83
|
|
89
84
|
# Delete any element that isn't in the whitelist.
|
90
85
|
unless @config[:elements].include?(name)
|
91
|
-
node.
|
86
|
+
node.children.each { |n| node.add_previous_sibling(n) }
|
87
|
+
node.unlink
|
92
88
|
next
|
93
89
|
end
|
94
90
|
|
95
|
-
node.raw_attributes ||= {}
|
96
|
-
|
97
91
|
attr_whitelist = ((@config[:attributes][name] || []) +
|
98
92
|
(@config[:attributes][:all] || [])).uniq
|
99
93
|
|
100
94
|
if attr_whitelist.empty?
|
101
95
|
# Delete all attributes from elements with no whitelisted
|
102
96
|
# attributes.
|
103
|
-
node.
|
97
|
+
node.attribute_nodes.each { |attr| attr.remove }
|
104
98
|
else
|
105
99
|
# Delete any attribute that isn't in the whitelist for this element.
|
106
|
-
node.
|
107
|
-
|
100
|
+
node.attribute_nodes.each do |attr|
|
101
|
+
attr.unlink unless attr_whitelist.include?(attr.name.downcase)
|
108
102
|
end
|
109
103
|
|
110
104
|
# Delete remaining attributes that use unacceptable protocols.
|
111
105
|
if @config[:protocols].has_key?(name)
|
112
106
|
protocol = @config[:protocols][name]
|
113
107
|
|
114
|
-
node.
|
115
|
-
|
116
|
-
next false unless protocol.has_key?(
|
117
|
-
next true if value.nil?
|
108
|
+
node.attribute_nodes.each do |attr|
|
109
|
+
attr_name = attr.name.downcase
|
110
|
+
next false unless protocol.has_key?(attr_name)
|
118
111
|
|
119
|
-
if value.to_s.downcase =~ REGEX_PROTOCOL
|
120
|
-
!protocol[
|
112
|
+
del = if attr.value.to_s.downcase =~ REGEX_PROTOCOL
|
113
|
+
!protocol[attr_name].include?($1.downcase)
|
121
114
|
else
|
122
|
-
!protocol[
|
115
|
+
!protocol[attr_name].include?(:relative)
|
123
116
|
end
|
117
|
+
|
118
|
+
attr.unlink if del
|
124
119
|
end
|
125
120
|
end
|
126
121
|
end
|
127
122
|
|
128
123
|
# Add required attributes.
|
129
124
|
if @config[:add_attributes].has_key?(name)
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
# Escape special chars in attribute values.
|
134
|
-
node.raw_attributes.each do |key, value|
|
135
|
-
node.raw_attributes[key] = Sanitize.encode_html(value)
|
125
|
+
@config[:add_attributes][name].each do |key, val|
|
126
|
+
node[key] = val
|
127
|
+
end
|
136
128
|
end
|
129
|
+
elsif node.cdata?
|
130
|
+
node.replace(Nokogiri::XML::Text.new(node.text, node.document))
|
137
131
|
end
|
138
132
|
end
|
139
133
|
|
140
|
-
|
141
|
-
# as entities. This eliminates certain types of maliciously-malformed nested
|
142
|
-
# tags.
|
143
|
-
fragment.search('*') do |node|
|
144
|
-
node.swap(Sanitize.encode_html(node.to_original_html)) if node.text?
|
145
|
-
end
|
146
|
-
|
147
|
-
result = fragment.to_s
|
134
|
+
result = fragment.to_xhtml(:encoding => 'UTF-8', :indent => 0).gsub(/>\n/, '>')
|
148
135
|
return result == html ? nil : html[0, html.length] = result
|
149
136
|
end
|
150
137
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rgrove-sanitize
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.8
|
4
|
+
version: 1.0.8.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan Grove
|
@@ -9,18 +9,18 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-05-16 00:00:00 -07:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
|
-
name:
|
16
|
+
name: nokogiri
|
17
17
|
type: :runtime
|
18
18
|
version_requirement:
|
19
19
|
version_requirements: !ruby/object:Gem::Requirement
|
20
20
|
requirements:
|
21
21
|
- - ~>
|
22
22
|
- !ruby/object:Gem::Version
|
23
|
-
version:
|
23
|
+
version: 1.3.3
|
24
24
|
version:
|
25
25
|
description:
|
26
26
|
email: ryan@wonko.com
|