loofah 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of loofah might be problematic. Click here for more details.
- data.tar.gz.sig +0 -0
- data/CHANGELOG.rdoc +9 -0
- data/Manifest.txt +3 -1
- data/README.rdoc +223 -92
- data/Rakefile +11 -3
- data/TODO.rdoc +0 -5
- data/lib/loofah.rb +27 -138
- data/lib/loofah/active_record.rb +10 -18
- data/lib/loofah/html/document.rb +4 -4
- data/lib/loofah/html/document_fragment.rb +5 -5
- data/lib/loofah/html5/scrub.rb +1 -1
- data/lib/loofah/html5/whitelist.rb +1 -1
- data/lib/loofah/instance_methods.rb +47 -0
- data/lib/loofah/scrubber.rb +98 -76
- data/lib/loofah/scrubbers.rb +199 -0
- data/lib/loofah/xss_foliate.rb +71 -69
- data/test/html5/test_sanitizer.rb +12 -9
- data/test/test_active_record.rb +22 -0
- data/test/test_ad_hoc.rb +42 -0
- data/test/test_api.rb +47 -1
- data/test/test_scrubber.rb +204 -102
- data/test/test_scrubbers.rb +144 -0
- metadata +44 -12
- metadata.gz.sig +0 -0
- data/test/html5/testdata/tests1.dat +0 -501
@@ -0,0 +1,199 @@
|
|
1
|
+
module Loofah
|
2
|
+
#
|
3
|
+
# Loofah provides some built-in scrubbers for sanitizing with
|
4
|
+
# HTML5lib's whitelist and for accomplishing some common
|
5
|
+
# transformation tasks.
|
6
|
+
#
|
7
|
+
#
|
8
|
+
# === Loofah::Scrubbers::Strip / scrub!(:strip)
|
9
|
+
#
|
10
|
+
# +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
|
11
|
+
#
|
12
|
+
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
13
|
+
# Loofah.fragment(unsafe_html).scrub!(:strip)
|
14
|
+
# => "ohai! <div>div is safe</div> but foo is <b>not</b>"
|
15
|
+
#
|
16
|
+
#
|
17
|
+
# === Loofah::Scrubbers::Prune / scrub!(:prune)
|
18
|
+
#
|
19
|
+
# +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
|
20
|
+
#
|
21
|
+
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
22
|
+
# Loofah.fragment(unsafe_html).scrub!(:prune)
|
23
|
+
# => "ohai! <div>div is safe</div> "
|
24
|
+
#
|
25
|
+
#
|
26
|
+
# === Loofah::Scrubbers::Escape / scrub!(:escape)
|
27
|
+
#
|
28
|
+
# +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
|
29
|
+
#
|
30
|
+
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
31
|
+
# Loofah.fragment(unsafe_html).scrub!(:escape)
|
32
|
+
# => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
|
33
|
+
#
|
34
|
+
#
|
35
|
+
# === Loofah::Scrubbers::Whitewash / scrub!(:whitewash)
|
36
|
+
#
|
37
|
+
# +:whitewash+ removes all comments, styling and attributes in
|
38
|
+
# addition to doing markup-fixer-uppery and pruning unsafe tags. I
|
39
|
+
# like to call this "whitewashing", since it's like putting a new
|
40
|
+
# layer of paint on top of the HTML input to make it look nice.
|
41
|
+
#
|
42
|
+
# messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
|
43
|
+
# Loofah.fragment(messy_markup).scrub!(:whitewash)
|
44
|
+
# => "ohai! <div>div with attributes</div>"
|
45
|
+
#
|
46
|
+
# One use case for this scrubber is to clean up HTML that was
|
47
|
+
# cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
|
48
|
+
# rich text editor. Microsoft's software is famous for injecting
|
49
|
+
# all kinds of cruft into its HTML output. Who needs that crap?
|
50
|
+
# Certainly not me.
|
51
|
+
#
|
52
|
+
#
|
53
|
+
# === Loofah::Scrubbers::NoFollow / scrub!(:nofollow)
|
54
|
+
#
|
55
|
+
# +:nofollow+ adds a rel="nofollow" attribute to all links
|
56
|
+
#
|
57
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
58
|
+
# Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
|
59
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
|
60
|
+
#
|
61
|
+
#
|
62
|
+
module Scrubbers
|
63
|
+
|
64
|
+
#
|
65
|
+
# === scrub!(:strip)
|
66
|
+
#
|
67
|
+
# +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
|
68
|
+
#
|
69
|
+
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
70
|
+
# Loofah.fragment(unsafe_html).scrub!(:strip)
|
71
|
+
# => "ohai! <div>div is safe</div> but foo is <b>not</b>"
|
72
|
+
#
|
73
|
+
class Strip < Scrubber
|
74
|
+
def initialize
|
75
|
+
@direction = :bottom_up
|
76
|
+
end
|
77
|
+
|
78
|
+
def scrub(node)
|
79
|
+
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
80
|
+
node.before node.inner_html
|
81
|
+
node.remove
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
#
|
86
|
+
# === scrub!(:prune)
|
87
|
+
#
|
88
|
+
# +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
|
89
|
+
#
|
90
|
+
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
91
|
+
# Loofah.fragment(unsafe_html).scrub!(:prune)
|
92
|
+
# => "ohai! <div>div is safe</div> "
|
93
|
+
#
|
94
|
+
class Prune < Scrubber
|
95
|
+
def initialize
|
96
|
+
@direction = :top_down
|
97
|
+
end
|
98
|
+
|
99
|
+
def scrub(node)
|
100
|
+
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
101
|
+
node.remove
|
102
|
+
return STOP
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
#
|
107
|
+
# === scrub!(:escape)
|
108
|
+
#
|
109
|
+
# +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
|
110
|
+
#
|
111
|
+
# unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
|
112
|
+
# Loofah.fragment(unsafe_html).scrub!(:escape)
|
113
|
+
# => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
|
114
|
+
#
|
115
|
+
class Escape < Scrubber
|
116
|
+
def initialize
|
117
|
+
@direction = :top_down
|
118
|
+
end
|
119
|
+
|
120
|
+
def scrub(node)
|
121
|
+
return CONTINUE if html5lib_sanitize(node) == CONTINUE
|
122
|
+
replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
|
123
|
+
node.add_next_sibling replacement_killer
|
124
|
+
node.remove
|
125
|
+
return STOP
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
#
|
130
|
+
# === scrub!(:whitewash)
|
131
|
+
#
|
132
|
+
# +:whitewash+ removes all comments, styling and attributes in
|
133
|
+
# addition to doing markup-fixer-uppery and pruning unsafe tags. I
|
134
|
+
# like to call this "whitewashing", since it's like putting a new
|
135
|
+
# layer of paint on top of the HTML input to make it look nice.
|
136
|
+
#
|
137
|
+
# messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
|
138
|
+
# Loofah.fragment(messy_markup).scrub!(:whitewash)
|
139
|
+
# => "ohai! <div>div with attributes</div>"
|
140
|
+
#
|
141
|
+
# One use case for this scrubber is to clean up HTML that was
|
142
|
+
# cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
|
143
|
+
# rich text editor. Microsoft's software is famous for injecting
|
144
|
+
# all kinds of cruft into its HTML output. Who needs that crap?
|
145
|
+
# Certainly not me.
|
146
|
+
#
|
147
|
+
class Whitewash < Scrubber
|
148
|
+
def initialize
|
149
|
+
@direction = :top_down
|
150
|
+
end
|
151
|
+
|
152
|
+
def scrub(node)
|
153
|
+
case node.type
|
154
|
+
when Nokogiri::XML::Node::ELEMENT_NODE
|
155
|
+
if HTML5::HashedWhiteList::ALLOWED_ELEMENTS[node.name]
|
156
|
+
node.attributes.each { |attr| node.remove_attribute(attr.first) }
|
157
|
+
return CONTINUE if node.namespaces.empty?
|
158
|
+
end
|
159
|
+
when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
|
160
|
+
return CONTINUE
|
161
|
+
end
|
162
|
+
node.remove
|
163
|
+
STOP
|
164
|
+
end
|
165
|
+
end
|
166
|
+
|
167
|
+
#
|
168
|
+
# === scrub!(:nofollow)
|
169
|
+
#
|
170
|
+
# +:nofollow+ adds a rel="nofollow" attribute to all links
|
171
|
+
#
|
172
|
+
# link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
|
173
|
+
# Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
|
174
|
+
# => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
|
175
|
+
#
|
176
|
+
class NoFollow < Scrubber
|
177
|
+
def initialize
|
178
|
+
@direction = :top_down
|
179
|
+
end
|
180
|
+
|
181
|
+
def scrub(node)
|
182
|
+
return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
|
183
|
+
node.set_attribute('rel', 'nofollow')
|
184
|
+
return STOP
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
#
|
189
|
+
# A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
|
190
|
+
#
|
191
|
+
MAP = {
|
192
|
+
:escape => Escape,
|
193
|
+
:prune => Prune,
|
194
|
+
:whitewash => Whitewash,
|
195
|
+
:strip => Strip,
|
196
|
+
:nofollow => NoFollow
|
197
|
+
}
|
198
|
+
end
|
199
|
+
end
|
data/lib/loofah/xss_foliate.rb
CHANGED
@@ -5,7 +5,76 @@ module Loofah
|
|
5
5
|
# XssFoliate will strip all tags from your ActiveRecord models'
|
6
6
|
# string and text attributes.
|
7
7
|
#
|
8
|
-
#
|
8
|
+
# Please read the Loofah documentation for an explanation of the
|
9
|
+
# different scrubbing methods, and
|
10
|
+
# Loofah::XssFoliate::ClassMethods for more information on the
|
11
|
+
# methods.
|
12
|
+
#
|
13
|
+
# If you'd like to scrub all fields in all your models (and perhaps *opt-out* in specific models):
|
14
|
+
#
|
15
|
+
# # config/environment.rb
|
16
|
+
# LOOFAH_XSS_FOLIATE_ALL_MODELS = true
|
17
|
+
# Rails::Initializer.run do |config|
|
18
|
+
# config.gem "loofah"
|
19
|
+
# end
|
20
|
+
#
|
21
|
+
# # db/schema.rb
|
22
|
+
# create_table "posts" do |t|
|
23
|
+
# t.string "title"
|
24
|
+
# t.text "body"
|
25
|
+
# t.string "author"
|
26
|
+
# end
|
27
|
+
#
|
28
|
+
# # app/models/post.rb
|
29
|
+
# class Post < ActiveRecord::Base
|
30
|
+
# # by default, title, body and author will all be scrubbed down to their inner text
|
31
|
+
# end
|
32
|
+
#
|
33
|
+
# OR
|
34
|
+
#
|
35
|
+
# # app/models/post.rb
|
36
|
+
# class Post < ActiveRecord::Base
|
37
|
+
# xss_foliate :except => :author # opt-out of sanitizing author
|
38
|
+
# end
|
39
|
+
#
|
40
|
+
# OR
|
41
|
+
#
|
42
|
+
# xss_foliate :strip => [:title, :body] # strip unsafe tags from both title and body
|
43
|
+
#
|
44
|
+
# OR
|
45
|
+
#
|
46
|
+
# xss_foliate :except => :title # scrub body and author but not title
|
47
|
+
#
|
48
|
+
# OR
|
49
|
+
#
|
50
|
+
# # remove all tags from title, remove unsafe tags from body
|
51
|
+
# xss_foliate :sanitize => :title, :scrub => :body
|
52
|
+
#
|
53
|
+
# OR
|
54
|
+
#
|
55
|
+
# # old xss_terminate code will work if you s/_terminate/_foliate/
|
56
|
+
# # was: xss_terminate :except => [:title], :sanitize => [:body]
|
57
|
+
# xss_foliate :except => [:title], :sanitize => [:body]
|
58
|
+
#
|
59
|
+
# Alternatively, if you would like to *opt-in* to the models and attributes that are sanitized:
|
60
|
+
#
|
61
|
+
# # config/environment.rb
|
62
|
+
# LOOFAH_XSS_FOLIATE_ALL_MODELS = false # default, this line could be omitted
|
63
|
+
# Rails::Initializer.run do |config|
|
64
|
+
# config.gem "loofah"
|
65
|
+
# end
|
66
|
+
#
|
67
|
+
# # db/schema.rb
|
68
|
+
# create_table "posts" do |t|
|
69
|
+
# t.string "title"
|
70
|
+
# t.text "body"
|
71
|
+
# t.string "author"
|
72
|
+
# end
|
73
|
+
#
|
74
|
+
# # app/models/post.rb
|
75
|
+
# class Post < ActiveRecord::Base
|
76
|
+
# xss_foliate # scrub title, body and author down to their inner text
|
77
|
+
# end
|
9
78
|
#
|
10
79
|
module XssFoliate
|
11
80
|
#
|
@@ -14,74 +83,7 @@ module Loofah
|
|
14
83
|
# XssFoliate will strip all tags from your ActiveRecord models'
|
15
84
|
# string and text attributes.
|
16
85
|
#
|
17
|
-
#
|
18
|
-
# different scrubbing methods.
|
19
|
-
#
|
20
|
-
# If you'd like to scrub all fields in all your models (and perhaps *opt-out* in specific models):
|
21
|
-
#
|
22
|
-
# # config/environment
|
23
|
-
# LOOFAH_XSS_FOLIATE_ALL_MODELS = true
|
24
|
-
# Rails::Initializer.run do |config|
|
25
|
-
# config.gem "loofah"
|
26
|
-
# end
|
27
|
-
#
|
28
|
-
# # db/schema.rb
|
29
|
-
# create_table "posts" do |t|
|
30
|
-
# t.string "title"
|
31
|
-
# t.text "body"
|
32
|
-
# t.string "author"
|
33
|
-
# end
|
34
|
-
#
|
35
|
-
# # app/model/post.rb
|
36
|
-
# class Post < ActiveRecord::Base
|
37
|
-
# # by default, title, body and author will all be scrubbed down to their inner text
|
38
|
-
# end
|
39
|
-
#
|
40
|
-
# OR
|
41
|
-
#
|
42
|
-
# # app/model/post.rb
|
43
|
-
# class Post < ActiveRecord::Base
|
44
|
-
# xss_foliate :except => :author # opt-out of sanitizing author
|
45
|
-
# end
|
46
|
-
#
|
47
|
-
# OR
|
48
|
-
#
|
49
|
-
# xss_foliate :strip => [:title, body] # strip unsafe tags from both title and body
|
50
|
-
#
|
51
|
-
# OR
|
52
|
-
#
|
53
|
-
# xss_foliate :except => :title # scrub body and author but not title
|
54
|
-
#
|
55
|
-
# OR
|
56
|
-
#
|
57
|
-
# # remove all tags from title, remove unsafe tags from body
|
58
|
-
# xss_foliate :sanitize => :title, :scrub => :body
|
59
|
-
#
|
60
|
-
# OR
|
61
|
-
#
|
62
|
-
# # old xss_terminate code will work if you s/_terminate/_foliate/
|
63
|
-
# # was: xss_terminate :except => [:title], :sanitize => [:body]
|
64
|
-
# xss_foliate :except => [:title], :sanitize => [:body]
|
65
|
-
#
|
66
|
-
# Alternatively, if you would like to *opt-in* to the models and attributes that are sanitized:
|
67
|
-
#
|
68
|
-
# # config/environment.rb
|
69
|
-
# LOOFAH_XSS_FOLIATE_ALL_MODELS = false # default, this line could be omitted
|
70
|
-
# Rails::Initializer.run do |config|
|
71
|
-
# config.gem "loofah"
|
72
|
-
# end
|
73
|
-
#
|
74
|
-
# # db/schema.rb
|
75
|
-
# create_table "posts" do |t|
|
76
|
-
# t.string "title"
|
77
|
-
# t.text "body"
|
78
|
-
# t.string "author"
|
79
|
-
# end
|
80
|
-
#
|
81
|
-
# # app/model/post.rb
|
82
|
-
# class Post < ActiveRecord::Base
|
83
|
-
# xss_foliate # scrub title, body and author down to their inner text
|
84
|
-
# end
|
86
|
+
# See Loofah::XssFoliate for more example usage.
|
85
87
|
#
|
86
88
|
module ClassMethods
|
87
89
|
# :stopdoc:
|
@@ -143,15 +143,18 @@ class Html5TestSanitizer < Test::Unit::TestCase
|
|
143
143
|
end
|
144
144
|
end
|
145
145
|
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
146
|
+
##
|
147
|
+
## as tenderlove says, "care < 0"
|
148
|
+
##
|
149
|
+
# def test_should_handle_astral_plane_characters
|
150
|
+
# input = "<p>&#x1d4b5; &#x1d538;</p>"
|
151
|
+
# output = "<p>\360\235\222\265 \360\235\224\270</p>"
|
152
|
+
# check_sanitization(input, output, output, output)
|
153
|
+
|
154
|
+
# input = "<p><tspan>\360\235\224\270</tspan> a</p>"
|
155
|
+
# output = "<p><tspan>\360\235\224\270</tspan> a</p>"
|
156
|
+
# check_sanitization(input, output, output, output)
|
157
|
+
# end
|
155
158
|
|
156
159
|
# This affects only NS4. Is it worth fixing?
|
157
160
|
# def test_javascript_includes
|
data/test/test_active_record.rb
CHANGED
@@ -119,6 +119,28 @@ class TestActiveRecord < Test::Unit::TestCase
|
|
119
119
|
end
|
120
120
|
end
|
121
121
|
|
122
|
+
context "passing a Scrubber" do
|
123
|
+
setup do
|
124
|
+
@called = false
|
125
|
+
@scrubber = Loofah::Scrubber.new do |node|
|
126
|
+
@called = true
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
should "not raise ArgumentError" do
|
131
|
+
assert_nothing_raised {
|
132
|
+
Post.html_fragment :html_string, :scrub => @scrubber
|
133
|
+
}
|
134
|
+
end
|
135
|
+
|
136
|
+
should "scrub properly" do
|
137
|
+
Post.html_fragment :html_string, :scrub => @scrubber
|
138
|
+
post = Post.new :html_string => HTML_STRING, :plain_text => PLAIN_TEXT
|
139
|
+
post.valid?
|
140
|
+
assert @called
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
122
144
|
end
|
123
145
|
|
124
146
|
end
|
data/test/test_ad_hoc.rb
CHANGED
@@ -10,6 +10,48 @@ class TestAdHoc < Test::Unit::TestCase
|
|
10
10
|
assert_equal Loofah.scrub_document("", :prune).text, ""
|
11
11
|
end
|
12
12
|
|
13
|
+
def test_xml_document_scrub
|
14
|
+
xml = Loofah.xml_document <<-EOXML
|
15
|
+
<root>
|
16
|
+
<employee deceased='true'>Abraham Lincoln</employee>
|
17
|
+
<employee deceased='false'>Abe Vigoda</employee>
|
18
|
+
</root>
|
19
|
+
EOXML
|
20
|
+
bring_out_your_dead = Loofah::Scrubber.new do |node|
|
21
|
+
if node.name == "employee" and node["deceased"] == "true"
|
22
|
+
node.remove
|
23
|
+
Loofah::Scrubber::STOP # don't bother with the rest of the subtree
|
24
|
+
end
|
25
|
+
end
|
26
|
+
assert_equal 2, xml.css("employee").length
|
27
|
+
|
28
|
+
xml.scrub!(bring_out_your_dead)
|
29
|
+
|
30
|
+
employees = xml.css "employee"
|
31
|
+
assert_equal 1, employees.length
|
32
|
+
assert_equal "Abe Vigoda", employees.first.inner_text
|
33
|
+
end
|
34
|
+
|
35
|
+
def test_xml_fragment_scrub
|
36
|
+
xml = Loofah.xml_fragment <<-EOXML
|
37
|
+
<employee deceased='true'>Abraham Lincoln</employee>
|
38
|
+
<employee deceased='false'>Abe Vigoda</employee>
|
39
|
+
EOXML
|
40
|
+
bring_out_your_dead = Loofah::Scrubber.new do |node|
|
41
|
+
if node.name == "employee" and node["deceased"] == "true"
|
42
|
+
node.remove
|
43
|
+
Loofah::Scrubber::STOP # don't bother with the rest of the subtree
|
44
|
+
end
|
45
|
+
end
|
46
|
+
assert_equal 2, xml.css("employee").length
|
47
|
+
|
48
|
+
xml.scrub!(bring_out_your_dead)
|
49
|
+
|
50
|
+
employees = xml.css "employee"
|
51
|
+
assert_equal 1, employees.length
|
52
|
+
assert_equal "Abe Vigoda", employees.first.inner_text
|
53
|
+
end
|
54
|
+
|
13
55
|
def test_removal_of_illegal_tag
|
14
56
|
html = <<-HTML
|
15
57
|
following this there should be no jim tag
|