loofah 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of loofah might be problematic. Click here for more details.

@@ -0,0 +1,199 @@
1
+ module Loofah
2
+ #
3
+ # Loofah provides some built-in scrubbers for sanitizing with
4
+ # HTML5lib's whitelist and for accomplishing some common
5
+ # transformation tasks.
6
+ #
7
+ #
8
+ # === Loofah::Scrubbers::Strip / scrub!(:strip)
9
+ #
10
+ # +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
11
+ #
12
+ # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
13
+ # Loofah.fragment(unsafe_html).scrub!(:strip)
14
+ # => "ohai! <div>div is safe</div> but foo is <b>not</b>"
15
+ #
16
+ #
17
+ # === Loofah::Scrubbers::Prune / scrub!(:prune)
18
+ #
19
+ # +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
20
+ #
21
+ # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
22
+ # Loofah.fragment(unsafe_html).scrub!(:prune)
23
+ # => "ohai! <div>div is safe</div> "
24
+ #
25
+ #
26
+ # === Loofah::Scrubbers::Escape / scrub!(:escape)
27
+ #
28
+ # +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
29
+ #
30
+ # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
31
+ # Loofah.fragment(unsafe_html).scrub!(:escape)
32
+ # => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
33
+ #
34
+ #
35
+ # === Loofah::Scrubbers::Whitewash / scrub!(:whitewash)
36
+ #
37
+ # +:whitewash+ removes all comments, styling and attributes in
38
+ # addition to doing markup-fixer-uppery and pruning unsafe tags. I
39
+ # like to call this "whitewashing", since it's like putting a new
40
+ # layer of paint on top of the HTML input to make it look nice.
41
+ #
42
+ # messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
43
+ # Loofah.fragment(messy_markup).scrub!(:whitewash)
44
+ # => "ohai! <div>div with attributes</div>"
45
+ #
46
+ # One use case for this scrubber is to clean up HTML that was
47
+ # cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
48
+ # rich text editor. Microsoft's software is famous for injecting
49
+ # all kinds of cruft into its HTML output. Who needs that crap?
50
+ # Certainly not me.
51
+ #
52
+ #
53
+ # === Loofah::Scrubbers::NoFollow / scrub!(:nofollow)
54
+ #
55
+ # +:nofollow+ adds a rel="nofollow" attribute to all links
56
+ #
57
+ # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
58
+ # Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
59
+ # => "ohai! <a href='http://www.myswarmysite.com/' rel='nofollow'>I like your blog post</a>"
60
+ #
61
+ #
62
+ module Scrubbers
63
+
64
+ #
65
+ # === scrub!(:strip)
66
+ #
67
+ # +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
68
+ #
69
+ # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
70
+ # Loofah.fragment(unsafe_html).scrub!(:strip)
71
+ # => "ohai! <div>div is safe</div> but foo is <b>not</b>"
72
+ #
73
+ class Strip < Scrubber
74
+ def initialize
75
+ @direction = :bottom_up
76
+ end
77
+
78
+ def scrub(node)
79
+ return CONTINUE if html5lib_sanitize(node) == CONTINUE
80
+ node.before node.inner_html
81
+ node.remove
82
+ end
83
+ end
84
+
85
+ #
86
+ # === scrub!(:prune)
87
+ #
88
+ # +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
89
+ #
90
+ # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
91
+ # Loofah.fragment(unsafe_html).scrub!(:prune)
92
+ # => "ohai! <div>div is safe</div> "
93
+ #
94
+ class Prune < Scrubber
95
+ def initialize
96
+ @direction = :top_down
97
+ end
98
+
99
+ def scrub(node)
100
+ return CONTINUE if html5lib_sanitize(node) == CONTINUE
101
+ node.remove
102
+ return STOP
103
+ end
104
+ end
105
+
106
+ #
107
+ # === scrub!(:escape)
108
+ #
109
+ # +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
110
+ #
111
+ # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
112
+ # Loofah.fragment(unsafe_html).scrub!(:escape)
113
+ # => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
114
+ #
115
+ class Escape < Scrubber
116
+ def initialize
117
+ @direction = :top_down
118
+ end
119
+
120
+ def scrub(node)
121
+ return CONTINUE if html5lib_sanitize(node) == CONTINUE
122
+ replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
123
+ node.add_next_sibling replacement_killer
124
+ node.remove
125
+ return STOP
126
+ end
127
+ end
128
+
129
+ #
130
+ # === scrub!(:whitewash)
131
+ #
132
+ # +:whitewash+ removes all comments, styling and attributes in
133
+ # addition to doing markup-fixer-uppery and pruning unsafe tags. I
134
+ # like to call this "whitewashing", since it's like putting a new
135
+ # layer of paint on top of the HTML input to make it look nice.
136
+ #
137
+ # messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
138
+ # Loofah.fragment(messy_markup).scrub!(:whitewash)
139
+ # => "ohai! <div>div with attributes</div>"
140
+ #
141
+ # One use case for this scrubber is to clean up HTML that was
142
+ # cut-and-pasted from Microsoft Word into a WYSIWYG editor or a
143
+ # rich text editor. Microsoft's software is famous for injecting
144
+ # all kinds of cruft into its HTML output. Who needs that crap?
145
+ # Certainly not me.
146
+ #
147
+ class Whitewash < Scrubber
148
+ def initialize
149
+ @direction = :top_down
150
+ end
151
+
152
+ def scrub(node)
153
+ case node.type
154
+ when Nokogiri::XML::Node::ELEMENT_NODE
155
+ if HTML5::HashedWhiteList::ALLOWED_ELEMENTS[node.name]
156
+ node.attributes.each { |attr| node.remove_attribute(attr.first) }
157
+ return CONTINUE if node.namespaces.empty?
158
+ end
159
+ when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
160
+ return CONTINUE
161
+ end
162
+ node.remove
163
+ STOP
164
+ end
165
+ end
166
+
167
+ #
168
+ # === scrub!(:nofollow)
169
+ #
170
+ # +:nofollow+ adds a rel="nofollow" attribute to all links
171
+ #
172
+ # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
173
+ # Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
174
+ # => "ohai! <a href='http://www.myswarmysite.com/' rel='nofollow'>I like your blog post</a>"
175
+ #
176
+ class NoFollow < Scrubber
177
+ def initialize
178
+ @direction = :top_down
179
+ end
180
+
181
+ def scrub(node)
182
+ return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
183
+ node.set_attribute('rel', 'nofollow')
184
+ return STOP
185
+ end
186
+ end
187
+
188
+ #
189
+ # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
190
+ #
191
+ MAP = {
192
+ :escape => Escape,
193
+ :prune => Prune,
194
+ :whitewash => Whitewash,
195
+ :strip => Strip,
196
+ :nofollow => NoFollow
197
+ }
198
+ end
199
+ end
@@ -5,7 +5,76 @@ module Loofah
5
5
  # XssFoliate will strip all tags from your ActiveRecord models'
6
6
  # string and text attributes.
7
7
  #
8
- # See Loofah::XssFoliate::ClassMethods for more information.
8
+ # Please read the Loofah documentation for an explanation of the
9
+ # different scrubbing methods, and
10
+ # Loofah::XssFoliate::ClassMethods for more information on the
11
+ # methods.
12
+ #
13
+ # If you'd like to scrub all fields in all your models (and perhaps *opt-out* in specific models):
14
+ #
15
+ # # config/environment.rb
16
+ # LOOFAH_XSS_FOLIATE_ALL_MODELS = true
17
+ # Rails::Initializer.run do |config|
18
+ # config.gem "loofah"
19
+ # end
20
+ #
21
+ # # db/schema.rb
22
+ # create_table "posts" do |t|
23
+ # t.string "title"
24
+ # t.text "body"
25
+ # t.string "author"
26
+ # end
27
+ #
28
+ # # app/models/post.rb
29
+ # class Post < ActiveRecord::Base
30
+ # # by default, title, body and author will all be scrubbed down to their inner text
31
+ # end
32
+ #
33
+ # OR
34
+ #
35
+ # # app/models/post.rb
36
+ # class Post < ActiveRecord::Base
37
+ # xss_foliate :except => :author # opt-out of sanitizing author
38
+ # end
39
+ #
40
+ # OR
41
+ #
42
+ # xss_foliate :strip => [:title, :body] # strip unsafe tags from both title and body
43
+ #
44
+ # OR
45
+ #
46
+ # xss_foliate :except => :title # scrub body and author but not title
47
+ #
48
+ # OR
49
+ #
50
+ # # remove all tags from title, remove unsafe tags from body
51
+ # xss_foliate :sanitize => :title, :scrub => :body
52
+ #
53
+ # OR
54
+ #
55
+ # # old xss_terminate code will work if you s/_terminate/_foliate/
56
+ # # was: xss_terminate :except => [:title], :sanitize => [:body]
57
+ # xss_foliate :except => [:title], :sanitize => [:body]
58
+ #
59
+ # Alternatively, if you would like to *opt-in* to the models and attributes that are sanitized:
60
+ #
61
+ # # config/environment.rb
62
+ # LOOFAH_XSS_FOLIATE_ALL_MODELS = false # default, this line could be omitted
63
+ # Rails::Initializer.run do |config|
64
+ # config.gem "loofah"
65
+ # end
66
+ #
67
+ # # db/schema.rb
68
+ # create_table "posts" do |t|
69
+ # t.string "title"
70
+ # t.text "body"
71
+ # t.string "author"
72
+ # end
73
+ #
74
+ # # app/models/post.rb
75
+ # class Post < ActiveRecord::Base
76
+ # xss_foliate # scrub title, body and author down to their inner text
77
+ # end
9
78
  #
10
79
  module XssFoliate
11
80
  #
@@ -14,74 +83,7 @@ module Loofah
14
83
  # XssFoliate will strip all tags from your ActiveRecord models'
15
84
  # string and text attributes.
16
85
  #
17
- # Please read the Loofah documentation for an explanation of the
18
- # different scrubbing methods.
19
- #
20
- # If you'd like to scrub all fields in all your models (and perhaps *opt-out* in specific models):
21
- #
22
- # # config/environment
23
- # LOOFAH_XSS_FOLIATE_ALL_MODELS = true
24
- # Rails::Initializer.run do |config|
25
- # config.gem "loofah"
26
- # end
27
- #
28
- # # db/schema.rb
29
- # create_table "posts" do |t|
30
- # t.string "title"
31
- # t.text "body"
32
- # t.string "author"
33
- # end
34
- #
35
- # # app/model/post.rb
36
- # class Post < ActiveRecord::Base
37
- # # by default, title, body and author will all be scrubbed down to their inner text
38
- # end
39
- #
40
- # OR
41
- #
42
- # # app/model/post.rb
43
- # class Post < ActiveRecord::Base
44
- # xss_foliate :except => :author # opt-out of sanitizing author
45
- # end
46
- #
47
- # OR
48
- #
49
- # xss_foliate :strip => [:title, body] # strip unsafe tags from both title and body
50
- #
51
- # OR
52
- #
53
- # xss_foliate :except => :title # scrub body and author but not title
54
- #
55
- # OR
56
- #
57
- # # remove all tags from title, remove unsafe tags from body
58
- # xss_foliate :sanitize => :title, :scrub => :body
59
- #
60
- # OR
61
- #
62
- # # old xss_terminate code will work if you s/_terminate/_foliate/
63
- # # was: xss_terminate :except => [:title], :sanitize => [:body]
64
- # xss_foliate :except => [:title], :sanitize => [:body]
65
- #
66
- # Alternatively, if you would like to *opt-in* to the models and attributes that are sanitized:
67
- #
68
- # # config/environment.rb
69
- # LOOFAH_XSS_FOLIATE_ALL_MODELS = false # default, this line could be omitted
70
- # Rails::Initializer.run do |config|
71
- # config.gem "loofah"
72
- # end
73
- #
74
- # # db/schema.rb
75
- # create_table "posts" do |t|
76
- # t.string "title"
77
- # t.text "body"
78
- # t.string "author"
79
- # end
80
- #
81
- # # app/model/post.rb
82
- # class Post < ActiveRecord::Base
83
- # xss_foliate # scrub title, body and author down to their inner text
84
- # end
86
+ # See Loofah::XssFoliate for more example usage.
85
87
  #
86
88
  module ClassMethods
87
89
  # :stopdoc:
@@ -143,15 +143,18 @@ class Html5TestSanitizer < Test::Unit::TestCase
143
143
  end
144
144
  end
145
145
 
146
- def test_should_handle_astral_plane_characters
147
- input = "<p>&#x1d4b5; &#x1d538;</p>"
148
- output = "<p>\360\235\222\265 \360\235\224\270</p>"
149
- check_sanitization(input, output, output, output)
150
-
151
- input = "<p><tspan>\360\235\224\270</tspan> a</p>"
152
- output = "<p><tspan>\360\235\224\270</tspan> a</p>"
153
- check_sanitization(input, output, output, output)
154
- end
146
+ ##
147
+ ## as tenderlove says, "care < 0"
148
+ ##
149
+ # def test_should_handle_astral_plane_characters
150
+ # input = "<p>&#x1d4b5; &#x1d538;</p>"
151
+ # output = "<p>\360\235\222\265 \360\235\224\270</p>"
152
+ # check_sanitization(input, output, output, output)
153
+
154
+ # input = "<p><tspan>\360\235\224\270</tspan> a</p>"
155
+ # output = "<p><tspan>\360\235\224\270</tspan> a</p>"
156
+ # check_sanitization(input, output, output, output)
157
+ # end
155
158
 
156
159
  # This affects only NS4. Is it worth fixing?
157
160
  # def test_javascript_includes
@@ -119,6 +119,28 @@ class TestActiveRecord < Test::Unit::TestCase
119
119
  end
120
120
  end
121
121
 
122
+ context "passing a Scrubber" do
123
+ setup do
124
+ @called = false
125
+ @scrubber = Loofah::Scrubber.new do |node|
126
+ @called = true
127
+ end
128
+ end
129
+
130
+ should "not raise ArgumentError" do
131
+ assert_nothing_raised {
132
+ Post.html_fragment :html_string, :scrub => @scrubber
133
+ }
134
+ end
135
+
136
+ should "scrub properly" do
137
+ Post.html_fragment :html_string, :scrub => @scrubber
138
+ post = Post.new :html_string => HTML_STRING, :plain_text => PLAIN_TEXT
139
+ post.valid?
140
+ assert @called
141
+ end
142
+ end
143
+
122
144
  end
123
145
 
124
146
  end
@@ -10,6 +10,48 @@ class TestAdHoc < Test::Unit::TestCase
10
10
  assert_equal Loofah.scrub_document("", :prune).text, ""
11
11
  end
12
12
 
13
+ def test_xml_document_scrub
14
+ xml = Loofah.xml_document <<-EOXML
15
+ <root>
16
+ <employee deceased='true'>Abraham Lincoln</employee>
17
+ <employee deceased='false'>Abe Vigoda</employee>
18
+ </root>
19
+ EOXML
20
+ bring_out_your_dead = Loofah::Scrubber.new do |node|
21
+ if node.name == "employee" and node["deceased"] == "true"
22
+ node.remove
23
+ Loofah::Scrubber::STOP # don't bother with the rest of the subtree
24
+ end
25
+ end
26
+ assert_equal 2, xml.css("employee").length
27
+
28
+ xml.scrub!(bring_out_your_dead)
29
+
30
+ employees = xml.css "employee"
31
+ assert_equal 1, employees.length
32
+ assert_equal "Abe Vigoda", employees.first.inner_text
33
+ end
34
+
35
+ def test_xml_fragment_scrub
36
+ xml = Loofah.xml_fragment <<-EOXML
37
+ <employee deceased='true'>Abraham Lincoln</employee>
38
+ <employee deceased='false'>Abe Vigoda</employee>
39
+ EOXML
40
+ bring_out_your_dead = Loofah::Scrubber.new do |node|
41
+ if node.name == "employee" and node["deceased"] == "true"
42
+ node.remove
43
+ Loofah::Scrubber::STOP # don't bother with the rest of the subtree
44
+ end
45
+ end
46
+ assert_equal 2, xml.css("employee").length
47
+
48
+ xml.scrub!(bring_out_your_dead)
49
+
50
+ employees = xml.css "employee"
51
+ assert_equal 1, employees.length
52
+ assert_equal "Abe Vigoda", employees.first.inner_text
53
+ end
54
+
13
55
  def test_removal_of_illegal_tag
14
56
  html = <<-HTML
15
57
  following this there should be no jim tag