rails-html-sanitizer 1.5.0 → 1.6.0.rc2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,155 +1,422 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Rails
2
- module Html
3
- XPATHS_TO_REMOVE = %w{.//script .//form comment()}
4
+ module HTML
5
+ class Sanitizer
6
+ class << self
7
+ def html5_support?
8
+ return @html5_support if defined?(@html5_support)
9
+
10
+ @html5_support = Loofah.respond_to?(:html5_support?) && Loofah.html5_support?
11
+ end
12
+
13
+ def best_supported_vendor
14
+ html5_support? ? Rails::HTML5::Sanitizer : Rails::HTML4::Sanitizer
15
+ end
16
+ end
4
17
 
5
- class Sanitizer # :nodoc:
6
18
  def sanitize(html, options = {})
7
19
  raise NotImplementedError, "subclasses must implement sanitize method."
8
20
  end
9
21
 
10
22
  private
23
+ def remove_xpaths(node, xpaths)
24
+ node.xpath(*xpaths).remove
25
+ node
26
+ end
27
+
28
+ def properly_encode(fragment, options)
29
+ fragment.xml? ? fragment.to_xml(options) : fragment.to_html(options)
30
+ end
31
+ end
32
+
33
+ module Concern
34
+ module ComposedSanitize
35
+ def sanitize(html, options = {})
36
+ return unless html
37
+ return html if html.empty?
38
+
39
+ serialize(scrub(parse_fragment(html), options))
40
+ end
41
+ end
42
+
43
+ module Parser
44
+ module HTML4
45
+ def parse_fragment(html)
46
+ Loofah.html4_fragment(html)
47
+ end
48
+ end
49
+
50
+ module HTML5
51
+ def parse_fragment(html)
52
+ Loofah.html5_fragment(html)
53
+ end
54
+ end if Rails::HTML::Sanitizer.html5_support?
55
+ end
56
+
57
+ module Scrubber
58
+ module Full
59
+ def scrub(fragment, options = {})
60
+ fragment.scrub!(TextOnlyScrubber.new)
61
+ end
62
+ end
63
+
64
+ module Link
65
+ def initialize
66
+ super
67
+ @link_scrubber = TargetScrubber.new
68
+ @link_scrubber.tags = %w(a)
69
+ @link_scrubber.attributes = %w(href)
70
+ end
71
+
72
+ def scrub(fragment, options = {})
73
+ fragment.scrub!(@link_scrubber)
74
+ end
75
+ end
76
+
77
+ module SafeList
78
+ # The default safe list for tags
79
+ DEFAULT_ALLOWED_TAGS = Set.new([
80
+ "a",
81
+ "abbr",
82
+ "acronym",
83
+ "address",
84
+ "b",
85
+ "big",
86
+ "blockquote",
87
+ "br",
88
+ "cite",
89
+ "code",
90
+ "dd",
91
+ "del",
92
+ "dfn",
93
+ "div",
94
+ "dl",
95
+ "dt",
96
+ "em",
97
+ "h1",
98
+ "h2",
99
+ "h3",
100
+ "h4",
101
+ "h5",
102
+ "h6",
103
+ "hr",
104
+ "i",
105
+ "img",
106
+ "ins",
107
+ "kbd",
108
+ "li",
109
+ "ol",
110
+ "p",
111
+ "pre",
112
+ "samp",
113
+ "small",
114
+ "span",
115
+ "strong",
116
+ "sub",
117
+ "sup",
118
+ "time",
119
+ "tt",
120
+ "ul",
121
+ "var",
122
+ ]).freeze
123
+
124
+ # The default safe list for attributes
125
+ DEFAULT_ALLOWED_ATTRIBUTES = Set.new([
126
+ "abbr",
127
+ "alt",
128
+ "cite",
129
+ "class",
130
+ "datetime",
131
+ "height",
132
+ "href",
133
+ "lang",
134
+ "name",
135
+ "src",
136
+ "title",
137
+ "width",
138
+ "xml:lang",
139
+ ]).freeze
11
140
 
12
- def remove_xpaths(node, xpaths)
13
- node.xpath(*xpaths).remove
14
- node
141
+ def self.included(klass)
142
+ class << klass
143
+ attr_accessor :allowed_tags
144
+ attr_accessor :allowed_attributes
145
+ end
146
+
147
+ klass.allowed_tags = DEFAULT_ALLOWED_TAGS.dup
148
+ klass.allowed_attributes = DEFAULT_ALLOWED_ATTRIBUTES.dup
149
+ end
150
+
151
+ def initialize(prune: false)
152
+ @permit_scrubber = PermitScrubber.new(prune: prune)
153
+ end
154
+
155
+ def scrub(fragment, options = {})
156
+ if scrubber = options[:scrubber]
157
+ # No duck typing, Loofah ensures subclass of Loofah::Scrubber
158
+ fragment.scrub!(scrubber)
159
+ elsif allowed_tags(options) || allowed_attributes(options)
160
+ @permit_scrubber.tags = allowed_tags(options)
161
+ @permit_scrubber.attributes = allowed_attributes(options)
162
+ fragment.scrub!(@permit_scrubber)
163
+ else
164
+ fragment.scrub!(:strip)
165
+ end
166
+ end
167
+
168
+ def sanitize_css(style_string)
169
+ Loofah::HTML5::Scrub.scrub_css(style_string)
170
+ end
171
+
172
+ private
173
+ def allowed_tags(options)
174
+ options[:tags] || self.class.allowed_tags
175
+ end
176
+
177
+ def allowed_attributes(options)
178
+ options[:attributes] || self.class.allowed_attributes
179
+ end
180
+ end
15
181
  end
16
182
 
17
- def properly_encode(fragment, options)
18
- fragment.xml? ? fragment.to_xml(options) : fragment.to_html(options)
183
+ module Serializer
184
+ module UTF8Encode
185
+ def serialize(fragment)
186
+ properly_encode(fragment, encoding: "UTF-8")
187
+ end
188
+ end
19
189
  end
20
190
  end
191
+ end
21
192
 
22
- # === Rails::Html::FullSanitizer
23
- # Removes all tags but strips out scripts, forms and comments.
24
- #
25
- # full_sanitizer = Rails::Html::FullSanitizer.new
26
- # full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
27
- # # => Bold no more! See more here...
28
- class FullSanitizer < Sanitizer
29
- def sanitize(html, options = {})
30
- return unless html
31
- return html if html.empty?
193
+ module HTML4
194
+ module Sanitizer
195
+ module VendorMethods
196
+ def full_sanitizer
197
+ Rails::HTML4::FullSanitizer
198
+ end
32
199
 
33
- loofah_fragment = Loofah.fragment(html)
200
+ def link_sanitizer
201
+ Rails::HTML4::LinkSanitizer
202
+ end
34
203
 
35
- remove_xpaths(loofah_fragment, XPATHS_TO_REMOVE)
36
- loofah_fragment.scrub!(TextOnlyScrubber.new)
204
+ def safe_list_sanitizer
205
+ Rails::HTML4::SafeListSanitizer
206
+ end
37
207
 
38
- properly_encode(loofah_fragment, encoding: 'UTF-8')
208
+ def white_list_sanitizer # :nodoc:
209
+ safe_list_sanitizer
210
+ end
39
211
  end
212
+
213
+ extend VendorMethods
40
214
  end
41
215
 
42
- # === Rails::Html::LinkSanitizer
43
- # Removes +a+ tags and +href+ attributes leaving only the link text.
216
+ # == Rails::HTML4::FullSanitizer
44
217
  #
45
- # link_sanitizer = Rails::Html::LinkSanitizer.new
46
- # link_sanitizer.sanitize('<a href="example.com">Only the link text will be kept.</a>')
218
+ # Removes all tags from HTML4 but strips out scripts, forms and comments.
47
219
  #
48
- # => 'Only the link text will be kept.'
49
- class LinkSanitizer < Sanitizer
50
- def initialize
51
- @link_scrubber = TargetScrubber.new
52
- @link_scrubber.tags = %w(a)
53
- @link_scrubber.attributes = %w(href)
54
- end
220
+ # full_sanitizer = Rails::HTML4::FullSanitizer.new
221
+ # full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
222
+ # # => "Bold no more! See more here..."
223
+ #
224
+ class FullSanitizer < Rails::HTML::Sanitizer
225
+ include HTML::Concern::ComposedSanitize
226
+ include HTML::Concern::Parser::HTML4
227
+ include HTML::Concern::Scrubber::Full
228
+ include HTML::Concern::Serializer::UTF8Encode
229
+ end
55
230
 
56
- def sanitize(html, options = {})
57
- Loofah.scrub_fragment(html, @link_scrubber).to_s
58
- end
231
+ # == Rails::HTML4::LinkSanitizer
232
+ #
233
+ # Removes +a+ tags and +href+ attributes from HTML4 leaving only the link text.
234
+ #
235
+ # link_sanitizer = Rails::HTML4::LinkSanitizer.new
236
+ # link_sanitizer.sanitize('<a href="example.com">Only the link text will be kept.</a>')
237
+ # # => "Only the link text will be kept."
238
+ #
239
+ class LinkSanitizer < Rails::HTML::Sanitizer
240
+ include HTML::Concern::ComposedSanitize
241
+ include HTML::Concern::Parser::HTML4
242
+ include HTML::Concern::Scrubber::Link
243
+ include HTML::Concern::Serializer::UTF8Encode
59
244
  end
60
245
 
61
- # === Rails::Html::SafeListSanitizer
62
- # Sanitizes html and css from an extensive safe list (see link further down).
246
+ # == Rails::HTML4::SafeListSanitizer
247
+ #
248
+ # Sanitizes HTML4 and CSS from an extensive safe list.
63
249
  #
64
250
  # === Whitespace
65
- # We can't make any guarantees about whitespace being kept or stripped.
66
- # Loofah uses Nokogiri, which wraps either a C or Java parser for the
67
- # respective Ruby implementation.
68
- # Those two parsers determine how whitespace is ultimately handled.
69
251
  #
70
- # When the stripped markup will be rendered the users browser won't take
71
- # whitespace into account anyway. It might be better to suggest your users
72
- # wrap their whitespace sensitive content in pre tags or that you do
73
- # so automatically.
252
+ # We can't make any guarantees about whitespace being kept or stripped. Loofah uses Nokogiri,
253
+ # which wraps either a C or Java parser for the respective Ruby implementation. Those two
254
+ # parsers determine how whitespace is ultimately handled.
255
+ #
256
+ # When the stripped markup is rendered, the user's browser won't take whitespace into account
257
+ # anyway. It might be better to suggest your users wrap their whitespace sensitive content in
258
+ # pre tags or that you do so automatically.
74
259
  #
75
260
  # === Options
76
- # Sanitizes both html and css via the safe lists found here:
77
- # https://github.com/flavorjones/loofah/blob/master/lib/loofah/html5/safelist.rb
78
261
  #
79
- # SafeListSanitizer also accepts options to configure
80
- # the safe list used when sanitizing html.
262
+ # Sanitizes both html and css via the safe lists found in
263
+ # Rails::HTML::Concern::Scrubber::SafeList
264
+ #
265
+ # SafeListSanitizer also accepts options to configure the safe list used when sanitizing html.
81
266
  # There's a class level option:
82
- # Rails::Html::SafeListSanitizer.allowed_tags = %w(table tr td)
83
- # Rails::Html::SafeListSanitizer.allowed_attributes = %w(id class style)
84
267
  #
85
- # Tags and attributes can also be passed to +sanitize+.
86
- # Passed options take precedence over the class level options.
268
+ # Rails::HTML4::SafeListSanitizer.allowed_tags = %w(table tr td)
269
+ # Rails::HTML4::SafeListSanitizer.allowed_attributes = %w(id class style)
270
+ #
271
+ # Tags and attributes can also be passed to +sanitize+. Passed options take precedence over the
272
+ # class level options.
87
273
  #
88
274
  # === Examples
89
- # safe_list_sanitizer = Rails::Html::SafeListSanitizer.new
90
275
  #
91
- # Sanitize css doesn't take options
92
- # safe_list_sanitizer.sanitize_css('background-color: #000;')
276
+ # safe_list_sanitizer = Rails::HTML4::SafeListSanitizer.new
93
277
  #
94
- # Default: sanitize via a extensive safe list of allowed elements
95
- # safe_list_sanitizer.sanitize(@article.body)
278
+ # # default: sanitize via an extensive safe list of allowed elements
279
+ # safe_list_sanitizer.sanitize(@article.body)
96
280
  #
97
- # Safe list via the supplied tags and attributes
98
- # safe_list_sanitizer.sanitize(@article.body, tags: %w(table tr td),
99
- # attributes: %w(id class style))
281
+ # # sanitize via the supplied tags and attributes
282
+ # safe_list_sanitizer.sanitize(
283
+ # @article.body,
284
+ # tags: %w(table tr td),
285
+ # attributes: %w(id class style),
286
+ # )
100
287
  #
101
- # Safe list via a custom scrubber
102
- # safe_list_sanitizer.sanitize(@article.body, scrubber: ArticleScrubber.new)
103
- class SafeListSanitizer < Sanitizer
104
- class << self
105
- attr_accessor :allowed_tags
106
- attr_accessor :allowed_attributes
107
- end
108
- self.allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
109
- sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dl dt dd abbr
110
- acronym a img blockquote del ins))
111
- self.allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))
112
-
113
- def initialize(prune: false)
114
- @permit_scrubber = PermitScrubber.new(prune: prune)
115
- end
116
-
117
- def sanitize(html, options = {})
118
- return unless html
119
- return html if html.empty?
288
+ # # sanitize via a custom Loofah scrubber
289
+ # safe_list_sanitizer.sanitize(@article.body, scrubber: ArticleScrubber.new)
290
+ #
291
+ # # prune nodes from the tree instead of stripping tags and leaving inner content
292
+ # safe_list_sanitizer = Rails::HTML4::SafeListSanitizer.new(prune: true)
293
+ #
294
+ # # the sanitizer can also sanitize CSS
295
+ # safe_list_sanitizer.sanitize_css('background-color: #000;')
296
+ #
297
+ class SafeListSanitizer < Rails::HTML::Sanitizer
298
+ include HTML::Concern::ComposedSanitize
299
+ include HTML::Concern::Parser::HTML4
300
+ include HTML::Concern::Scrubber::SafeList
301
+ include HTML::Concern::Serializer::UTF8Encode
302
+ end
303
+ end
120
304
 
121
- loofah_fragment = Loofah.fragment(html)
305
+ module HTML5
306
+ class Sanitizer
307
+ class << self
308
+ def full_sanitizer
309
+ Rails::HTML5::FullSanitizer
310
+ end
122
311
 
123
- if scrubber = options[:scrubber]
124
- # No duck typing, Loofah ensures subclass of Loofah::Scrubber
125
- loofah_fragment.scrub!(scrubber)
126
- elsif allowed_tags(options) || allowed_attributes(options)
127
- @permit_scrubber.tags = allowed_tags(options)
128
- @permit_scrubber.attributes = allowed_attributes(options)
129
- loofah_fragment.scrub!(@permit_scrubber)
130
- else
131
- remove_xpaths(loofah_fragment, XPATHS_TO_REMOVE)
132
- loofah_fragment.scrub!(:strip)
312
+ def link_sanitizer
313
+ Rails::HTML5::LinkSanitizer
133
314
  end
134
315
 
135
- properly_encode(loofah_fragment, encoding: 'UTF-8')
136
- end
316
+ def safe_list_sanitizer
317
+ Rails::HTML5::SafeListSanitizer
318
+ end
137
319
 
138
- def sanitize_css(style_string)
139
- Loofah::HTML5::Scrub.scrub_css(style_string)
320
+ def white_list_sanitizer # :nodoc:
321
+ safe_list_sanitizer
322
+ end
140
323
  end
324
+ end
141
325
 
142
- private
326
+ # == Rails::HTML5::FullSanitizer
327
+ #
328
+ # Removes all tags from HTML5 but strips out scripts, forms and comments.
329
+ #
330
+ # full_sanitizer = Rails::HTML5::FullSanitizer.new
331
+ # full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
332
+ # # => "Bold no more! See more here..."
333
+ #
334
+ class FullSanitizer < Rails::HTML::Sanitizer
335
+ include HTML::Concern::ComposedSanitize
336
+ include HTML::Concern::Parser::HTML5
337
+ include HTML::Concern::Scrubber::Full
338
+ include HTML::Concern::Serializer::UTF8Encode
339
+ end
143
340
 
144
- def allowed_tags(options)
145
- options[:tags] || self.class.allowed_tags
146
- end
341
+ # == Rails::HTML5::LinkSanitizer
342
+ #
343
+ # Removes +a+ tags and +href+ attributes from HTML5 leaving only the link text.
344
+ #
345
+ # link_sanitizer = Rails::HTML5::LinkSanitizer.new
346
+ # link_sanitizer.sanitize('<a href="example.com">Only the link text will be kept.</a>')
347
+ # # => "Only the link text will be kept."
348
+ #
349
+ class LinkSanitizer < Rails::HTML::Sanitizer
350
+ include HTML::Concern::ComposedSanitize
351
+ include HTML::Concern::Parser::HTML5
352
+ include HTML::Concern::Scrubber::Link
353
+ include HTML::Concern::Serializer::UTF8Encode
354
+ end
147
355
 
148
- def allowed_attributes(options)
149
- options[:attributes] || self.class.allowed_attributes
150
- end
356
+ # == Rails::HTML5::SafeListSanitizer
357
+ #
358
+ # Sanitizes HTML5 and CSS from an extensive safe list.
359
+ #
360
+ # === Whitespace
361
+ #
362
+ # We can't make any guarantees about whitespace being kept or stripped. Loofah uses Nokogiri,
363
+ # which wraps either a C or Java parser for the respective Ruby implementation. Those two
364
+ # parsers determine how whitespace is ultimately handled.
365
+ #
366
+ # When the stripped markup is rendered, the user's browser won't take whitespace into account
367
+ # anyway. It might be better to suggest your users wrap their whitespace sensitive content in
368
+ # pre tags or that you do so automatically.
369
+ #
370
+ # === Options
371
+ #
372
+ # Sanitizes both html and css via the safe lists found in
373
+ # Rails::HTML::Concern::Scrubber::SafeList
374
+ #
375
+ # SafeListSanitizer also accepts options to configure the safe list used when sanitizing html.
376
+ # There's a class level option:
377
+ #
378
+ # Rails::HTML5::SafeListSanitizer.allowed_tags = %w(table tr td)
379
+ # Rails::HTML5::SafeListSanitizer.allowed_attributes = %w(id class style)
380
+ #
381
+ # Tags and attributes can also be passed to +sanitize+. Passed options take precedence over the
382
+ # class level options.
383
+ #
384
+ # === Examples
385
+ #
386
+ # safe_list_sanitizer = Rails::HTML5::SafeListSanitizer.new
387
+ #
388
+ # # default: sanitize via an extensive safe list of allowed elements
389
+ # safe_list_sanitizer.sanitize(@article.body)
390
+ #
391
+ # # sanitize via the supplied tags and attributes
392
+ # safe_list_sanitizer.sanitize(
393
+ # @article.body,
394
+ # tags: %w(table tr td),
395
+ # attributes: %w(id class style),
396
+ # )
397
+ #
398
+ # # sanitize via a custom Loofah scrubber
399
+ # safe_list_sanitizer.sanitize(@article.body, scrubber: ArticleScrubber.new)
400
+ #
401
+ # # prune nodes from the tree instead of stripping tags and leaving inner content
402
+ # safe_list_sanitizer = Rails::HTML5::SafeListSanitizer.new(prune: true)
403
+ #
404
+ # # the sanitizer can also sanitize CSS
405
+ # safe_list_sanitizer.sanitize_css('background-color: #000;')
406
+ #
407
+ class SafeListSanitizer < Rails::HTML::Sanitizer
408
+ include HTML::Concern::ComposedSanitize
409
+ include HTML::Concern::Parser::HTML5
410
+ include HTML::Concern::Scrubber::SafeList
411
+ include HTML::Concern::Serializer::UTF8Encode
151
412
  end
413
+ end if Rails::HTML::Sanitizer.html5_support?
152
414
 
153
- WhiteListSanitizer = SafeListSanitizer
415
+ module HTML
416
+ Sanitizer.extend(HTML4::Sanitizer::VendorMethods) # :nodoc:
417
+ FullSanitizer = HTML4::FullSanitizer # :nodoc:
418
+ LinkSanitizer = HTML4::LinkSanitizer # :nodoc:
419
+ SafeListSanitizer = HTML4::SafeListSanitizer # :nodoc:
420
+ WhiteListSanitizer = SafeListSanitizer # :nodoc:
154
421
  end
155
422
  end