rails-html-sanitizer 1.4.3 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of rails-html-sanitizer might be problematic. Click here for more details.

@@ -1,172 +1,422 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Rails
2
- module Html
3
- XPATHS_TO_REMOVE = %w{.//script .//form comment()}
4
+ module HTML
5
+ class Sanitizer
6
+ class << self
7
+ def html5_support?
8
+ return @html5_support if defined?(@html5_support)
9
+
10
+ @html5_support = Loofah.respond_to?(:html5_support?) && Loofah.html5_support?
11
+ end
12
+
13
+ def best_supported_vendor
14
+ html5_support? ? Rails::HTML5::Sanitizer : Rails::HTML4::Sanitizer
15
+ end
16
+ end
4
17
 
5
- class Sanitizer # :nodoc:
6
18
  def sanitize(html, options = {})
7
19
  raise NotImplementedError, "subclasses must implement sanitize method."
8
20
  end
9
21
 
10
22
  private
23
+ def remove_xpaths(node, xpaths)
24
+ node.xpath(*xpaths).remove
25
+ node
26
+ end
27
+
28
+ def properly_encode(fragment, options)
29
+ fragment.xml? ? fragment.to_xml(options) : fragment.to_html(options)
30
+ end
31
+ end
11
32
 
12
- def remove_xpaths(node, xpaths)
13
- node.xpath(*xpaths).remove
14
- node
33
+ module Concern
34
+ module ComposedSanitize
35
+ def sanitize(html, options = {})
36
+ return unless html
37
+ return html if html.empty?
38
+
39
+ serialize(scrub(parse_fragment(html), options))
40
+ end
15
41
  end
16
42
 
17
- def properly_encode(fragment, options)
18
- fragment.xml? ? fragment.to_xml(options) : fragment.to_html(options)
43
+ module Parser
44
+ module HTML4
45
+ def parse_fragment(html)
46
+ Loofah.html4_fragment(html)
47
+ end
48
+ end
49
+
50
+ module HTML5
51
+ def parse_fragment(html)
52
+ Loofah.html5_fragment(html)
53
+ end
54
+ end if Rails::HTML::Sanitizer.html5_support?
55
+ end
56
+
57
+ module Scrubber
58
+ module Full
59
+ def scrub(fragment, options = {})
60
+ fragment.scrub!(TextOnlyScrubber.new)
61
+ end
62
+ end
63
+
64
+ module Link
65
+ def initialize
66
+ super
67
+ @link_scrubber = TargetScrubber.new
68
+ @link_scrubber.tags = %w(a)
69
+ @link_scrubber.attributes = %w(href)
70
+ end
71
+
72
+ def scrub(fragment, options = {})
73
+ fragment.scrub!(@link_scrubber)
74
+ end
75
+ end
76
+
77
+ module SafeList
78
+ # The default safe list for tags
79
+ DEFAULT_ALLOWED_TAGS = Set.new([
80
+ "a",
81
+ "abbr",
82
+ "acronym",
83
+ "address",
84
+ "b",
85
+ "big",
86
+ "blockquote",
87
+ "br",
88
+ "cite",
89
+ "code",
90
+ "dd",
91
+ "del",
92
+ "dfn",
93
+ "div",
94
+ "dl",
95
+ "dt",
96
+ "em",
97
+ "h1",
98
+ "h2",
99
+ "h3",
100
+ "h4",
101
+ "h5",
102
+ "h6",
103
+ "hr",
104
+ "i",
105
+ "img",
106
+ "ins",
107
+ "kbd",
108
+ "li",
109
+ "ol",
110
+ "p",
111
+ "pre",
112
+ "samp",
113
+ "small",
114
+ "span",
115
+ "strong",
116
+ "sub",
117
+ "sup",
118
+ "time",
119
+ "tt",
120
+ "ul",
121
+ "var",
122
+ ]).freeze
123
+
124
+ # The default safe list for attributes
125
+ DEFAULT_ALLOWED_ATTRIBUTES = Set.new([
126
+ "abbr",
127
+ "alt",
128
+ "cite",
129
+ "class",
130
+ "datetime",
131
+ "height",
132
+ "href",
133
+ "lang",
134
+ "name",
135
+ "src",
136
+ "title",
137
+ "width",
138
+ "xml:lang",
139
+ ]).freeze
140
+
141
+ def self.included(klass)
142
+ class << klass
143
+ attr_accessor :allowed_tags
144
+ attr_accessor :allowed_attributes
145
+ end
146
+
147
+ klass.allowed_tags = DEFAULT_ALLOWED_TAGS.dup
148
+ klass.allowed_attributes = DEFAULT_ALLOWED_ATTRIBUTES.dup
149
+ end
150
+
151
+ def initialize(prune: false)
152
+ @permit_scrubber = PermitScrubber.new(prune: prune)
153
+ end
154
+
155
+ def scrub(fragment, options = {})
156
+ if scrubber = options[:scrubber]
157
+ # No duck typing, Loofah ensures subclass of Loofah::Scrubber
158
+ fragment.scrub!(scrubber)
159
+ elsif allowed_tags(options) || allowed_attributes(options)
160
+ @permit_scrubber.tags = allowed_tags(options)
161
+ @permit_scrubber.attributes = allowed_attributes(options)
162
+ fragment.scrub!(@permit_scrubber)
163
+ else
164
+ fragment.scrub!(:strip)
165
+ end
166
+ end
167
+
168
+ def sanitize_css(style_string)
169
+ Loofah::HTML5::Scrub.scrub_css(style_string)
170
+ end
171
+
172
+ private
173
+ def allowed_tags(options)
174
+ options[:tags] || self.class.allowed_tags
175
+ end
176
+
177
+ def allowed_attributes(options)
178
+ options[:attributes] || self.class.allowed_attributes
179
+ end
180
+ end
181
+ end
182
+
183
+ module Serializer
184
+ module UTF8Encode
185
+ def serialize(fragment)
186
+ properly_encode(fragment, encoding: "UTF-8")
187
+ end
188
+ end
19
189
  end
20
190
  end
191
+ end
21
192
 
22
- # === Rails::Html::FullSanitizer
23
- # Removes all tags but strips out scripts, forms and comments.
24
- #
25
- # full_sanitizer = Rails::Html::FullSanitizer.new
26
- # full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
27
- # # => Bold no more! See more here...
28
- class FullSanitizer < Sanitizer
29
- def sanitize(html, options = {})
30
- return unless html
31
- return html if html.empty?
193
+ module HTML4
194
+ module Sanitizer
195
+ module VendorMethods
196
+ def full_sanitizer
197
+ Rails::HTML4::FullSanitizer
198
+ end
32
199
 
33
- loofah_fragment = Loofah.fragment(html)
200
+ def link_sanitizer
201
+ Rails::HTML4::LinkSanitizer
202
+ end
34
203
 
35
- remove_xpaths(loofah_fragment, XPATHS_TO_REMOVE)
36
- loofah_fragment.scrub!(TextOnlyScrubber.new)
204
+ def safe_list_sanitizer
205
+ Rails::HTML4::SafeListSanitizer
206
+ end
37
207
 
38
- properly_encode(loofah_fragment, encoding: 'UTF-8')
208
+ def white_list_sanitizer # :nodoc:
209
+ safe_list_sanitizer
210
+ end
39
211
  end
212
+
213
+ extend VendorMethods
40
214
  end
41
215
 
42
- # === Rails::Html::LinkSanitizer
43
- # Removes +a+ tags and +href+ attributes leaving only the link text.
216
+ # == Rails::HTML4::FullSanitizer
44
217
  #
45
- # link_sanitizer = Rails::Html::LinkSanitizer.new
46
- # link_sanitizer.sanitize('<a href="example.com">Only the link text will be kept.</a>')
218
+ # Removes all tags from HTML4 but strips out scripts, forms and comments.
47
219
  #
48
- # => 'Only the link text will be kept.'
49
- class LinkSanitizer < Sanitizer
50
- def initialize
51
- @link_scrubber = TargetScrubber.new
52
- @link_scrubber.tags = %w(a)
53
- @link_scrubber.attributes = %w(href)
54
- end
220
+ # full_sanitizer = Rails::HTML4::FullSanitizer.new
221
+ # full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
222
+ # # => "Bold no more! See more here..."
223
+ #
224
+ class FullSanitizer < Rails::HTML::Sanitizer
225
+ include HTML::Concern::ComposedSanitize
226
+ include HTML::Concern::Parser::HTML4
227
+ include HTML::Concern::Scrubber::Full
228
+ include HTML::Concern::Serializer::UTF8Encode
229
+ end
55
230
 
56
- def sanitize(html, options = {})
57
- Loofah.scrub_fragment(html, @link_scrubber).to_s
58
- end
231
+ # == Rails::HTML4::LinkSanitizer
232
+ #
233
+ # Removes +a+ tags and +href+ attributes from HTML4 leaving only the link text.
234
+ #
235
+ # link_sanitizer = Rails::HTML4::LinkSanitizer.new
236
+ # link_sanitizer.sanitize('<a href="example.com">Only the link text will be kept.</a>')
237
+ # # => "Only the link text will be kept."
238
+ #
239
+ class LinkSanitizer < Rails::HTML::Sanitizer
240
+ include HTML::Concern::ComposedSanitize
241
+ include HTML::Concern::Parser::HTML4
242
+ include HTML::Concern::Scrubber::Link
243
+ include HTML::Concern::Serializer::UTF8Encode
59
244
  end
60
245
 
61
- # === Rails::Html::SafeListSanitizer
62
- # Sanitizes html and css from an extensive safe list (see link further down).
246
+ # == Rails::HTML4::SafeListSanitizer
247
+ #
248
+ # Sanitizes HTML4 and CSS from an extensive safe list.
63
249
  #
64
250
  # === Whitespace
65
- # We can't make any guarantees about whitespace being kept or stripped.
66
- # Loofah uses Nokogiri, which wraps either a C or Java parser for the
67
- # respective Ruby implementation.
68
- # Those two parsers determine how whitespace is ultimately handled.
69
251
  #
70
- # When the stripped markup will be rendered the users browser won't take
71
- # whitespace into account anyway. It might be better to suggest your users
72
- # wrap their whitespace sensitive content in pre tags or that you do
73
- # so automatically.
252
+ # We can't make any guarantees about whitespace being kept or stripped. Loofah uses Nokogiri,
253
+ # which wraps either a C or Java parser for the respective Ruby implementation. Those two
254
+ # parsers determine how whitespace is ultimately handled.
255
+ #
256
+ # When the stripped markup is rendered, the user's browser won't take whitespace into account
257
+ # anyway. It might be better to suggest your users wrap their whitespace sensitive content in
258
+ # pre tags or that you do so automatically.
74
259
  #
75
260
  # === Options
76
- # Sanitizes both html and css via the safe lists found here:
77
- # https://github.com/flavorjones/loofah/blob/master/lib/loofah/html5/safelist.rb
78
261
  #
79
- # SafeListSanitizer also accepts options to configure
80
- # the safe list used when sanitizing html.
262
+ # Sanitizes both html and css via the safe lists found in
263
+ # Rails::HTML::Concern::Scrubber::SafeList
264
+ #
265
+ # SafeListSanitizer also accepts options to configure the safe list used when sanitizing html.
81
266
  # There's a class level option:
82
- # Rails::Html::SafeListSanitizer.allowed_tags = %w(table tr td)
83
- # Rails::Html::SafeListSanitizer.allowed_attributes = %w(id class style)
84
267
  #
85
- # Tags and attributes can also be passed to +sanitize+.
86
- # Passed options take precedence over the class level options.
268
+ # Rails::HTML4::SafeListSanitizer.allowed_tags = %w(table tr td)
269
+ # Rails::HTML4::SafeListSanitizer.allowed_attributes = %w(id class style)
270
+ #
271
+ # Tags and attributes can also be passed to +sanitize+. Passed options take precedence over the
272
+ # class level options.
87
273
  #
88
274
  # === Examples
89
- # safe_list_sanitizer = Rails::Html::SafeListSanitizer.new
90
275
  #
91
- # Sanitize css doesn't take options
92
- # safe_list_sanitizer.sanitize_css('background-color: #000;')
276
+ # safe_list_sanitizer = Rails::HTML4::SafeListSanitizer.new
93
277
  #
94
- # Default: sanitize via a extensive safe list of allowed elements
95
- # safe_list_sanitizer.sanitize(@article.body)
278
+ # # default: sanitize via an extensive safe list of allowed elements
279
+ # safe_list_sanitizer.sanitize(@article.body)
96
280
  #
97
- # Safe list via the supplied tags and attributes
98
- # safe_list_sanitizer.sanitize(@article.body, tags: %w(table tr td),
99
- # attributes: %w(id class style))
281
+ # # sanitize via the supplied tags and attributes
282
+ # safe_list_sanitizer.sanitize(
283
+ # @article.body,
284
+ # tags: %w(table tr td),
285
+ # attributes: %w(id class style),
286
+ # )
100
287
  #
101
- # Safe list via a custom scrubber
102
- # safe_list_sanitizer.sanitize(@article.body, scrubber: ArticleScrubber.new)
103
- class SafeListSanitizer < Sanitizer
104
- class << self
105
- attr_accessor :allowed_tags
106
- attr_accessor :allowed_attributes
107
- end
108
- self.allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
109
- sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dl dt dd abbr
110
- acronym a img blockquote del ins))
111
- self.allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))
112
-
113
- def initialize
114
- @permit_scrubber = PermitScrubber.new
115
- end
116
-
117
- def sanitize(html, options = {})
118
- return unless html
119
- return html if html.empty?
120
-
121
- loofah_fragment = Loofah.fragment(html)
288
+ # # sanitize via a custom Loofah scrubber
289
+ # safe_list_sanitizer.sanitize(@article.body, scrubber: ArticleScrubber.new)
290
+ #
291
+ # # prune nodes from the tree instead of stripping tags and leaving inner content
292
+ # safe_list_sanitizer = Rails::HTML4::SafeListSanitizer.new(prune: true)
293
+ #
294
+ # # the sanitizer can also sanitize CSS
295
+ # safe_list_sanitizer.sanitize_css('background-color: #000;')
296
+ #
297
+ class SafeListSanitizer < Rails::HTML::Sanitizer
298
+ include HTML::Concern::ComposedSanitize
299
+ include HTML::Concern::Parser::HTML4
300
+ include HTML::Concern::Scrubber::SafeList
301
+ include HTML::Concern::Serializer::UTF8Encode
302
+ end
303
+ end
122
304
 
123
- if scrubber = options[:scrubber]
124
- # No duck typing, Loofah ensures subclass of Loofah::Scrubber
125
- loofah_fragment.scrub!(scrubber)
126
- elsif allowed_tags(options) || allowed_attributes(options)
127
- @permit_scrubber.tags = allowed_tags(options)
128
- @permit_scrubber.attributes = allowed_attributes(options)
129
- loofah_fragment.scrub!(@permit_scrubber)
130
- else
131
- remove_xpaths(loofah_fragment, XPATHS_TO_REMOVE)
132
- loofah_fragment.scrub!(:strip)
305
+ module HTML5
306
+ class Sanitizer
307
+ class << self
308
+ def full_sanitizer
309
+ Rails::HTML5::FullSanitizer
133
310
  end
134
311
 
135
- properly_encode(loofah_fragment, encoding: 'UTF-8')
136
- end
137
-
138
- def sanitize_css(style_string)
139
- Loofah::HTML5::Scrub.scrub_css(style_string)
140
- end
141
-
142
- private
143
-
144
- def loofah_using_html5?
145
- # future-proofing, see https://github.com/flavorjones/loofah/pull/239
146
- Loofah.respond_to?(:html5_mode?) && Loofah.html5_mode?
147
- end
312
+ def link_sanitizer
313
+ Rails::HTML5::LinkSanitizer
314
+ end
148
315
 
149
- def remove_safelist_tag_combinations(tags)
150
- if !loofah_using_html5? && tags.include?("select") && tags.include?("style")
151
- warn("WARNING: #{self.class}: removing 'style' from safelist, should not be combined with 'select'")
152
- tags.delete("style")
316
+ def safe_list_sanitizer
317
+ Rails::HTML5::SafeListSanitizer
153
318
  end
154
- tags
155
- end
156
319
 
157
- def allowed_tags(options)
158
- if options[:tags]
159
- remove_safelist_tag_combinations(options[:tags])
160
- else
161
- self.class.allowed_tags
320
+ def white_list_sanitizer # :nodoc:
321
+ safe_list_sanitizer
162
322
  end
163
323
  end
324
+ end
164
325
 
165
- def allowed_attributes(options)
166
- options[:attributes] || self.class.allowed_attributes
167
- end
326
+ # == Rails::HTML5::FullSanitizer
327
+ #
328
+ # Removes all tags from HTML5 but strips out scripts, forms and comments.
329
+ #
330
+ # full_sanitizer = Rails::HTML5::FullSanitizer.new
331
+ # full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
332
+ # # => "Bold no more! See more here..."
333
+ #
334
+ class FullSanitizer < Rails::HTML::Sanitizer
335
+ include HTML::Concern::ComposedSanitize
336
+ include HTML::Concern::Parser::HTML5
337
+ include HTML::Concern::Scrubber::Full
338
+ include HTML::Concern::Serializer::UTF8Encode
339
+ end
340
+
341
+ # == Rails::HTML5::LinkSanitizer
342
+ #
343
+ # Removes +a+ tags and +href+ attributes from HTML5 leaving only the link text.
344
+ #
345
+ # link_sanitizer = Rails::HTML5::LinkSanitizer.new
346
+ # link_sanitizer.sanitize('<a href="example.com">Only the link text will be kept.</a>')
347
+ # # => "Only the link text will be kept."
348
+ #
349
+ class LinkSanitizer < Rails::HTML::Sanitizer
350
+ include HTML::Concern::ComposedSanitize
351
+ include HTML::Concern::Parser::HTML5
352
+ include HTML::Concern::Scrubber::Link
353
+ include HTML::Concern::Serializer::UTF8Encode
354
+ end
355
+
356
+ # == Rails::HTML5::SafeListSanitizer
357
+ #
358
+ # Sanitizes HTML5 and CSS from an extensive safe list.
359
+ #
360
+ # === Whitespace
361
+ #
362
+ # We can't make any guarantees about whitespace being kept or stripped. Loofah uses Nokogiri,
363
+ # which wraps either a C or Java parser for the respective Ruby implementation. Those two
364
+ # parsers determine how whitespace is ultimately handled.
365
+ #
366
+ # When the stripped markup is rendered, the user's browser won't take whitespace into account
367
+ # anyway. It might be better to suggest your users wrap their whitespace sensitive content in
368
+ # pre tags or that you do so automatically.
369
+ #
370
+ # === Options
371
+ #
372
+ # Sanitizes both html and css via the safe lists found in
373
+ # Rails::HTML::Concern::Scrubber::SafeList
374
+ #
375
+ # SafeListSanitizer also accepts options to configure the safe list used when sanitizing html.
376
+ # There's a class level option:
377
+ #
378
+ # Rails::HTML5::SafeListSanitizer.allowed_tags = %w(table tr td)
379
+ # Rails::HTML5::SafeListSanitizer.allowed_attributes = %w(id class style)
380
+ #
381
+ # Tags and attributes can also be passed to +sanitize+. Passed options take precedence over the
382
+ # class level options.
383
+ #
384
+ # === Examples
385
+ #
386
+ # safe_list_sanitizer = Rails::HTML5::SafeListSanitizer.new
387
+ #
388
+ # # default: sanitize via an extensive safe list of allowed elements
389
+ # safe_list_sanitizer.sanitize(@article.body)
390
+ #
391
+ # # sanitize via the supplied tags and attributes
392
+ # safe_list_sanitizer.sanitize(
393
+ # @article.body,
394
+ # tags: %w(table tr td),
395
+ # attributes: %w(id class style),
396
+ # )
397
+ #
398
+ # # sanitize via a custom Loofah scrubber
399
+ # safe_list_sanitizer.sanitize(@article.body, scrubber: ArticleScrubber.new)
400
+ #
401
+ # # prune nodes from the tree instead of stripping tags and leaving inner content
402
+ # safe_list_sanitizer = Rails::HTML5::SafeListSanitizer.new(prune: true)
403
+ #
404
+ # # the sanitizer can also sanitize CSS
405
+ # safe_list_sanitizer.sanitize_css('background-color: #000;')
406
+ #
407
+ class SafeListSanitizer < Rails::HTML::Sanitizer
408
+ include HTML::Concern::ComposedSanitize
409
+ include HTML::Concern::Parser::HTML5
410
+ include HTML::Concern::Scrubber::SafeList
411
+ include HTML::Concern::Serializer::UTF8Encode
168
412
  end
413
+ end if Rails::HTML::Sanitizer.html5_support?
169
414
 
170
- WhiteListSanitizer = SafeListSanitizer
415
+ module HTML
416
+ Sanitizer.extend(HTML4::Sanitizer::VendorMethods) # :nodoc:
417
+ FullSanitizer = HTML4::FullSanitizer # :nodoc:
418
+ LinkSanitizer = HTML4::LinkSanitizer # :nodoc:
419
+ SafeListSanitizer = HTML4::SafeListSanitizer # :nodoc:
420
+ WhiteListSanitizer = SafeListSanitizer # :nodoc:
171
421
  end
172
422
  end