rails-html-sanitizer 1.5.0 → 1.6.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,155 +1,418 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Rails
2
- module Html
3
- XPATHS_TO_REMOVE = %w{.//script .//form comment()}
4
+ module HTML
5
+ class Sanitizer
6
+ class << self
7
+ def html5_support?
8
+ return @html5_support if defined?(@html5_support)
9
+
10
+ @html5_support = Loofah.respond_to?(:html5_support?) && Loofah.html5_support?
11
+ end
12
+ end
4
13
 
5
- class Sanitizer # :nodoc:
6
14
  def sanitize(html, options = {})
7
15
  raise NotImplementedError, "subclasses must implement sanitize method."
8
16
  end
9
17
 
10
18
  private
19
+ def remove_xpaths(node, xpaths)
20
+ node.xpath(*xpaths).remove
21
+ node
22
+ end
11
23
 
12
- def remove_xpaths(node, xpaths)
13
- node.xpath(*xpaths).remove
14
- node
24
+ def properly_encode(fragment, options)
25
+ fragment.xml? ? fragment.to_xml(options) : fragment.to_html(options)
26
+ end
27
+ end
28
+
29
+ module Concern
30
+ module ComposedSanitize
31
+ def sanitize(html, options = {})
32
+ return unless html
33
+ return html if html.empty?
34
+
35
+ serialize(scrub(parse_fragment(html), options))
36
+ end
15
37
  end
16
38
 
17
- def properly_encode(fragment, options)
18
- fragment.xml? ? fragment.to_xml(options) : fragment.to_html(options)
39
+ module Parser
40
+ module HTML4
41
+ def parse_fragment(html)
42
+ Loofah.html4_fragment(html)
43
+ end
44
+ end
45
+
46
+ module HTML5
47
+ def parse_fragment(html)
48
+ Loofah.html5_fragment(html)
49
+ end
50
+ end if Rails::HTML::Sanitizer.html5_support?
51
+ end
52
+
53
+ module Scrubber
54
+ module Full
55
+ def scrub(fragment, options = {})
56
+ fragment.scrub!(TextOnlyScrubber.new)
57
+ end
58
+ end
59
+
60
+ module Link
61
+ def initialize
62
+ super
63
+ @link_scrubber = TargetScrubber.new
64
+ @link_scrubber.tags = %w(a)
65
+ @link_scrubber.attributes = %w(href)
66
+ end
67
+
68
+ def scrub(fragment, options = {})
69
+ fragment.scrub!(@link_scrubber)
70
+ end
71
+ end
72
+
73
+ module SafeList
74
+ # The default safe list for tags
75
+ DEFAULT_ALLOWED_TAGS = Set.new([
76
+ "a",
77
+ "abbr",
78
+ "acronym",
79
+ "address",
80
+ "b",
81
+ "big",
82
+ "blockquote",
83
+ "br",
84
+ "cite",
85
+ "code",
86
+ "dd",
87
+ "del",
88
+ "dfn",
89
+ "div",
90
+ "dl",
91
+ "dt",
92
+ "em",
93
+ "h1",
94
+ "h2",
95
+ "h3",
96
+ "h4",
97
+ "h5",
98
+ "h6",
99
+ "hr",
100
+ "i",
101
+ "img",
102
+ "ins",
103
+ "kbd",
104
+ "li",
105
+ "ol",
106
+ "p",
107
+ "pre",
108
+ "samp",
109
+ "small",
110
+ "span",
111
+ "strong",
112
+ "sub",
113
+ "sup",
114
+ "time",
115
+ "tt",
116
+ "ul",
117
+ "var",
118
+ ]).freeze
119
+
120
+ # The default safe list for attributes
121
+ DEFAULT_ALLOWED_ATTRIBUTES = Set.new([
122
+ "abbr",
123
+ "alt",
124
+ "cite",
125
+ "class",
126
+ "datetime",
127
+ "height",
128
+ "href",
129
+ "lang",
130
+ "name",
131
+ "src",
132
+ "title",
133
+ "width",
134
+ "xml:lang",
135
+ ]).freeze
136
+
137
+ def self.included(klass)
138
+ class << klass
139
+ attr_accessor :allowed_tags
140
+ attr_accessor :allowed_attributes
141
+ end
142
+
143
+ klass.allowed_tags = DEFAULT_ALLOWED_TAGS.dup
144
+ klass.allowed_attributes = DEFAULT_ALLOWED_ATTRIBUTES.dup
145
+ end
146
+
147
+ def initialize(prune: false)
148
+ @permit_scrubber = PermitScrubber.new(prune: prune)
149
+ end
150
+
151
+ def scrub(fragment, options = {})
152
+ if scrubber = options[:scrubber]
153
+ # No duck typing, Loofah ensures subclass of Loofah::Scrubber
154
+ fragment.scrub!(scrubber)
155
+ elsif allowed_tags(options) || allowed_attributes(options)
156
+ @permit_scrubber.tags = allowed_tags(options)
157
+ @permit_scrubber.attributes = allowed_attributes(options)
158
+ fragment.scrub!(@permit_scrubber)
159
+ else
160
+ fragment.scrub!(:strip)
161
+ end
162
+ end
163
+
164
+ def sanitize_css(style_string)
165
+ Loofah::HTML5::Scrub.scrub_css(style_string)
166
+ end
167
+
168
+ private
169
+ def allowed_tags(options)
170
+ options[:tags] || self.class.allowed_tags
171
+ end
172
+
173
+ def allowed_attributes(options)
174
+ options[:attributes] || self.class.allowed_attributes
175
+ end
176
+ end
177
+ end
178
+
179
+ module Serializer
180
+ module UTF8Encode
181
+ def serialize(fragment)
182
+ properly_encode(fragment, encoding: "UTF-8")
183
+ end
184
+ end
19
185
  end
20
186
  end
187
+ end
21
188
 
22
- # === Rails::Html::FullSanitizer
23
- # Removes all tags but strips out scripts, forms and comments.
24
- #
25
- # full_sanitizer = Rails::Html::FullSanitizer.new
26
- # full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
27
- # # => Bold no more! See more here...
28
- class FullSanitizer < Sanitizer
29
- def sanitize(html, options = {})
30
- return unless html
31
- return html if html.empty?
189
+ module HTML4
190
+ module Sanitizer
191
+ module VendorMethods
192
+ def full_sanitizer
193
+ Rails::HTML4::FullSanitizer
194
+ end
32
195
 
33
- loofah_fragment = Loofah.fragment(html)
196
+ def link_sanitizer
197
+ Rails::HTML4::LinkSanitizer
198
+ end
34
199
 
35
- remove_xpaths(loofah_fragment, XPATHS_TO_REMOVE)
36
- loofah_fragment.scrub!(TextOnlyScrubber.new)
200
+ def safe_list_sanitizer
201
+ Rails::HTML4::SafeListSanitizer
202
+ end
37
203
 
38
- properly_encode(loofah_fragment, encoding: 'UTF-8')
204
+ def white_list_sanitizer # :nodoc:
205
+ safe_list_sanitizer
206
+ end
39
207
  end
208
+
209
+ extend VendorMethods
40
210
  end
41
211
 
42
- # === Rails::Html::LinkSanitizer
43
- # Removes +a+ tags and +href+ attributes leaving only the link text.
212
+ # == Rails::HTML4::FullSanitizer
44
213
  #
45
- # link_sanitizer = Rails::Html::LinkSanitizer.new
46
- # link_sanitizer.sanitize('<a href="example.com">Only the link text will be kept.</a>')
214
+ # Removes all tags from HTML4 but strips out scripts, forms and comments.
47
215
  #
48
- # => 'Only the link text will be kept.'
49
- class LinkSanitizer < Sanitizer
50
- def initialize
51
- @link_scrubber = TargetScrubber.new
52
- @link_scrubber.tags = %w(a)
53
- @link_scrubber.attributes = %w(href)
54
- end
216
+ # full_sanitizer = Rails::HTML4::FullSanitizer.new
217
+ # full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
218
+ # # => "Bold no more! See more here..."
219
+ #
220
+ class FullSanitizer < Rails::HTML::Sanitizer
221
+ include HTML::Concern::ComposedSanitize
222
+ include HTML::Concern::Parser::HTML4
223
+ include HTML::Concern::Scrubber::Full
224
+ include HTML::Concern::Serializer::UTF8Encode
225
+ end
55
226
 
56
- def sanitize(html, options = {})
57
- Loofah.scrub_fragment(html, @link_scrubber).to_s
58
- end
227
+ # == Rails::HTML4::LinkSanitizer
228
+ #
229
+ # Removes +a+ tags and +href+ attributes from HTML4 leaving only the link text.
230
+ #
231
+ # link_sanitizer = Rails::HTML4::LinkSanitizer.new
232
+ # link_sanitizer.sanitize('<a href="example.com">Only the link text will be kept.</a>')
233
+ # # => "Only the link text will be kept."
234
+ #
235
+ class LinkSanitizer < Rails::HTML::Sanitizer
236
+ include HTML::Concern::ComposedSanitize
237
+ include HTML::Concern::Parser::HTML4
238
+ include HTML::Concern::Scrubber::Link
239
+ include HTML::Concern::Serializer::UTF8Encode
59
240
  end
60
241
 
61
- # === Rails::Html::SafeListSanitizer
62
- # Sanitizes html and css from an extensive safe list (see link further down).
242
+ # == Rails::HTML4::SafeListSanitizer
243
+ #
244
+ # Sanitizes HTML4 and CSS from an extensive safe list.
63
245
  #
64
246
  # === Whitespace
65
- # We can't make any guarantees about whitespace being kept or stripped.
66
- # Loofah uses Nokogiri, which wraps either a C or Java parser for the
67
- # respective Ruby implementation.
68
- # Those two parsers determine how whitespace is ultimately handled.
69
247
  #
70
- # When the stripped markup will be rendered the users browser won't take
71
- # whitespace into account anyway. It might be better to suggest your users
72
- # wrap their whitespace sensitive content in pre tags or that you do
73
- # so automatically.
248
+ # We can't make any guarantees about whitespace being kept or stripped. Loofah uses Nokogiri,
249
+ # which wraps either a C or Java parser for the respective Ruby implementation. Those two
250
+ # parsers determine how whitespace is ultimately handled.
251
+ #
252
+ # When the stripped markup is rendered, the user's browser won't take whitespace into account
253
+ # anyway. It might be better to suggest your users wrap their whitespace-sensitive content in
254
+ # pre tags or that you do so automatically.
74
255
  #
75
256
  # === Options
76
- # Sanitizes both html and css via the safe lists found here:
77
- # https://github.com/flavorjones/loofah/blob/master/lib/loofah/html5/safelist.rb
78
257
  #
79
- # SafeListSanitizer also accepts options to configure
80
- # the safe list used when sanitizing html.
258
+ # Sanitizes both html and css via the safe lists found in
259
+ # Rails::HTML::Concern::Scrubber::SafeList
260
+ #
261
+ # SafeListSanitizer also accepts options to configure the safe list used when sanitizing html.
81
262
  # There's a class level option:
82
- # Rails::Html::SafeListSanitizer.allowed_tags = %w(table tr td)
83
- # Rails::Html::SafeListSanitizer.allowed_attributes = %w(id class style)
84
263
  #
85
- # Tags and attributes can also be passed to +sanitize+.
86
- # Passed options take precedence over the class level options.
264
+ # Rails::HTML4::SafeListSanitizer.allowed_tags = %w(table tr td)
265
+ # Rails::HTML4::SafeListSanitizer.allowed_attributes = %w(id class style)
266
+ #
267
+ # Tags and attributes can also be passed to +sanitize+. Passed options take precedence over the
268
+ # class level options.
87
269
  #
88
270
  # === Examples
89
- # safe_list_sanitizer = Rails::Html::SafeListSanitizer.new
90
271
  #
91
- # Sanitize css doesn't take options
92
- # safe_list_sanitizer.sanitize_css('background-color: #000;')
272
+ # safe_list_sanitizer = Rails::HTML4::SafeListSanitizer.new
93
273
  #
94
- # Default: sanitize via a extensive safe list of allowed elements
95
- # safe_list_sanitizer.sanitize(@article.body)
274
+ # # default: sanitize via an extensive safe list of allowed elements
275
+ # safe_list_sanitizer.sanitize(@article.body)
96
276
  #
97
- # Safe list via the supplied tags and attributes
98
- # safe_list_sanitizer.sanitize(@article.body, tags: %w(table tr td),
99
- # attributes: %w(id class style))
277
+ # # sanitize via the supplied tags and attributes
278
+ # safe_list_sanitizer.sanitize(
279
+ # @article.body,
280
+ # tags: %w(table tr td),
281
+ # attributes: %w(id class style),
282
+ # )
100
283
  #
101
- # Safe list via a custom scrubber
102
- # safe_list_sanitizer.sanitize(@article.body, scrubber: ArticleScrubber.new)
103
- class SafeListSanitizer < Sanitizer
104
- class << self
105
- attr_accessor :allowed_tags
106
- attr_accessor :allowed_attributes
107
- end
108
- self.allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
109
- sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dl dt dd abbr
110
- acronym a img blockquote del ins))
111
- self.allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))
112
-
113
- def initialize(prune: false)
114
- @permit_scrubber = PermitScrubber.new(prune: prune)
115
- end
116
-
117
- def sanitize(html, options = {})
118
- return unless html
119
- return html if html.empty?
284
+ # # sanitize via a custom Loofah scrubber
285
+ # safe_list_sanitizer.sanitize(@article.body, scrubber: ArticleScrubber.new)
286
+ #
287
+ # # prune nodes from the tree instead of stripping tags and leaving inner content
288
+ # safe_list_sanitizer = Rails::HTML4::SafeListSanitizer.new(prune: true)
289
+ #
290
+ # # the sanitizer can also sanitize CSS
291
+ # safe_list_sanitizer.sanitize_css('background-color: #000;')
292
+ #
293
+ class SafeListSanitizer < Rails::HTML::Sanitizer
294
+ include HTML::Concern::ComposedSanitize
295
+ include HTML::Concern::Parser::HTML4
296
+ include HTML::Concern::Scrubber::SafeList
297
+ include HTML::Concern::Serializer::UTF8Encode
298
+ end
299
+ end
120
300
 
121
- loofah_fragment = Loofah.fragment(html)
301
+ module HTML5
302
+ class Sanitizer
303
+ class << self
304
+ def full_sanitizer
305
+ Rails::HTML5::FullSanitizer
306
+ end
122
307
 
123
- if scrubber = options[:scrubber]
124
- # No duck typing, Loofah ensures subclass of Loofah::Scrubber
125
- loofah_fragment.scrub!(scrubber)
126
- elsif allowed_tags(options) || allowed_attributes(options)
127
- @permit_scrubber.tags = allowed_tags(options)
128
- @permit_scrubber.attributes = allowed_attributes(options)
129
- loofah_fragment.scrub!(@permit_scrubber)
130
- else
131
- remove_xpaths(loofah_fragment, XPATHS_TO_REMOVE)
132
- loofah_fragment.scrub!(:strip)
308
+ def link_sanitizer
309
+ Rails::HTML5::LinkSanitizer
133
310
  end
134
311
 
135
- properly_encode(loofah_fragment, encoding: 'UTF-8')
136
- end
312
+ def safe_list_sanitizer
313
+ Rails::HTML5::SafeListSanitizer
314
+ end
137
315
 
138
- def sanitize_css(style_string)
139
- Loofah::HTML5::Scrub.scrub_css(style_string)
316
+ def white_list_sanitizer # :nodoc:
317
+ safe_list_sanitizer
318
+ end
140
319
  end
320
+ end
141
321
 
142
- private
322
+ # == Rails::HTML5::FullSanitizer
323
+ #
324
+ # Removes all tags from HTML5 but strips out scripts, forms and comments.
325
+ #
326
+ # full_sanitizer = Rails::HTML5::FullSanitizer.new
327
+ # full_sanitizer.sanitize("<b>Bold</b> no more! <a href='more.html'>See more here</a>...")
328
+ # # => "Bold no more! See more here..."
329
+ #
330
+ class FullSanitizer < Rails::HTML::Sanitizer
331
+ include HTML::Concern::ComposedSanitize
332
+ include HTML::Concern::Parser::HTML5
333
+ include HTML::Concern::Scrubber::Full
334
+ include HTML::Concern::Serializer::UTF8Encode
335
+ end
143
336
 
144
- def allowed_tags(options)
145
- options[:tags] || self.class.allowed_tags
146
- end
337
+ # == Rails::HTML5::LinkSanitizer
338
+ #
339
+ # Removes +a+ tags and +href+ attributes from HTML5 leaving only the link text.
340
+ #
341
+ # link_sanitizer = Rails::HTML5::LinkSanitizer.new
342
+ # link_sanitizer.sanitize('<a href="example.com">Only the link text will be kept.</a>')
343
+ # # => "Only the link text will be kept."
344
+ #
345
+ class LinkSanitizer < Rails::HTML::Sanitizer
346
+ include HTML::Concern::ComposedSanitize
347
+ include HTML::Concern::Parser::HTML5
348
+ include HTML::Concern::Scrubber::Link
349
+ include HTML::Concern::Serializer::UTF8Encode
350
+ end
147
351
 
148
- def allowed_attributes(options)
149
- options[:attributes] || self.class.allowed_attributes
150
- end
352
+ # == Rails::HTML5::SafeListSanitizer
353
+ #
354
+ # Sanitizes HTML5 and CSS from an extensive safe list.
355
+ #
356
+ # === Whitespace
357
+ #
358
+ # We can't make any guarantees about whitespace being kept or stripped. Loofah uses Nokogiri,
359
+ # which wraps either a C or Java parser for the respective Ruby implementation. Those two
360
+ # parsers determine how whitespace is ultimately handled.
361
+ #
362
+ # When the stripped markup is rendered, the user's browser won't take whitespace into account
363
+ # anyway. It might be better to suggest your users wrap their whitespace-sensitive content in
364
+ # pre tags or that you do so automatically.
365
+ #
366
+ # === Options
367
+ #
368
+ # Sanitizes both html and css via the safe lists found in
369
+ # Rails::HTML::Concern::Scrubber::SafeList
370
+ #
371
+ # SafeListSanitizer also accepts options to configure the safe list used when sanitizing html.
372
+ # There's a class level option:
373
+ #
374
+ # Rails::HTML5::SafeListSanitizer.allowed_tags = %w(table tr td)
375
+ # Rails::HTML5::SafeListSanitizer.allowed_attributes = %w(id class style)
376
+ #
377
+ # Tags and attributes can also be passed to +sanitize+. Passed options take precedence over the
378
+ # class level options.
379
+ #
380
+ # === Examples
381
+ #
382
+ # safe_list_sanitizer = Rails::HTML5::SafeListSanitizer.new
383
+ #
384
+ # # default: sanitize via an extensive safe list of allowed elements
385
+ # safe_list_sanitizer.sanitize(@article.body)
386
+ #
387
+ # # sanitize via the supplied tags and attributes
388
+ # safe_list_sanitizer.sanitize(
389
+ # @article.body,
390
+ # tags: %w(table tr td),
391
+ # attributes: %w(id class style),
392
+ # )
393
+ #
394
+ # # sanitize via a custom Loofah scrubber
395
+ # safe_list_sanitizer.sanitize(@article.body, scrubber: ArticleScrubber.new)
396
+ #
397
+ # # prune nodes from the tree instead of stripping tags and leaving inner content
398
+ # safe_list_sanitizer = Rails::HTML5::SafeListSanitizer.new(prune: true)
399
+ #
400
+ # # the sanitizer can also sanitize CSS
401
+ # safe_list_sanitizer.sanitize_css('background-color: #000;')
402
+ #
403
+ class SafeListSanitizer < Rails::HTML::Sanitizer
404
+ include HTML::Concern::ComposedSanitize
405
+ include HTML::Concern::Parser::HTML5
406
+ include HTML::Concern::Scrubber::SafeList
407
+ include HTML::Concern::Serializer::UTF8Encode
151
408
  end
409
+ end if Rails::HTML::Sanitizer.html5_support?
152
410
 
153
- WhiteListSanitizer = SafeListSanitizer
411
+ module HTML
412
+ Sanitizer.extend(HTML4::Sanitizer::VendorMethods) # :nodoc:
413
+ FullSanitizer = HTML4::FullSanitizer # :nodoc:
414
+ LinkSanitizer = HTML4::LinkSanitizer # :nodoc:
415
+ SafeListSanitizer = HTML4::SafeListSanitizer # :nodoc:
416
+ WhiteListSanitizer = SafeListSanitizer # :nodoc:
154
417
  end
155
418
  end