loofah 0.4.2 → 2.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +604 -0
  3. data/MIT-LICENSE.txt +3 -1
  4. data/README.md +410 -0
  5. data/SECURITY.md +18 -0
  6. data/lib/loofah/concerns.rb +207 -0
  7. data/lib/loofah/elements.rb +98 -0
  8. data/lib/loofah/helpers.rb +91 -4
  9. data/lib/loofah/html4/document.rb +17 -0
  10. data/lib/loofah/html4/document_fragment.rb +15 -0
  11. data/lib/loofah/html5/document.rb +17 -0
  12. data/lib/loofah/html5/document_fragment.rb +15 -0
  13. data/lib/loofah/html5/libxml2_workarounds.rb +28 -0
  14. data/lib/loofah/html5/safelist.rb +1058 -0
  15. data/lib/loofah/html5/scrub.rb +211 -40
  16. data/lib/loofah/metahelpers.rb +18 -0
  17. data/lib/loofah/scrubber.rb +31 -13
  18. data/lib/loofah/scrubbers.rb +262 -31
  19. data/lib/loofah/version.rb +6 -0
  20. data/lib/loofah/xml/document.rb +2 -0
  21. data/lib/loofah/xml/document_fragment.rb +6 -9
  22. data/lib/loofah.rb +131 -52
  23. metadata +79 -158
  24. data/CHANGELOG.rdoc +0 -92
  25. data/DEPRECATED.rdoc +0 -12
  26. data/Manifest.txt +0 -34
  27. data/README.rdoc +0 -330
  28. data/Rakefile +0 -61
  29. data/TODO.rdoc +0 -4
  30. data/benchmark/benchmark.rb +0 -149
  31. data/benchmark/fragment.html +0 -96
  32. data/benchmark/helper.rb +0 -73
  33. data/benchmark/www.slashdot.com.html +0 -2560
  34. data/init.rb +0 -1
  35. data/lib/loofah/active_record.rb +0 -62
  36. data/lib/loofah/html/document.rb +0 -22
  37. data/lib/loofah/html/document_fragment.rb +0 -46
  38. data/lib/loofah/html5/whitelist.rb +0 -174
  39. data/lib/loofah/instance_methods.rb +0 -77
  40. data/lib/loofah/xss_foliate.rb +0 -212
  41. data/test/helper.rb +0 -8
  42. data/test/html5/test_sanitizer.rb +0 -248
  43. data/test/test_active_record.rb +0 -146
  44. data/test/test_ad_hoc.rb +0 -272
  45. data/test/test_api.rb +0 -128
  46. data/test/test_helpers.rb +0 -28
  47. data/test/test_scrubber.rb +0 -227
  48. data/test/test_scrubbers.rb +0 -144
  49. data/test/test_xss_foliate.rb +0 -171
  50. data.tar.gz.sig +0 -0
  51. metadata.gz.sig +0 -2
@@ -1,7 +1,9 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Loofah
2
4
  #
3
5
  # Loofah provides some built-in scrubbers for sanitizing with
4
- # HTML5lib's whitelist and for accomplishing some common
6
+ # HTML5lib's safelist and for accomplishing some common
5
7
  # transformation tasks.
6
8
  #
7
9
  #
@@ -10,7 +12,7 @@ module Loofah
10
12
  # +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
11
13
  #
12
14
  # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
13
- # Loofah.fragment(unsafe_html).scrub!(:strip)
15
+ # Loofah.html5_fragment(unsafe_html).scrub!(:strip)
14
16
  # => "ohai! <div>div is safe</div> but foo is <b>not</b>"
15
17
  #
16
18
  #
@@ -19,7 +21,7 @@ module Loofah
19
21
  # +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
20
22
  #
21
23
  # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
22
- # Loofah.fragment(unsafe_html).scrub!(:prune)
24
+ # Loofah.html5_fragment(unsafe_html).scrub!(:prune)
23
25
  # => "ohai! <div>div is safe</div> "
24
26
  #
25
27
  #
@@ -28,7 +30,7 @@ module Loofah
28
30
  # +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
29
31
  #
30
32
  # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
31
- # Loofah.fragment(unsafe_html).scrub!(:escape)
33
+ # Loofah.html5_fragment(unsafe_html).scrub!(:escape)
32
34
  # => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
33
35
  #
34
36
  #
@@ -40,7 +42,7 @@ module Loofah
40
42
  # layer of paint on top of the HTML input to make it look nice.
41
43
  #
42
44
  # messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
43
- # Loofah.fragment(messy_markup).scrub!(:whitewash)
45
+ # Loofah.html5_fragment(messy_markup).scrub!(:whitewash)
44
46
  # => "ohai! <div>div with attributes</div>"
45
47
  #
46
48
  # One use case for this scrubber is to clean up HTML that was
@@ -55,30 +57,71 @@ module Loofah
55
57
  # +:nofollow+ adds a rel="nofollow" attribute to all links
56
58
  #
57
59
  # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
58
- # Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
60
+ # Loofah.html5_fragment(link_farmers_markup).scrub!(:nofollow)
59
61
  # => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
60
62
  #
61
63
  #
64
+ # === Loofah::Scrubbers::TargetBlank / scrub!(:targetblank)
65
+ #
66
+ # +:targetblank+ adds a target="_blank" attribute to all links
67
+ #
68
+ # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
69
+ # Loofah.html5_fragment(link_farmers_markup).scrub!(:targetblank)
70
+ # => "ohai! <a href='http://www.myswarmysite.com/' target="_blank">I like your blog post</a>"
71
+ #
72
+ #
73
+ # === Loofah::Scrubbers::NoOpener / scrub!(:noopener)
74
+ #
75
+ # +:noopener+ adds a rel="noopener" attribute to all links
76
+ #
77
+ # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
78
+ # Loofah.html5_fragment(link_farmers_markup).scrub!(:noopener)
79
+ # => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
80
+ #
81
+ # === Loofah::Scrubbers::NoReferrer / scrub!(:noreferrer)
82
+ #
83
+ # +:noreferrer+ adds a rel="noreferrer" attribute to all links
84
+ #
85
+ # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
86
+ # Loofah.html5_fragment(link_farmers_markup).scrub!(:noreferrer)
87
+ # => "ohai! <a href='http://www.myswarmysite.com/' rel="noreferrer">I like your blog post</a>"
88
+ #
89
+ #
90
+ # === Loofah::Scrubbers::Unprintable / scrub!(:unprintable)
91
+ #
92
+ # +:unprintable+ removes unprintable Unicode characters.
93
+ #
94
+ # markup = "<p>Some text with an unprintable character at the end\u2028</p>"
95
+ # Loofah.html5_fragment(markup).scrub!(:unprintable)
96
+ # => "<p>Some text with an unprintable character at the end</p>"
97
+ #
98
+ # You may not be able to see the unprintable character in the above example, but there is a
99
+ # U+2028 character right before the closing </p> tag. These characters can cause issues if
100
+ # the content is ever parsed by JavaScript - more information here:
101
+ #
102
+ # http://timelessrepo.com/json-isnt-a-javascript-subset
103
+ #
62
104
  module Scrubbers
63
-
64
105
  #
65
106
  # === scrub!(:strip)
66
107
  #
67
108
  # +:strip+ removes unknown/unsafe tags, but leaves behind the pristine contents:
68
109
  #
69
110
  # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
70
- # Loofah.fragment(unsafe_html).scrub!(:strip)
111
+ # Loofah.html5_fragment(unsafe_html).scrub!(:strip)
71
112
  # => "ohai! <div>div is safe</div> but foo is <b>not</b>"
72
113
  #
73
114
  class Strip < Scrubber
74
- def initialize
115
+ def initialize # rubocop:disable Lint/MissingSuper
75
116
  @direction = :bottom_up
76
117
  end
77
118
 
78
119
  def scrub(node)
79
120
  return CONTINUE if html5lib_sanitize(node) == CONTINUE
80
- node.before node.inner_html
121
+
122
+ node.before(node.children)
81
123
  node.remove
124
+ STOP
82
125
  end
83
126
  end
84
127
 
@@ -88,18 +131,19 @@ module Loofah
88
131
  # +:prune+ removes unknown/unsafe tags and their contents (including their subtrees):
89
132
  #
90
133
  # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
91
- # Loofah.fragment(unsafe_html).scrub!(:prune)
134
+ # Loofah.html5_fragment(unsafe_html).scrub!(:prune)
92
135
  # => "ohai! <div>div is safe</div> "
93
136
  #
94
137
  class Prune < Scrubber
95
- def initialize
138
+ def initialize # rubocop:disable Lint/MissingSuper
96
139
  @direction = :top_down
97
140
  end
98
141
 
99
142
  def scrub(node)
100
143
  return CONTINUE if html5lib_sanitize(node) == CONTINUE
144
+
101
145
  node.remove
102
- return STOP
146
+ STOP
103
147
  end
104
148
  end
105
149
 
@@ -109,20 +153,20 @@ module Loofah
109
153
  # +:escape+ performs HTML entity escaping on the unknown/unsafe tags:
110
154
  #
111
155
  # unsafe_html = "ohai! <div>div is safe</div> <foo>but foo is <b>not</b></foo>"
112
- # Loofah.fragment(unsafe_html).scrub!(:escape)
156
+ # Loofah.html5_fragment(unsafe_html).scrub!(:escape)
113
157
  # => "ohai! <div>div is safe</div> &lt;foo&gt;but foo is &lt;b&gt;not&lt;/b&gt;&lt;/foo&gt;"
114
158
  #
115
159
  class Escape < Scrubber
116
- def initialize
160
+ def initialize # rubocop:disable Lint/MissingSuper
117
161
  @direction = :top_down
118
162
  end
119
163
 
120
164
  def scrub(node)
121
165
  return CONTINUE if html5lib_sanitize(node) == CONTINUE
122
- replacement_killer = Nokogiri::XML::Text.new(node.to_s, node.document)
123
- node.add_next_sibling replacement_killer
166
+
167
+ node.add_next_sibling(Nokogiri::XML::Text.new(node.to_s, node.document))
124
168
  node.remove
125
- return STOP
169
+ STOP
126
170
  end
127
171
  end
128
172
 
@@ -135,7 +179,7 @@ module Loofah
135
179
  # layer of paint on top of the HTML input to make it look nice.
136
180
  #
137
181
  # messy_markup = "ohai! <div id='foo' class='bar' style='margin: 10px'>div with attributes</div>"
138
- # Loofah.fragment(messy_markup).scrub!(:whitewash)
182
+ # Loofah.html5_fragment(messy_markup).scrub!(:whitewash)
139
183
  # => "ohai! <div>div with attributes</div>"
140
184
  #
141
185
  # One use case for this scrubber is to clean up HTML that was
@@ -145,14 +189,14 @@ module Loofah
145
189
  # Certainly not me.
146
190
  #
147
191
  class Whitewash < Scrubber
148
- def initialize
192
+ def initialize # rubocop:disable Lint/MissingSuper
149
193
  @direction = :top_down
150
194
  end
151
195
 
152
196
  def scrub(node)
153
197
  case node.type
154
198
  when Nokogiri::XML::Node::ELEMENT_NODE
155
- if HTML5::HashedWhiteList::ALLOWED_ELEMENTS_WITH_LIBXML2[node.name]
199
+ if HTML5::Scrub.allowed_element?(node.name)
156
200
  node.attributes.each { |attr| node.remove_attribute(attr.first) }
157
201
  return CONTINUE if node.namespaces.empty?
158
202
  end
@@ -170,30 +214,217 @@ module Loofah
170
214
  # +:nofollow+ adds a rel="nofollow" attribute to all links
171
215
  #
172
216
  # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
173
- # Loofah.fragment(link_farmers_markup).scrub!(:nofollow)
217
+ # Loofah.html5_fragment(link_farmers_markup).scrub!(:nofollow)
174
218
  # => "ohai! <a href='http://www.myswarmysite.com/' rel="nofollow">I like your blog post</a>"
175
219
  #
176
220
  class NoFollow < Scrubber
177
- def initialize
221
+ def initialize # rubocop:disable Lint/MissingSuper
222
+ @direction = :top_down
223
+ end
224
+
225
+ def scrub(node)
226
+ return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
227
+
228
+ append_attribute(node, "rel", "nofollow")
229
+ STOP
230
+ end
231
+ end
232
+
233
+ #
234
+ # === scrub!(:targetblank)
235
+ #
236
+ # +:targetblank+ adds a target="_blank" attribute to all links.
237
+ # If there is a target already set, replaces it with target="_blank".
238
+ #
239
+ # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
240
+ # Loofah.html5_fragment(link_farmers_markup).scrub!(:targetblank)
241
+ # => "ohai! <a href='http://www.myswarmysite.com/' target="_blank">I like your blog post</a>"
242
+ #
243
+ # On modern browsers, setting target="_blank" on anchor elements implicitly provides the same
244
+ # behavior as setting rel="noopener".
245
+ #
246
+ class TargetBlank < Scrubber
247
+ def initialize # rubocop:disable Lint/MissingSuper
248
+ @direction = :top_down
249
+ end
250
+
251
+ def scrub(node)
252
+ return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
253
+
254
+ href = node["href"]
255
+
256
+ node.set_attribute("target", "_blank") if href && href[0] != "#"
257
+
258
+ STOP
259
+ end
260
+ end
261
+
262
+ #
263
+ # === scrub!(:noopener)
264
+ #
265
+ # +:noopener+ adds a rel="noopener" attribute to all links
266
+ #
267
+ # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
268
+ # Loofah.html5_fragment(link_farmers_markup).scrub!(:noopener)
269
+ # => "ohai! <a href='http://www.myswarmysite.com/' rel="noopener">I like your blog post</a>"
270
+ #
271
+ class NoOpener < Scrubber
272
+ def initialize # rubocop:disable Lint/MissingSuper
273
+ @direction = :top_down
274
+ end
275
+
276
+ def scrub(node)
277
+ return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
278
+
279
+ append_attribute(node, "rel", "noopener")
280
+ STOP
281
+ end
282
+ end
283
+
284
+ #
285
+ # === scrub!(:noreferrer)
286
+ #
287
+ # +:noreferrer+ adds a rel="noreferrer" attribute to all links
288
+ #
289
+ # link_farmers_markup = "ohai! <a href='http://www.myswarmysite.com/'>I like your blog post</a>"
290
+ # Loofah.html5_fragment(link_farmers_markup).scrub!(:noreferrer)
291
+ # => "ohai! <a href='http://www.myswarmysite.com/' rel="noreferrer">I like your blog post</a>"
292
+ #
293
+ class NoReferrer < Scrubber
294
+ def initialize # rubocop:disable Lint/MissingSuper
295
+ @direction = :top_down
296
+ end
297
+
298
+ def scrub(node)
299
+ return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "a")
300
+
301
+ append_attribute(node, "rel", "noreferrer")
302
+ STOP
303
+ end
304
+ end
305
+
306
+ # This class probably isn't useful publicly, but is used for #to_text's current implementation
307
+ class NewlineBlockElements < Scrubber # :nodoc:
308
+ def initialize # rubocop:disable Lint/MissingSuper
309
+ @direction = :bottom_up
310
+ end
311
+
312
+ def scrub(node)
313
+ return CONTINUE unless Loofah::Elements::LINEBREAKERS.include?(node.name)
314
+
315
+ replacement = if Loofah::Elements::INLINE_LINE_BREAK.include?(node.name)
316
+ "\n"
317
+ else
318
+ "\n#{node.content}\n"
319
+ end
320
+ node.add_next_sibling(Nokogiri::XML::Text.new(replacement, node.document))
321
+ node.remove
322
+ end
323
+ end
324
+
325
+ #
326
+ # === scrub!(:unprintable)
327
+ #
328
+ # +:unprintable+ removes unprintable Unicode characters.
329
+ #
330
+ # markup = "<p>Some text with an unprintable character at the end\u2028</p>"
331
+ # Loofah.html5_fragment(markup).scrub!(:unprintable)
332
+ # => "<p>Some text with an unprintable character at the end</p>"
333
+ #
334
+ # You may not be able to see the unprintable character in the above example, but there is a
335
+ # U+2028 character right before the closing </p> tag. These characters can cause issues if
336
+ # the content is ever parsed by JavaScript - more information here:
337
+ #
338
+ # http://timelessrepo.com/json-isnt-a-javascript-subset
339
+ #
340
+ class Unprintable < Scrubber
341
+ def initialize # rubocop:disable Lint/MissingSuper
178
342
  @direction = :top_down
179
343
  end
180
344
 
181
345
  def scrub(node)
182
- return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == 'a')
183
- node.set_attribute('rel', 'nofollow')
184
- return STOP
346
+ if node.type == Nokogiri::XML::Node::TEXT_NODE || node.type == Nokogiri::XML::Node::CDATA_SECTION_NODE
347
+ node.content = node.content.gsub(/\u2028|\u2029/, "")
348
+ end
349
+ CONTINUE
185
350
  end
186
351
  end
187
352
 
353
+ #
354
+ # === scrub!(:double_breakpoint)
355
+ #
356
+ # +:double_breakpoint+ replaces double-break tags with closing/opening paragraph tags.
357
+ #
358
+ # markup = "<p>Some text here in a logical paragraph.<br><br>Some more text, apparently a second paragraph.</p>"
359
+ # Loofah.html5_fragment(markup).scrub!(:double_breakpoint)
360
+ # => "<p>Some text here in a logical paragraph.</p><p>Some more text, apparently a second paragraph.</p>"
361
+ #
362
+ class DoubleBreakpoint < Scrubber
363
+ def initialize # rubocop:disable Lint/MissingSuper
364
+ @direction = :top_down
365
+ end
366
+
367
+ def scrub(node)
368
+ return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "p")
369
+
370
+ paragraph_with_break_point_nodes = node.xpath("//p[br[following-sibling::br]]")
371
+
372
+ paragraph_with_break_point_nodes.each do |paragraph_node|
373
+ new_paragraph = paragraph_node.add_previous_sibling("<p>").first
374
+
375
+ paragraph_node.children.each do |child|
376
+ remove_blank_text_nodes(child)
377
+ end
378
+
379
+ paragraph_node.children.each do |child|
380
+ # already unlinked
381
+ next if child.parent.nil?
382
+
383
+ if child.name == "br" && child.next_sibling.name == "br"
384
+ new_paragraph = paragraph_node.add_previous_sibling("<p>").first
385
+ child.next_sibling.unlink
386
+ child.unlink
387
+ else
388
+ child.parent = new_paragraph
389
+ end
390
+ end
391
+
392
+ paragraph_node.unlink
393
+ end
394
+
395
+ CONTINUE
396
+ end
397
+
398
+ private
399
+
400
+ def remove_blank_text_nodes(node)
401
+ node.unlink if node.text? && node.blank?
402
+ end
403
+ end
188
404
  #
189
405
  # A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
190
406
  #
191
407
  MAP = {
192
- :escape => Escape,
193
- :prune => Prune,
194
- :whitewash => Whitewash,
195
- :strip => Strip,
196
- :nofollow => NoFollow
408
+ escape: Escape,
409
+ prune: Prune,
410
+ whitewash: Whitewash,
411
+ strip: Strip,
412
+ nofollow: NoFollow,
413
+ noopener: NoOpener,
414
+ noreferrer: NoReferrer,
415
+ targetblank: TargetBlank,
416
+ newline_block_elements: NewlineBlockElements,
417
+ unprintable: Unprintable,
418
+ double_breakpoint: DoubleBreakpoint,
197
419
  }
420
+
421
+ class << self
422
+ #
423
+ # Returns an array of symbols representing the built-in scrubbers
424
+ #
425
+ def scrubber_symbols
426
+ MAP.keys
427
+ end
428
+ end
198
429
  end
199
430
  end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Loofah
4
+ # The version of Loofah you are using
5
+ VERSION = "2.25.0"
6
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Loofah
2
4
  module XML # :nodoc:
3
5
  #
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Loofah
2
4
  module XML # :nodoc:
3
5
  #
@@ -6,16 +8,11 @@ module Loofah
6
8
  # See Loofah::ScrubBehavior for additional methods.
7
9
  #
8
10
  class DocumentFragment < Nokogiri::XML::DocumentFragment
9
- include Loofah::ScrubBehavior::Node
10
-
11
11
  class << self
12
- #
13
- # Overridden Nokogiri::XML::DocumentFragment
14
- # constructor. Applications should use Loofah.fragment to
15
- # parse a fragment.
16
- #
17
- def parse tags
18
- self.new(Loofah::XML::Document.new, tags)
12
+ def parse(tags)
13
+ doc = Loofah::XML::Document.new
14
+ doc.encoding = tags.encoding.name if tags.respond_to?(:encoding)
15
+ new(doc, tags)
19
16
  end
20
17
  end
21
18
  end
data/lib/loofah.rb CHANGED
@@ -1,66 +1,155 @@
1
- $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
1
+ # frozen_string_literal: true
2
2
 
3
- require 'nokogiri'
3
+ require "nokogiri"
4
4
 
5
- require 'loofah/html5/whitelist'
6
- require 'loofah/html5/scrub'
5
+ module Loofah
6
+ class << self
7
+ def html5_support?
8
+ # Note that Loofah can only support HTML5 in Nokogiri >= 1.14.0 because it requires the
9
+ # subclassing fix from https://github.com/sparklemotion/nokogiri/pull/2534
10
+ return @html5_support if defined? @html5_support
11
+
12
+ @html5_support =
13
+ Gem::Version.new(Nokogiri::VERSION) > Gem::Version.new("1.14.0") &&
14
+ Nokogiri.uses_gumbo?
15
+ end
16
+ end
17
+ end
18
+
19
+ require_relative "loofah/version"
20
+ require_relative "loofah/metahelpers"
21
+ require_relative "loofah/elements"
7
22
 
8
- require 'loofah/scrubber'
9
- require 'loofah/scrubbers'
23
+ require_relative "loofah/html5/safelist"
24
+ require_relative "loofah/html5/libxml2_workarounds"
25
+ require_relative "loofah/html5/scrub"
10
26
 
11
- require 'loofah/instance_methods'
12
- require 'loofah/xml/document'
13
- require 'loofah/xml/document_fragment'
14
- require 'loofah/html/document'
15
- require 'loofah/html/document_fragment'
27
+ require_relative "loofah/scrubber"
28
+ require_relative "loofah/scrubbers"
16
29
 
17
- require 'loofah/helpers'
30
+ require_relative "loofah/concerns"
31
+ require_relative "loofah/xml/document"
32
+ require_relative "loofah/xml/document_fragment"
33
+ require_relative "loofah/html4/document"
34
+ require_relative "loofah/html4/document_fragment"
35
+
36
+ if Loofah.html5_support?
37
+ require_relative "loofah/html5/document"
38
+ require_relative "loofah/html5/document_fragment"
39
+ end
18
40
 
19
41
  # == Strings and IO Objects as Input
20
42
  #
21
- # Loofah.document and Loofah.fragment accept any IO object in addition
22
- # to accepting a string. That IO object could be a file, or a socket,
23
- # or a StringIO, or anything that responds to +read+ and
24
- # +close+. Which makes it particularly easy to sanitize mass
25
- # quantities of docs.
43
+ # The following methods accept any IO object in addition to accepting a string:
44
+ #
45
+ # - Loofah.html4_document
46
+ # - Loofah.html4_fragment
47
+ # - Loofah.scrub_html4_document
48
+ # - Loofah.scrub_html4_fragment
49
+ #
50
+ # - Loofah.html5_document
51
+ # - Loofah.html5_fragment
52
+ # - Loofah.scrub_html5_document
53
+ # - Loofah.scrub_html5_fragment
54
+ #
55
+ # - Loofah.xml_document
56
+ # - Loofah.xml_fragment
57
+ # - Loofah.scrub_xml_document
58
+ # - Loofah.scrub_xml_fragment
59
+ #
60
+ # - Loofah.document
61
+ # - Loofah.fragment
62
+ # - Loofah.scrub_document
63
+ # - Loofah.scrub_fragment
64
+ #
65
+ # That IO object could be a file, or a socket, or a StringIO, or anything that responds to +read+
66
+ # and +close+.
26
67
  #
27
68
  module Loofah
28
- # The version of Loofah you are using
29
- VERSION = '0.4.2'
30
-
31
- # The minimum required version of Nokogiri
32
- REQUIRED_NOKOGIRI_VERSION = '1.3.3'
69
+ # Alias for Loofah::HTML4
70
+ HTML = HTML4
33
71
 
34
72
  class << self
35
- # Shortcut for Loofah::HTML::Document.parse
36
- # This method accepts the same parameters as Nokogiri::HTML::Document.parse
37
- def document(*args, &block)
38
- Loofah::HTML::Document.parse(*args, &block)
73
+ # Shortcut for Loofah::HTML4::Document.parse(*args, &block)
74
+ #
75
+ # This method accepts the same parameters as Nokogiri::HTML4::Document.parse
76
+ def html4_document(*args, &block)
77
+ Loofah::HTML4::Document.parse(*args, &block)
78
+ end
79
+
80
+ # Shortcut for Loofah::HTML4::DocumentFragment.parse(*args, &block)
81
+ #
82
+ # This method accepts the same parameters as Nokogiri::HTML4::DocumentFragment.parse
83
+ def html4_fragment(*args, &block)
84
+ Loofah::HTML4::DocumentFragment.parse(*args, &block)
39
85
  end
40
86
 
41
- # Shortcut for Loofah::HTML::DocumentFragment.parse
42
- # This method accepts the same parameters as Nokogiri::HTML::DocumentFragment.parse
43
- def fragment(*args, &block)
44
- Loofah::HTML::DocumentFragment.parse(*args, &block)
87
+ # Shortcut for Loofah::HTML4::Document.parse(string_or_io).scrub!(method)
88
+ def scrub_html4_document(string_or_io, method)
89
+ Loofah::HTML4::Document.parse(string_or_io).scrub!(method)
45
90
  end
46
91
 
47
- # Shortcut for Loofah.fragment(string_or_io).scrub!(method)
48
- def scrub_fragment(string_or_io, method)
49
- Loofah.fragment(string_or_io).scrub!(method)
92
+ # Shortcut for Loofah::HTML4::DocumentFragment.parse(string_or_io).scrub!(method)
93
+ def scrub_html4_fragment(string_or_io, method)
94
+ Loofah::HTML4::DocumentFragment.parse(string_or_io).scrub!(method)
50
95
  end
51
96
 
52
- # Shortcut for Loofah.document(string_or_io).scrub!(method)
53
- def scrub_document(string_or_io, method)
54
- Loofah.document(string_or_io).scrub!(method)
97
+ if Loofah.html5_support?
98
+ # Shortcut for Loofah::HTML5::Document.parse(*args, &block)
99
+ #
100
+ # This method accepts the same parameters as Nokogiri::HTML5::Document.parse
101
+ def html5_document(*args, &block)
102
+ Loofah::HTML5::Document.parse(*args, &block)
103
+ end
104
+
105
+ # Shortcut for Loofah::HTML5::DocumentFragment.parse(*args, &block)
106
+ #
107
+ # This method accepts the same parameters as Nokogiri::HTML5::DocumentFragment.parse
108
+ def html5_fragment(*args, &block)
109
+ Loofah::HTML5::DocumentFragment.parse(*args, &block)
110
+ end
111
+
112
+ # Shortcut for Loofah::HTML5::Document.parse(string_or_io).scrub!(method)
113
+ def scrub_html5_document(string_or_io, method)
114
+ Loofah::HTML5::Document.parse(string_or_io).scrub!(method)
115
+ end
116
+
117
+ # Shortcut for Loofah::HTML5::DocumentFragment.parse(string_or_io).scrub!(method)
118
+ def scrub_html5_fragment(string_or_io, method)
119
+ Loofah::HTML5::DocumentFragment.parse(string_or_io).scrub!(method)
120
+ end
121
+ else
122
+ def html5_document(*args, &block)
123
+ raise NotImplementedError, "Loofah::HTML5 is not supported by your version of Nokogiri"
124
+ end
125
+
126
+ def html5_fragment(*args, &block)
127
+ raise NotImplementedError, "Loofah::HTML5 is not supported by your version of Nokogiri"
128
+ end
129
+
130
+ def scrub_html5_document(string_or_io, method)
131
+ raise NotImplementedError, "Loofah::HTML5 is not supported by your version of Nokogiri"
132
+ end
133
+
134
+ def scrub_html5_fragment(string_or_io, method)
135
+ raise NotImplementedError, "Loofah::HTML5 is not supported by your version of Nokogiri"
136
+ end
55
137
  end
56
138
 
57
- # Shortcut for Loofah::XML::Document.parse
139
+ alias_method :document, :html4_document
140
+ alias_method :fragment, :html4_fragment
141
+ alias_method :scrub_document, :scrub_html4_document
142
+ alias_method :scrub_fragment, :scrub_html4_fragment
143
+
144
+ # Shortcut for Loofah::XML::Document.parse(*args, &block)
145
+ #
58
146
  # This method accepts the same parameters as Nokogiri::XML::Document.parse
59
147
  def xml_document(*args, &block)
60
148
  Loofah::XML::Document.parse(*args, &block)
61
149
  end
62
150
 
63
- # Shortcut for Loofah::XML::DocumentFragment.parse
151
+ # Shortcut for Loofah::XML::DocumentFragment.parse(*args, &block)
152
+ #
64
153
  # This method accepts the same parameters as Nokogiri::XML::DocumentFragment.parse
65
154
  def xml_fragment(*args, &block)
66
155
  Loofah::XML::DocumentFragment.parse(*args, &block)
@@ -76,19 +165,9 @@ module Loofah
76
165
  Loofah.xml_document(string_or_io).scrub!(method)
77
166
  end
78
167
 
168
+ # A helper to remove extraneous whitespace from text-ified HTML
169
+ def remove_extraneous_whitespace(string)
170
+ string.gsub(/\n\s*\n\s*\n/, "\n\n")
171
+ end
79
172
  end
80
173
  end
81
-
82
- if Nokogiri::VERSION < Loofah::REQUIRED_NOKOGIRI_VERSION
83
- raise RuntimeError, "Loofah requires Nokogiri #{Loofah::REQUIRED_NOKOGIRI_VERSION} or later (currently #{Nokogiri::VERSION})"
84
- end
85
-
86
- if defined? Rails.configuration and Rails.configuration.frameworks.include?([:active_record]) # rails 2.1 and later
87
- Rails.configuration.after_initialize do
88
- require 'loofah/active_record'
89
- require 'loofah/xss_foliate'
90
- end
91
- elsif defined? ActiveRecord::Base # rails 2.0
92
- require 'loofah/active_record'
93
- require 'loofah/xss_foliate'
94
- end