loofah 2.2.3 → 2.21.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +269 -31
  3. data/README.md +109 -124
  4. data/lib/loofah/concerns.rb +207 -0
  5. data/lib/loofah/elements.rb +85 -79
  6. data/lib/loofah/helpers.rb +37 -20
  7. data/lib/loofah/{html → html4}/document.rb +6 -7
  8. data/lib/loofah/html4/document_fragment.rb +15 -0
  9. data/lib/loofah/html5/document.rb +17 -0
  10. data/lib/loofah/html5/document_fragment.rb +15 -0
  11. data/lib/loofah/html5/libxml2_workarounds.rb +10 -8
  12. data/lib/loofah/html5/safelist.rb +1055 -0
  13. data/lib/loofah/html5/scrub.rb +153 -58
  14. data/lib/loofah/metahelpers.rb +11 -6
  15. data/lib/loofah/scrubber.rb +22 -15
  16. data/lib/loofah/scrubbers.rb +66 -55
  17. data/lib/loofah/version.rb +6 -0
  18. data/lib/loofah/xml/document.rb +2 -0
  19. data/lib/loofah/xml/document_fragment.rb +4 -7
  20. data/lib/loofah.rb +131 -38
  21. metadata +28 -216
  22. data/.gemtest +0 -0
  23. data/Gemfile +0 -22
  24. data/Manifest.txt +0 -40
  25. data/Rakefile +0 -79
  26. data/benchmark/benchmark.rb +0 -149
  27. data/benchmark/fragment.html +0 -96
  28. data/benchmark/helper.rb +0 -73
  29. data/benchmark/www.slashdot.com.html +0 -2560
  30. data/lib/loofah/html/document_fragment.rb +0 -40
  31. data/lib/loofah/html5/whitelist.rb +0 -186
  32. data/lib/loofah/instance_methods.rb +0 -127
  33. data/test/assets/msword.html +0 -63
  34. data/test/assets/testdata_sanitizer_tests1.dat +0 -502
  35. data/test/helper.rb +0 -18
  36. data/test/html5/test_sanitizer.rb +0 -382
  37. data/test/integration/test_ad_hoc.rb +0 -204
  38. data/test/integration/test_helpers.rb +0 -43
  39. data/test/integration/test_html.rb +0 -72
  40. data/test/integration/test_scrubbers.rb +0 -400
  41. data/test/integration/test_xml.rb +0 -55
  42. data/test/unit/test_api.rb +0 -142
  43. data/test/unit/test_encoding.rb +0 -20
  44. data/test/unit/test_helpers.rb +0 -62
  45. data/test/unit/test_scrubber.rb +0 -229
  46. data/test/unit/test_scrubbers.rb +0 -14
@@ -1,40 +0,0 @@
1
- module Loofah
2
- module HTML # :nodoc:
3
- #
4
- # Subclass of Nokogiri::HTML::DocumentFragment.
5
- #
6
- # See Loofah::ScrubBehavior and Loofah::TextBehavior for additional methods.
7
- #
8
- class DocumentFragment < Nokogiri::HTML::DocumentFragment
9
- include Loofah::TextBehavior
10
-
11
- class << self
12
- #
13
- # Overridden Nokogiri::HTML::DocumentFragment
14
- # constructor. Applications should use Loofah.fragment to
15
- # parse a fragment.
16
- #
17
- def parse tags, encoding = nil
18
- doc = Loofah::HTML::Document.new
19
-
20
- encoding ||= tags.respond_to?(:encoding) ? tags.encoding.name : 'UTF-8'
21
- doc.encoding = encoding
22
-
23
- new(doc, tags)
24
- end
25
- end
26
-
27
- #
28
- # Returns the HTML markup contained by the fragment
29
- #
30
- def to_s
31
- serialize_root.children.to_s
32
- end
33
- alias :serialize :to_s
34
-
35
- def serialize_root
36
- at_xpath("./body") || self
37
- end
38
- end
39
- end
40
- end
@@ -1,186 +0,0 @@
1
- require 'set'
2
-
3
- module Loofah
4
- module HTML5 # :nodoc:
5
- #
6
- # HTML whitelist lifted from HTML5lib sanitizer code:
7
- #
8
- # http://code.google.com/p/html5lib/
9
- #
10
- # <html5_license>
11
- #
12
- # Copyright (c) 2006-2008 The Authors
13
- #
14
- # Contributors:
15
- # James Graham - jg307@cam.ac.uk
16
- # Anne van Kesteren - annevankesteren@gmail.com
17
- # Lachlan Hunt - lachlan.hunt@lachy.id.au
18
- # Matt McDonald - kanashii@kanashii.ca
19
- # Sam Ruby - rubys@intertwingly.net
20
- # Ian Hickson (Google) - ian@hixie.ch
21
- # Thomas Broyer - t.broyer@ltgt.net
22
- # Jacques Distler - distler@golem.ph.utexas.edu
23
- # Henri Sivonen - hsivonen@iki.fi
24
- # The Mozilla Foundation (contributions from Henri Sivonen since 2008)
25
- #
26
- # Permission is hereby granted, free of charge, to any person
27
- # obtaining a copy of this software and associated documentation
28
- # files (the "Software"), to deal in the Software without
29
- # restriction, including without limitation the rights to use, copy,
30
- # modify, merge, publish, distribute, sublicense, and/or sell copies
31
- # of the Software, and to permit persons to whom the Software is
32
- # furnished to do so, subject to the following conditions:
33
- #
34
- # The above copyright notice and this permission notice shall be
35
- # included in all copies or substantial portions of the Software.
36
- #
37
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
38
- # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
39
- # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
40
- # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
41
- # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
42
- # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
43
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
44
- # DEALINGS IN THE SOFTWARE.
45
- #
46
- # </html5_license>
47
- module WhiteList
48
-
49
- ACCEPTABLE_ELEMENTS = Set.new %w[a abbr acronym address area
50
- article aside audio b bdi bdo big blockquote br button canvas
51
- caption center cite code col colgroup command datalist dd del
52
- details dfn dir div dl dt em fieldset figcaption figure footer
53
- font form h1 h2 h3 h4 h5 h6 header hr i img input ins kbd label
54
- legend li main map mark menu meter nav ol output optgroup option p
55
- pre q s samp section select small span strike strong sub summary
56
- sup table tbody td textarea tfoot th thead time tr tt u ul var
57
- video]
58
-
59
- MATHML_ELEMENTS = Set.new %w[annotation annotation-xml maction math merror mfrac
60
- mfenced mi mmultiscripts mn mo mover mpadded mphantom mprescripts mroot mrow
61
- mspace msqrt mstyle msub msubsup msup mtable mtd mtext mtr munder
62
- munderover none semantics]
63
-
64
- SVG_ELEMENTS = Set.new %w[a animate animateColor animateMotion animateTransform
65
- circle clipPath defs desc ellipse feGaussianBlur filter font-face
66
- font-face-name font-face-src foreignObject
67
- g glyph hkern linearGradient line marker mask metadata missing-glyph
68
- mpath path polygon polyline radialGradient rect set stop svg switch symbol
69
- text textPath title tspan use]
70
-
71
- ACCEPTABLE_ATTRIBUTES = Set.new %w[abbr accept accept-charset accesskey action
72
- align alt axis border cellpadding cellspacing char charoff charset
73
- checked cite class clear cols colspan color compact coords datetime
74
- dir disabled enctype for frame headers height href hreflang hspace id
75
- ismap label lang longdesc loop loopcount loopend loopstart
76
- maxlength media method multiple name nohref
77
- noshade nowrap poster preload prompt readonly rel rev rows rowspan rules scope
78
- selected shape size span src start style summary tabindex target title
79
- type usemap valign value vspace width xml:lang]
80
-
81
- MATHML_ATTRIBUTES = Set.new %w[actiontype align close
82
- columnalign columnlines columnspacing columnspan depth display
83
- displaystyle encoding equalcolumns equalrows fence fontstyle fontweight
84
- frame height linethickness lspace mathbackground mathcolor mathvariant
85
- maxsize minsize open other rowalign rowlines
86
- rowspacing rowspan rspace scriptlevel selection separator separators
87
- stretchy width xlink:href xlink:show xlink:type xmlns xmlns:xlink]
88
-
89
- SVG_ATTRIBUTES = Set.new %w[accent-height accumulate additive alphabetic
90
- arabic-form ascent attributeName attributeType baseProfile bbox begin
91
- by calcMode cap-height class clip-path clip-rule color
92
- color-interpolation-filters color-rendering content cx cy d dx
93
- dy descent display dur end fill fill-opacity fill-rule
94
- filterRes filterUnits font-family
95
- font-size font-stretch font-style font-variant font-weight fx fy g1
96
- g2 glyph-name gradientUnits hanging height horiz-adv-x horiz-origin-x id
97
- ideographic k keyPoints keySplines keyTimes lang marker-end
98
- marker-mid marker-start markerHeight markerUnits markerWidth
99
- maskContentUnits maskUnits mathematical max method min name offset opacity orient origin
100
- overline-position overline-thickness panose-1 path pathLength
101
- patternContentUnits patternTransform patternUnits points
102
- preserveAspectRatio primitiveUnits r refX refY repeatCount repeatDur
103
- requiredExtensions requiredFeatures restart rotate rx ry slope spacing
104
- startOffset stdDeviation stemh
105
- stemv stop-color stop-opacity strikethrough-position
106
- strikethrough-thickness stroke stroke-dasharray stroke-dashoffset
107
- stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity
108
- stroke-width systemLanguage target text-anchor to transform type u1
109
- u2 underline-position underline-thickness unicode unicode-range
110
- units-per-em values version viewBox visibility width widths x
111
- x-height x1 x2 xlink:actuate xlink:arcrole xlink:href xlink:role
112
- xlink:show xlink:title xlink:type xml:base xml:lang xml:space xmlns
113
- xmlns:xlink y y1 y2 zoomAndPan]
114
-
115
- ATTR_VAL_IS_URI = Set.new %w[href src cite action longdesc xlink:href xml:base poster preload]
116
-
117
- SVG_ATTR_VAL_ALLOWS_REF = Set.new %w[clip-path color-profile cursor fill
118
- filter marker marker-start marker-mid marker-end mask stroke]
119
-
120
- SVG_ALLOW_LOCAL_HREF = Set.new %w[altGlyph animate animateColor animateMotion
121
- animateTransform cursor feImage filter linearGradient pattern
122
- radialGradient textpath tref set use]
123
-
124
- ACCEPTABLE_CSS_PROPERTIES = Set.new %w[azimuth background-color
125
- border-bottom-color border-collapse border-color border-left-color
126
- border-right-color border-top-color clear color cursor direction
127
- display elevation float font font-family font-size font-style
128
- font-variant font-weight height letter-spacing line-height list-style-type
129
- overflow pause pause-after pause-before pitch pitch-range richness speak
130
- speak-header speak-numeral speak-punctuation speech-rate stress
131
- text-align text-decoration text-indent unicode-bidi vertical-align
132
- voice-family volume white-space width]
133
-
134
- ACCEPTABLE_CSS_KEYWORDS = Set.new %w[auto aqua black block blue bold both bottom
135
- brown center collapse dashed dotted fuchsia gray green !important
136
- italic left lime maroon medium none navy normal nowrap olive pointer
137
- purple red right solid silver teal top transparent underline white
138
- yellow]
139
-
140
- ACCEPTABLE_CSS_FUNCTIONS = Set.new %w[calc rgb]
141
-
142
- SHORTHAND_CSS_PROPERTIES = Set.new %w[background border margin padding]
143
-
144
- ACCEPTABLE_SVG_PROPERTIES = Set.new %w[fill fill-opacity fill-rule stroke
145
- stroke-width stroke-linecap stroke-linejoin stroke-opacity]
146
-
147
- PROTOCOL_SEPARATOR = /:|(&#0*58)|(&#x70)|(&#x0*3a)|(%|&#37;)3A/i
148
-
149
- ACCEPTABLE_PROTOCOLS = Set.new %w[ed2k ftp http https irc mailto news gopher nntp
150
- telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs data]
151
-
152
- ACCEPTABLE_URI_DATA_MEDIATYPES = Set.new %w[text/plain text/css image/png image/gif
153
- image/jpeg image/svg+xml]
154
-
155
- # subclasses may define their own versions of these constants
156
- ALLOWED_ELEMENTS = ACCEPTABLE_ELEMENTS + MATHML_ELEMENTS + SVG_ELEMENTS
157
- ALLOWED_ATTRIBUTES = ACCEPTABLE_ATTRIBUTES + MATHML_ATTRIBUTES + SVG_ATTRIBUTES
158
- ALLOWED_CSS_PROPERTIES = ACCEPTABLE_CSS_PROPERTIES
159
- ALLOWED_CSS_KEYWORDS = ACCEPTABLE_CSS_KEYWORDS
160
- ALLOWED_CSS_FUNCTIONS = ACCEPTABLE_CSS_FUNCTIONS
161
- ALLOWED_SVG_PROPERTIES = ACCEPTABLE_SVG_PROPERTIES
162
- ALLOWED_PROTOCOLS = ACCEPTABLE_PROTOCOLS
163
- ALLOWED_URI_DATA_MEDIATYPES = ACCEPTABLE_URI_DATA_MEDIATYPES
164
-
165
- VOID_ELEMENTS = Set.new %w[
166
- base
167
- link
168
- meta
169
- hr
170
- br
171
- img
172
- embed
173
- param
174
- area
175
- col
176
- input
177
- ]
178
-
179
- # additional tags we should consider safe since we have libxml2 fixing up our documents.
180
- TAGS_SAFE_WITH_LIBXML2 = Set.new %w[html head body]
181
- ALLOWED_ELEMENTS_WITH_LIBXML2 = ALLOWED_ELEMENTS + TAGS_SAFE_WITH_LIBXML2
182
- end
183
-
184
- ::Loofah::MetaHelpers.add_downcased_set_members_to_all_set_constants ::Loofah::HTML5::WhiteList
185
- end
186
- end
@@ -1,127 +0,0 @@
1
- module Loofah
2
- #
3
- # Mixes +scrub!+ into Document, DocumentFragment, Node and NodeSet.
4
- #
5
- # Traverse the document or fragment, invoking the +scrubber+ on
6
- # each node.
7
- #
8
- # +scrubber+ must either be one of the symbols representing the
9
- # built-in scrubbers (see Scrubbers), or a Scrubber instance.
10
- #
11
- # span2div = Loofah::Scrubber.new do |node|
12
- # node.name = "div" if node.name == "span"
13
- # end
14
- # Loofah.fragment("<span>foo</span><p>bar</p>").scrub!(span2div).to_s
15
- # # => "<div>foo</div><p>bar</p>"
16
- #
17
- # or
18
- #
19
- # unsafe_html = "ohai! <div>div is safe</div> <script>but script is not</script>"
20
- # Loofah.fragment(unsafe_html).scrub!(:strip).to_s
21
- # # => "ohai! <div>div is safe</div> "
22
- #
23
- # Note that this method is called implicitly from
24
- # Loofah.scrub_fragment and Loofah.scrub_document.
25
- #
26
- # Please see Scrubber for more information on implementation and traversal, and
27
- # README.rdoc for more example usage.
28
- #
29
- module ScrubBehavior
30
- module Node # :nodoc:
31
- def scrub!(scrubber)
32
- #
33
- # yes. this should be three separate methods. but nokogiri
34
- # decorates (or not) based on whether the module name has
35
- # already been included. and since documents get decorated
36
- # just like their constituent nodes, we need to jam all the
37
- # logic into a single module.
38
- #
39
- scrubber = ScrubBehavior.resolve_scrubber(scrubber)
40
- case self
41
- when Nokogiri::XML::Document
42
- scrubber.traverse(root) if root
43
- when Nokogiri::XML::DocumentFragment
44
- children.scrub! scrubber
45
- else
46
- scrubber.traverse(self)
47
- end
48
- self
49
- end
50
- end
51
-
52
- module NodeSet # :nodoc:
53
- def scrub!(scrubber)
54
- each { |node| node.scrub!(scrubber) }
55
- self
56
- end
57
- end
58
-
59
- def ScrubBehavior.resolve_scrubber(scrubber) # :nodoc:
60
- scrubber = Scrubbers::MAP[scrubber].new if Scrubbers::MAP[scrubber]
61
- unless scrubber.is_a?(Loofah::Scrubber)
62
- raise Loofah::ScrubberNotFound, "not a Scrubber or a scrubber name: #{scrubber.inspect}"
63
- end
64
- scrubber
65
- end
66
- end
67
-
68
- #
69
- # Overrides +text+ in HTML::Document and HTML::DocumentFragment,
70
- # and mixes in +to_text+.
71
- #
72
- module TextBehavior
73
- #
74
- # Returns a plain-text version of the markup contained by the document,
75
- # with HTML entities encoded.
76
- #
77
- # This method is significantly faster than #to_text, but isn't
78
- # clever about whitespace around block elements.
79
- #
80
- # Loofah.document("<h1>Title</h1><div>Content</div>").text
81
- # # => "TitleContent"
82
- #
83
- # By default, the returned text will have HTML entities
84
- # escaped. If you want unescaped entities, and you understand
85
- # that the result is unsafe to render in a browser, then you
86
- # can pass an argument as shown:
87
- #
88
- # frag = Loofah.fragment("&lt;script&gt;alert('EVIL');&lt;/script&gt;")
89
- # # ok for browser:
90
- # frag.text # => "&lt;script&gt;alert('EVIL');&lt;/script&gt;"
91
- # # decidedly not ok for browser:
92
- # frag.text(:encode_special_chars => false) # => "<script>alert('EVIL');</script>"
93
- #
94
- def text(options={})
95
- result = serialize_root.children.inner_text rescue ""
96
- if options[:encode_special_chars] == false
97
- result # possibly dangerous if rendered in a browser
98
- else
99
- encode_special_chars result
100
- end
101
- end
102
- alias :inner_text :text
103
- alias :to_str :text
104
-
105
- #
106
- # Returns a plain-text version of the markup contained by the
107
- # fragment, with HTML entities encoded.
108
- #
109
- # This method is slower than #to_text, but is clever about
110
- # whitespace around block elements.
111
- #
112
- # Loofah.document("<h1>Title</h1><div>Content</div>").to_text
113
- # # => "\nTitle\n\nContent\n"
114
- #
115
- def to_text(options={})
116
- Loofah.remove_extraneous_whitespace self.dup.scrub!(:newline_block_elements).text(options)
117
- end
118
- end
119
-
120
- module DocumentDecorator # :nodoc:
121
- def initialize(*args, &block)
122
- super
123
- self.decorators(Nokogiri::XML::Node) << ScrubBehavior::Node
124
- self.decorators(Nokogiri::XML::NodeSet) << ScrubBehavior::NodeSet
125
- end
126
- end
127
- end
@@ -1,63 +0,0 @@
1
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta name="ProgId" content="Word.Document"><meta name="Generator" content="Microsoft Word 11"><meta name="Originator" content="Microsoft Word 11"><link rel="File-List" href="file:///C:%5CDOCUME%7E1%5CNICOLE%7E1%5CLOCALS%7E1%5CTemp%5Cmsohtml1%5C01%5Cclip_filelist.xml"><!--[if gte mso 9]><xml>
2
- <w:WordDocument>
3
- <w:View>Normal</w:View>
4
- <w:Zoom>0</w:Zoom>
5
- <w:PunctuationKerning/>
6
- <w:ValidateAgainstSchemas/>
7
- <w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
8
- <w:IgnoreMixedContent>false</w:IgnoreMixedContent>
9
- <w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
10
- <w:Compatibility>
11
- <w:BreakWrappedTables/>
12
- <w:SnapToGridInCell/>
13
- <w:WrapTextWithPunct/>
14
- <w:UseAsianBreakRules/>
15
- <w:DontGrowAutofit/>
16
- </w:Compatibility>
17
- <w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>
18
- </w:WordDocument>
19
- </xml><![endif]--><!--[if gte mso 9]><xml>
20
- <w:LatentStyles DefLockedState="false" LatentStyleCount="156">
21
- </w:LatentStyles>
22
- </xml><![endif]--><style>
23
- <!--
24
- /* Style Definitions */
25
- p.MsoNormal, li.MsoNormal, div.MsoNormal
26
- {mso-style-parent:"";
27
- margin:0in;
28
- margin-bottom:.0001pt;
29
- mso-pagination:widow-orphan;
30
- font-size:12.0pt;
31
- font-family:"Times New Roman";
32
- mso-fareast-font-family:"Times New Roman";}
33
- @page Section1
34
- {size:8.5in 11.0in;
35
- margin:1.0in 1.25in 1.0in 1.25in;
36
- mso-header-margin:.5in;
37
- mso-footer-margin:.5in;
38
- mso-paper-source:0;}
39
- div.Section1
40
- {page:Section1;}
41
- -->
42
- </style><!--[if gte mso 10]>
43
- <style>
44
- /* Style Definitions */
45
- table.MsoNormalTable
46
- {mso-style-name:"Table Normal";
47
- mso-tstyle-rowband-size:0;
48
- mso-tstyle-colband-size:0;
49
- mso-style-noshow:yes;
50
- mso-style-parent:"";
51
- mso-padding-alt:0in 5.4pt 0in 5.4pt;
52
- mso-para-margin:0in;
53
- mso-para-margin-bottom:.0001pt;
54
- mso-pagination:widow-orphan;
55
- font-size:10.0pt;
56
- font-family:"Times New Roman";
57
- mso-ansi-language:#0400;
58
- mso-fareast-language:#0400;
59
- mso-bidi-language:#0400;}
60
- </style>
61
- <![endif]-->
62
-
63
- <p class="MsoNormal">Foo <b style="">BOLD<o:p></o:p></b></p>