mdalessio-dryopteris 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +13 -0
- data/lib/dryopteris/sanitize.rb +40 -0
- data/test/test_basic.rb +80 -0
- metadata +2 -1
data/README.markdown
CHANGED
@@ -51,6 +51,19 @@ The returned string will contain exactly one (1) well-formed HTML document, with
|
|
51
51
|
|
52
52
|
Coolness: <tt>dangerous\_html\_document</tt> can be a string OR an IO object (a file, or a socket, or ...). Which makes it particularly easy to sanitize large numbers of docs.
|
53
53
|
|
54
|
+
### Whitewashing HTML
|
55
|
+
|
56
|
+
Other times, you may want to allow a user to submit HTML, and remove all styling, attributes and invalid HTML tags. I like to call this "whitewashing", since it's putting a new layer of paint on top of the user's HTML input to make it look nice.
|
57
|
+
|
58
|
+
One use case for this feature is to clean up HTML that was cut-and-pasted from Microsoft(tm) Word into a WYSIWYG editor/textarea. Microsoft's editor is famous for injecting all kinds of cruft into its HTML output. Who needs that? Certainly not me.
|
59
|
+
|
60
|
+
whitewashed_html = Dryopteris.whitewash(ugly_microsoft_html_snippet)
|
61
|
+
|
62
|
+
Please note that whitewashing implicitly also sanitizes your HTML, as it uses the same HTML tag whitelist as <tt>sanitize()</tt>. Its implementation is:
|
63
|
+
|
64
|
+
1. unless the tag is on the whitelist, remove it from the document
|
65
|
+
2. if the tag has an XML namespace on it, remove it from the document
|
66
|
+
3. remove all attributes from the node
|
54
67
|
|
55
68
|
Standing on the Shoulders of Giants
|
56
69
|
-----
|
data/lib/dryopteris/sanitize.rb
CHANGED
@@ -18,6 +18,19 @@ module Dryopteris
|
|
18
18
|
body_element.inner_text
|
19
19
|
end
|
20
20
|
|
21
|
+
def whitewash(string_or_io, encoding=nil)
|
22
|
+
return nil if string_or_io.nil?
|
23
|
+
return "" if string_or_io.strip.size == 0
|
24
|
+
|
25
|
+
doc = Nokogiri::HTML.parse(string_or_io, nil, encoding)
|
26
|
+
body = doc.xpath("/html/body").first
|
27
|
+
return "" if body.nil?
|
28
|
+
body.children.each do |node|
|
29
|
+
traverse_conditionally_top_down(node, :whitewash_node)
|
30
|
+
end
|
31
|
+
body.children.map { |x| x.to_xml }.join
|
32
|
+
end
|
33
|
+
|
21
34
|
def sanitize(string, encoding=nil)
|
22
35
|
return nil if string.nil?
|
23
36
|
return "" if string.strip.size == 0
|
@@ -46,6 +59,7 @@ module Dryopteris
|
|
46
59
|
end
|
47
60
|
|
48
61
|
private
|
62
|
+
|
49
63
|
def traverse_conditionally_top_down(node, method_name)
|
50
64
|
return if send(method_name, node)
|
51
65
|
node.children.each {|j| traverse_conditionally_top_down(j, method_name)}
|
@@ -91,6 +105,32 @@ module Dryopteris
|
|
91
105
|
end
|
92
106
|
|
93
107
|
|
108
|
+
def whitewash_node(node)
|
109
|
+
case node.type
|
110
|
+
when 1 # Nokogiri::XML::Node::ELEMENT_NODE
|
111
|
+
if HashedWhiteList::ALLOWED_ELEMENTS[node.name]
|
112
|
+
node.attributes.each { |attr| node.remove_attribute(attr.first) }
|
113
|
+
has_no_namespaces = true
|
114
|
+
begin
|
115
|
+
has_no_namespaces = node.namespaces.empty?
|
116
|
+
rescue
|
117
|
+
# older versions of nokogiri raise an exception when there
|
118
|
+
# is a namespace on the node that is not declared with an href.
|
119
|
+
# see http://github.com/tenderlove/nokogiri/commit/395d7971304e1489e92c494b9c50609f4b4c4ab0
|
120
|
+
has_no_namespaces = false
|
121
|
+
end
|
122
|
+
return false if has_no_namespaces
|
123
|
+
end
|
124
|
+
when 3 # Nokogiri::XML::Node::TEXT_NODE
|
125
|
+
return false
|
126
|
+
when 4 # Nokogiri::XML::Node::CDATA_SECTION_NODE
|
127
|
+
return false
|
128
|
+
end
|
129
|
+
node.remove
|
130
|
+
return true
|
131
|
+
end
|
132
|
+
|
133
|
+
|
94
134
|
# this is lifted nearly verbatim from html5
|
95
135
|
def sanitize_css(style)
|
96
136
|
# disallow urls
|
data/test/test_basic.rb
CHANGED
@@ -62,5 +62,85 @@ class TestBasic < Test::Unit::TestCase
|
|
62
62
|
def test_fragment_in_p_tag_plus_stuff
|
63
63
|
assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Dryopteris.sanitize("<p>This fragment is in a p.</p>foo<strong>bar</strong>")
|
64
64
|
end
|
65
|
+
|
66
|
+
def test_fragment_with_text_nodes_leading_and_trailing
|
67
|
+
assert_equal "text<p>fragment</p>text", Dryopteris.sanitize("text<p>fragment</p>text")
|
68
|
+
end
|
65
69
|
|
70
|
+
def test_fragment_with_body_tags
|
71
|
+
# ignore second open body tag, use first close body tag, ignore everything after that
|
72
|
+
assert_equal "textfragment", Dryopteris.sanitize("text<body>fragment</body>text")
|
73
|
+
end
|
74
|
+
|
75
|
+
def test_whitewash_on_microsofty_markup
|
76
|
+
html = <<-EOHTML
|
77
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta name="ProgId" content="Word.Document"><meta name="Generator" content="Microsoft Word 11"><meta name="Originator" content="Microsoft Word 11"><link rel="File-List" href="file:///C:%5CDOCUME%7E1%5CNICOLE%7E1%5CLOCALS%7E1%5CTemp%5Cmsohtml1%5C01%5Cclip_filelist.xml"><!--[if gte mso 9]><xml>
|
78
|
+
<w:WordDocument>
|
79
|
+
<w:View>Normal</w:View>
|
80
|
+
<w:Zoom>0</w:Zoom>
|
81
|
+
<w:PunctuationKerning/>
|
82
|
+
<w:ValidateAgainstSchemas/>
|
83
|
+
<w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
|
84
|
+
<w:IgnoreMixedContent>false</w:IgnoreMixedContent>
|
85
|
+
<w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
|
86
|
+
<w:Compatibility>
|
87
|
+
<w:BreakWrappedTables/>
|
88
|
+
<w:SnapToGridInCell/>
|
89
|
+
<w:WrapTextWithPunct/>
|
90
|
+
<w:UseAsianBreakRules/>
|
91
|
+
<w:DontGrowAutofit/>
|
92
|
+
</w:Compatibility>
|
93
|
+
<w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>
|
94
|
+
</w:WordDocument>
|
95
|
+
</xml><![endif]--><!--[if gte mso 9]><xml>
|
96
|
+
<w:LatentStyles DefLockedState="false" LatentStyleCount="156">
|
97
|
+
</w:LatentStyles>
|
98
|
+
</xml><![endif]--><style>
|
99
|
+
<!--
|
100
|
+
/* Style Definitions */
|
101
|
+
p.MsoNormal, li.MsoNormal, div.MsoNormal
|
102
|
+
{mso-style-parent:"";
|
103
|
+
margin:0in;
|
104
|
+
margin-bottom:.0001pt;
|
105
|
+
mso-pagination:widow-orphan;
|
106
|
+
font-size:12.0pt;
|
107
|
+
font-family:"Times New Roman";
|
108
|
+
mso-fareast-font-family:"Times New Roman";}
|
109
|
+
@page Section1
|
110
|
+
{size:8.5in 11.0in;
|
111
|
+
margin:1.0in 1.25in 1.0in 1.25in;
|
112
|
+
mso-header-margin:.5in;
|
113
|
+
mso-footer-margin:.5in;
|
114
|
+
mso-paper-source:0;}
|
115
|
+
div.Section1
|
116
|
+
{page:Section1;}
|
117
|
+
-->
|
118
|
+
</style><!--[if gte mso 10]>
|
119
|
+
<style>
|
120
|
+
/* Style Definitions */
|
121
|
+
table.MsoNormalTable
|
122
|
+
{mso-style-name:"Table Normal";
|
123
|
+
mso-tstyle-rowband-size:0;
|
124
|
+
mso-tstyle-colband-size:0;
|
125
|
+
mso-style-noshow:yes;
|
126
|
+
mso-style-parent:"";
|
127
|
+
mso-padding-alt:0in 5.4pt 0in 5.4pt;
|
128
|
+
mso-para-margin:0in;
|
129
|
+
mso-para-margin-bottom:.0001pt;
|
130
|
+
mso-pagination:widow-orphan;
|
131
|
+
font-size:10.0pt;
|
132
|
+
font-family:"Times New Roman";
|
133
|
+
mso-ansi-language:#0400;
|
134
|
+
mso-fareast-language:#0400;
|
135
|
+
mso-bidi-language:#0400;}
|
136
|
+
</style>
|
137
|
+
<![endif]-->
|
138
|
+
|
139
|
+
<p class="MsoNormal">Foo <b style="">BOLD<o:p></o:p></b></p>
|
140
|
+
EOHTML
|
141
|
+
|
142
|
+
whitewashed = Dryopteris.whitewash(html)
|
143
|
+
assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed
|
144
|
+
end
|
145
|
+
|
66
146
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mdalessio-dryopteris
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bryan Helmkamp
|
@@ -15,6 +15,7 @@ default_executable:
|
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: nokogiri
|
18
|
+
type: :runtime
|
18
19
|
version_requirement:
|
19
20
|
version_requirements: !ruby/object:Gem::Requirement
|
20
21
|
requirements:
|