mdalessio-dryopteris 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +10 -3
- data/lib/dryopteris/sanitize.rb +17 -1
- data/test/test_basic.rb +82 -70
- metadata +1 -1
data/README.markdown
CHANGED
@@ -24,7 +24,7 @@ Yeah, it's that easy.
|
|
24
24
|
In this example, <tt>safe\_html\_snippet</tt> will have all of its __broken markup fixed__ by libxml2, and it will also be completely __sanitized of harmful tags and attributes__. That's twice as clean!
|
25
25
|
|
26
26
|
|
27
|
-
|
27
|
+
Sanitization Usage
|
28
28
|
-----
|
29
29
|
|
30
30
|
You're still here? Ok, let me tell you a little something about the two different methods of sanitizing that Dryopteris offers.
|
@@ -51,9 +51,12 @@ The returned string will contain exactly one (1) well-formed HTML document, with
|
|
51
51
|
|
52
52
|
Coolness: <tt>dangerous\_html\_document</tt> can be a string OR an IO object (a file, or a socket, or ...), which makes it particularly easy to sanitize large numbers of docs.
|
53
53
|
|
54
|
-
|
54
|
+
Whitewashing Usage
|
55
|
+
-----
|
56
|
+
|
57
|
+
### Whitewashing Fragments
|
55
58
|
|
56
|
-
Other times, you may want to
|
59
|
+
Other times, you may want to remove all styling, attributes and invalid HTML tags. I like to call this "whitewashing", since it's putting a new layer of paint on top of the HTML input to make it look nice.
|
57
60
|
|
58
61
|
One use case for this feature is to clean up HTML that was cut-and-pasted from Microsoft(tm) Word into a WYSIWYG editor/textarea. Microsoft's editor is famous for injecting all kinds of cruft into its HTML output. Who needs that? Certainly not me.
|
59
62
|
|
@@ -65,6 +68,10 @@ Please note that whitewashing implicitly also sanitizes your HTML, as it uses th
|
|
65
68
|
2. if the tag has an XML namespace on it, remove it from the document
|
66
69
|
2. remove all attributes from the node
|
67
70
|
|
71
|
+
### Whitewashing Documents
|
72
|
+
|
73
|
+
Also note the existence of <tt>whitewash\_document</tt>, which is analogous to <tt>sanitize\_document</tt>.
|
74
|
+
|
68
75
|
Standing on the Shoulders of Giants
|
69
76
|
-----
|
70
77
|
|
data/lib/dryopteris/sanitize.rb
CHANGED
@@ -18,7 +18,22 @@ module Dryopteris
|
|
18
18
|
body_element.inner_text
|
19
19
|
end
|
20
20
|
|
21
|
-
|
21
|
+
|
22
|
+
def whitewash(string, encoding=nil)
|
23
|
+
return nil if string.nil?
|
24
|
+
return "" if string.strip.size == 0
|
25
|
+
|
26
|
+
string = "<html><body>" + string + "</body></html>"
|
27
|
+
doc = Nokogiri::HTML.parse(string, nil, encoding)
|
28
|
+
body = doc.xpath("/html/body").first
|
29
|
+
return "" if body.nil?
|
30
|
+
body.children.each do |node|
|
31
|
+
traverse_conditionally_top_down(node, :whitewash_node)
|
32
|
+
end
|
33
|
+
body.children.map { |x| x.to_xml }.join
|
34
|
+
end
|
35
|
+
|
36
|
+
def whitewash_document(string_or_io, encoding=nil)
|
22
37
|
return nil if string_or_io.nil?
|
23
38
|
return "" if string_or_io.strip.size == 0
|
24
39
|
|
@@ -31,6 +46,7 @@ module Dryopteris
|
|
31
46
|
body.children.map { |x| x.to_xml }.join
|
32
47
|
end
|
33
48
|
|
49
|
+
|
34
50
|
def sanitize(string, encoding=nil)
|
35
51
|
return nil if string.nil?
|
36
52
|
return "" if string.strip.size == 0
|
data/test/test_basic.rb
CHANGED
@@ -1,7 +1,79 @@
|
|
1
1
|
require File.expand_path(File.join(File.dirname(__FILE__), 'helper'))
|
2
2
|
|
3
|
+
if defined? Nokogiri::VERSION_INFO
|
4
|
+
puts "=> running with Nokogiri #{Nokogiri::VERSION_INFO.inspect}"
|
5
|
+
else
|
6
|
+
puts "=> running with Nokogiri #{Nokogiri::VERSION} / libxml #{Nokogiri::LIBXML_PARSER_VERSION}"
|
7
|
+
end
|
8
|
+
|
3
9
|
class TestBasic < Test::Unit::TestCase
|
4
10
|
|
11
|
+
MSWORD_HTML = <<-EOHTML
|
12
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta name="ProgId" content="Word.Document"><meta name="Generator" content="Microsoft Word 11"><meta name="Originator" content="Microsoft Word 11"><link rel="File-List" href="file:///C:%5CDOCUME%7E1%5CNICOLE%7E1%5CLOCALS%7E1%5CTemp%5Cmsohtml1%5C01%5Cclip_filelist.xml"><!--[if gte mso 9]><xml>
|
13
|
+
<w:WordDocument>
|
14
|
+
<w:View>Normal</w:View>
|
15
|
+
<w:Zoom>0</w:Zoom>
|
16
|
+
<w:PunctuationKerning/>
|
17
|
+
<w:ValidateAgainstSchemas/>
|
18
|
+
<w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
|
19
|
+
<w:IgnoreMixedContent>false</w:IgnoreMixedContent>
|
20
|
+
<w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
|
21
|
+
<w:Compatibility>
|
22
|
+
<w:BreakWrappedTables/>
|
23
|
+
<w:SnapToGridInCell/>
|
24
|
+
<w:WrapTextWithPunct/>
|
25
|
+
<w:UseAsianBreakRules/>
|
26
|
+
<w:DontGrowAutofit/>
|
27
|
+
</w:Compatibility>
|
28
|
+
<w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>
|
29
|
+
</w:WordDocument>
|
30
|
+
</xml><![endif]--><!--[if gte mso 9]><xml>
|
31
|
+
<w:LatentStyles DefLockedState="false" LatentStyleCount="156">
|
32
|
+
</w:LatentStyles>
|
33
|
+
</xml><![endif]--><style>
|
34
|
+
<!--
|
35
|
+
/* Style Definitions */
|
36
|
+
p.MsoNormal, li.MsoNormal, div.MsoNormal
|
37
|
+
{mso-style-parent:"";
|
38
|
+
margin:0in;
|
39
|
+
margin-bottom:.0001pt;
|
40
|
+
mso-pagination:widow-orphan;
|
41
|
+
font-size:12.0pt;
|
42
|
+
font-family:"Times New Roman";
|
43
|
+
mso-fareast-font-family:"Times New Roman";}
|
44
|
+
@page Section1
|
45
|
+
{size:8.5in 11.0in;
|
46
|
+
margin:1.0in 1.25in 1.0in 1.25in;
|
47
|
+
mso-header-margin:.5in;
|
48
|
+
mso-footer-margin:.5in;
|
49
|
+
mso-paper-source:0;}
|
50
|
+
div.Section1
|
51
|
+
{page:Section1;}
|
52
|
+
-->
|
53
|
+
</style><!--[if gte mso 10]>
|
54
|
+
<style>
|
55
|
+
/* Style Definitions */
|
56
|
+
table.MsoNormalTable
|
57
|
+
{mso-style-name:"Table Normal";
|
58
|
+
mso-tstyle-rowband-size:0;
|
59
|
+
mso-tstyle-colband-size:0;
|
60
|
+
mso-style-noshow:yes;
|
61
|
+
mso-style-parent:"";
|
62
|
+
mso-padding-alt:0in 5.4pt 0in 5.4pt;
|
63
|
+
mso-para-margin:0in;
|
64
|
+
mso-para-margin-bottom:.0001pt;
|
65
|
+
mso-pagination:widow-orphan;
|
66
|
+
font-size:10.0pt;
|
67
|
+
font-family:"Times New Roman";
|
68
|
+
mso-ansi-language:#0400;
|
69
|
+
mso-fareast-language:#0400;
|
70
|
+
mso-bidi-language:#0400;}
|
71
|
+
</style>
|
72
|
+
<![endif]-->
|
73
|
+
|
74
|
+
<p class="MsoNormal">Foo <b style="">BOLD<o:p></o:p></b></p>
|
75
|
+
EOHTML
|
76
|
+
|
5
77
|
def test_nil
|
6
78
|
assert_nil Dryopteris.sanitize(nil)
|
7
79
|
end
|
@@ -67,79 +139,19 @@ class TestBasic < Test::Unit::TestCase
|
|
67
139
|
assert_equal "text<p>fragment</p>text", Dryopteris.sanitize("text<p>fragment</p>text")
|
68
140
|
end
|
69
141
|
|
70
|
-
def
|
71
|
-
|
72
|
-
|
142
|
+
def test_whitewash_on_fragment
|
143
|
+
html = "safe<frameset rows=\"*\"><frame src=\"http://example.com\"></frameset> <b>description</b>"
|
144
|
+
whitewashed = Dryopteris.whitewash_document(html)
|
145
|
+
assert_equal "<p>safe</p><b>description</b>", whitewashed
|
73
146
|
end
|
74
147
|
|
75
|
-
def
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
<w:View>Normal</w:View>
|
80
|
-
<w:Zoom>0</w:Zoom>
|
81
|
-
<w:PunctuationKerning/>
|
82
|
-
<w:ValidateAgainstSchemas/>
|
83
|
-
<w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
|
84
|
-
<w:IgnoreMixedContent>false</w:IgnoreMixedContent>
|
85
|
-
<w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
|
86
|
-
<w:Compatibility>
|
87
|
-
<w:BreakWrappedTables/>
|
88
|
-
<w:SnapToGridInCell/>
|
89
|
-
<w:WrapTextWithPunct/>
|
90
|
-
<w:UseAsianBreakRules/>
|
91
|
-
<w:DontGrowAutofit/>
|
92
|
-
</w:Compatibility>
|
93
|
-
<w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>
|
94
|
-
</w:WordDocument>
|
95
|
-
</xml><![endif]--><!--[if gte mso 9]><xml>
|
96
|
-
<w:LatentStyles DefLockedState="false" LatentStyleCount="156">
|
97
|
-
</w:LatentStyles>
|
98
|
-
</xml><![endif]--><style>
|
99
|
-
<!--
|
100
|
-
/* Style Definitions */
|
101
|
-
p.MsoNormal, li.MsoNormal, div.MsoNormal
|
102
|
-
{mso-style-parent:"";
|
103
|
-
margin:0in;
|
104
|
-
margin-bottom:.0001pt;
|
105
|
-
mso-pagination:widow-orphan;
|
106
|
-
font-size:12.0pt;
|
107
|
-
font-family:"Times New Roman";
|
108
|
-
mso-fareast-font-family:"Times New Roman";}
|
109
|
-
@page Section1
|
110
|
-
{size:8.5in 11.0in;
|
111
|
-
margin:1.0in 1.25in 1.0in 1.25in;
|
112
|
-
mso-header-margin:.5in;
|
113
|
-
mso-footer-margin:.5in;
|
114
|
-
mso-paper-source:0;}
|
115
|
-
div.Section1
|
116
|
-
{page:Section1;}
|
117
|
-
-->
|
118
|
-
</style><!--[if gte mso 10]>
|
119
|
-
<style>
|
120
|
-
/* Style Definitions */
|
121
|
-
table.MsoNormalTable
|
122
|
-
{mso-style-name:"Table Normal";
|
123
|
-
mso-tstyle-rowband-size:0;
|
124
|
-
mso-tstyle-colband-size:0;
|
125
|
-
mso-style-noshow:yes;
|
126
|
-
mso-style-parent:"";
|
127
|
-
mso-padding-alt:0in 5.4pt 0in 5.4pt;
|
128
|
-
mso-para-margin:0in;
|
129
|
-
mso-para-margin-bottom:.0001pt;
|
130
|
-
mso-pagination:widow-orphan;
|
131
|
-
font-size:10.0pt;
|
132
|
-
font-family:"Times New Roman";
|
133
|
-
mso-ansi-language:#0400;
|
134
|
-
mso-fareast-language:#0400;
|
135
|
-
mso-bidi-language:#0400;}
|
136
|
-
</style>
|
137
|
-
<![endif]-->
|
138
|
-
|
139
|
-
<p class="MsoNormal">Foo <b style="">BOLD<o:p></o:p></b></p>
|
140
|
-
EOHTML
|
148
|
+
def test_whitewash_fragment_on_microsofty_markup
|
149
|
+
whitewashed = Dryopteris.whitewash(MSWORD_HTML.chomp)
|
150
|
+
assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed
|
151
|
+
end
|
141
152
|
|
142
|
-
|
153
|
+
def test_whitewash_on_microsofty_markup
|
154
|
+
whitewashed = Dryopteris.whitewash_document(MSWORD_HTML)
|
143
155
|
assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed
|
144
156
|
end
|
145
157
|
|