mdalessio-dryopteris 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +13 -0
- data/lib/dryopteris/sanitize.rb +40 -0
- data/test/test_basic.rb +80 -0
- metadata +2 -1
data/README.markdown
CHANGED
@@ -51,6 +51,19 @@ The returned string will contain exactly one (1) well-formed HTML document, with
|
|
51
51
|
|
52
52
|
Coolness: <tt>dangerous\_html\_document</tt> can be a string OR an IO object (a file, or a socket, or ...). Which makes it particularly easy to sanitize large numbers of docs.
|
53
53
|
|
54
|
+
### Whitewashing HTML
|
55
|
+
|
56
|
+
Other times, you may want to allow a user to submit HTML, and remove all styling, attributes and invalid HTML tags. I like to call this "whitewashing", since it's putting a new layer of paint on top of the user's HTML input to make it look nice.
|
57
|
+
|
58
|
+
One use case for this feature is to clean up HTML that was cut-and-pasted from Microsoft(tm) Word into a WYSIWYG editor/textarea. Microsoft's editor is famous for injecting all kinds of cruft into its HTML output. Who needs that? Certainly not me.
|
59
|
+
|
60
|
+
whitewashed_html = Dryopteris.whitewash(ugly_microsoft_html_snippet)
|
61
|
+
|
62
|
+
Please note that whitewashing implicitly also sanitizes your HTML, as it uses the same HTML tag whitelist as <tt>sanitize()</tt>. Its implementation is:
|
63
|
+
|
64
|
+
1. unless the tag is on the whitelist, remove it from the document
|
65
|
+
2. if the tag has an XML namespace on it, remove it from the document
|
66
|
+
3. remove all attributes from the node
|
54
67
|
|
55
68
|
Standing on the Shoulders of Giants
|
56
69
|
-----
|
data/lib/dryopteris/sanitize.rb
CHANGED
@@ -18,6 +18,19 @@ module Dryopteris
|
|
18
18
|
body_element.inner_text
|
19
19
|
end
|
20
20
|
|
21
|
+
# Strips user-supplied HTML down to bare whitelisted markup ("whitewashing").
#
# The input is parsed as an HTML document, every child of <body> is walked
# top-down with whitewash_node as the visitor, and the body children that
# survive are serialized and concatenated.
#
# string_or_io - the HTML to clean, as a String or an IO-like object that is
#                handed to Nokogiri::HTML.parse unchanged.
# encoding     - optional encoding passed through to Nokogiri::HTML.parse.
#
# Returns nil for nil input, "" for a blank String or when the parsed
# document has no <body>, otherwise the whitewashed markup as a String.
def whitewash(string_or_io, encoding=nil)
  return nil if string_or_io.nil?
  # Only String-like input can be tested for blankness; IO objects do not
  # respond to #strip, so they go straight to the parser instead of raising
  # NoMethodError here.
  return "" if string_or_io.respond_to?(:strip) && string_or_io.strip.empty?

  doc = Nokogiri::HTML.parse(string_or_io, nil, encoding)
  body = doc.xpath("/html/body").first
  return "" if body.nil?

  body.children.each do |node|
    traverse_conditionally_top_down(node, :whitewash_node)
  end
  body.children.map { |x| x.to_xml }.join
end
|
33
|
+
|
21
34
|
def sanitize(string, encoding=nil)
|
22
35
|
return nil if string.nil?
|
23
36
|
return "" if string.strip.size == 0
|
@@ -46,6 +59,7 @@ module Dryopteris
|
|
46
59
|
end
|
47
60
|
|
48
61
|
private
|
62
|
+
|
49
63
|
def traverse_conditionally_top_down(node, method_name)
|
50
64
|
return if send(method_name, node)
|
51
65
|
node.children.each {|j| traverse_conditionally_top_down(j, method_name)}
|
@@ -91,6 +105,32 @@ module Dryopteris
|
|
91
105
|
end
|
92
106
|
|
93
107
|
|
108
|
+
# Visitor used while whitewashing: decides the fate of a single node.
#
# Text and CDATA nodes are kept as they are. An element is kept only when its
# name is in HashedWhiteList::ALLOWED_ELEMENTS and it carries no namespaces;
# a whitelisted element has all of its attributes removed either way.
# Every other node is removed from the document.
#
# Returns false when the node was kept, true when it was removed. The
# traversal only descends into the children of kept nodes.
def whitewash_node(node)
  node_type = node.type

  # 3 = Nokogiri::XML::Node::TEXT_NODE, 4 = Nokogiri::XML::Node::CDATA_SECTION_NODE
  return false if node_type == 3 || node_type == 4

  # 1 = Nokogiri::XML::Node::ELEMENT_NODE
  if node_type == 1 && HashedWhiteList::ALLOWED_ELEMENTS[node.name]
    node.attributes.each { |attr| node.remove_attribute(attr.first) }

    namespace_free =
      begin
        node.namespaces.empty?
      rescue
        # older versions of nokogiri raise an exception when there
        # is a namespace on the node that is not declared with an href.
        # see http://github.com/tenderlove/nokogiri/commit/395d7971304e1489e92c494b9c50609f4b4c4ab0
        false
      end

    return false if namespace_free
  end

  node.remove
  true
end
|
132
|
+
|
133
|
+
|
94
134
|
# this is lifted nearly verbatim from html5
|
95
135
|
def sanitize_css(style)
|
96
136
|
# disallow urls
|
data/test/test_basic.rb
CHANGED
@@ -62,5 +62,85 @@ class TestBasic < Test::Unit::TestCase
|
|
62
62
|
def test_fragment_in_p_tag_plus_stuff
|
63
63
|
assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Dryopteris.sanitize("<p>This fragment is in a p.</p>foo<strong>bar</strong>")
|
64
64
|
end
|
65
|
+
|
66
|
+
# Bare text before and after an element must come through sanitize unchanged.
def test_fragment_with_text_nodes_leading_and_trailing
  sanitized = Dryopteris.sanitize("text<p>fragment</p>text")
  assert_equal "text<p>fragment</p>text", sanitized
end
|
65
69
|
|
70
|
+
# Stray <body> tags inside a fragment: the extra opening tag is ignored, the
# first closing tag ends the body, and whatever follows it is dropped.
def test_fragment_with_body_tags
  sanitized = Dryopteris.sanitize("text<body>fragment</body>text")
  assert_equal "textfragment", sanitized
end
|
74
|
+
|
75
|
+
def test_whitewash_on_microsofty_markup
|
76
|
+
html = <<-EOHTML
|
77
|
+
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta name="ProgId" content="Word.Document"><meta name="Generator" content="Microsoft Word 11"><meta name="Originator" content="Microsoft Word 11"><link rel="File-List" href="file:///C:%5CDOCUME%7E1%5CNICOLE%7E1%5CLOCALS%7E1%5CTemp%5Cmsohtml1%5C01%5Cclip_filelist.xml"><!--[if gte mso 9]><xml>
|
78
|
+
<w:WordDocument>
|
79
|
+
<w:View>Normal</w:View>
|
80
|
+
<w:Zoom>0</w:Zoom>
|
81
|
+
<w:PunctuationKerning/>
|
82
|
+
<w:ValidateAgainstSchemas/>
|
83
|
+
<w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
|
84
|
+
<w:IgnoreMixedContent>false</w:IgnoreMixedContent>
|
85
|
+
<w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
|
86
|
+
<w:Compatibility>
|
87
|
+
<w:BreakWrappedTables/>
|
88
|
+
<w:SnapToGridInCell/>
|
89
|
+
<w:WrapTextWithPunct/>
|
90
|
+
<w:UseAsianBreakRules/>
|
91
|
+
<w:DontGrowAutofit/>
|
92
|
+
</w:Compatibility>
|
93
|
+
<w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>
|
94
|
+
</w:WordDocument>
|
95
|
+
</xml><![endif]--><!--[if gte mso 9]><xml>
|
96
|
+
<w:LatentStyles DefLockedState="false" LatentStyleCount="156">
|
97
|
+
</w:LatentStyles>
|
98
|
+
</xml><![endif]--><style>
|
99
|
+
<!--
|
100
|
+
/* Style Definitions */
|
101
|
+
p.MsoNormal, li.MsoNormal, div.MsoNormal
|
102
|
+
{mso-style-parent:"";
|
103
|
+
margin:0in;
|
104
|
+
margin-bottom:.0001pt;
|
105
|
+
mso-pagination:widow-orphan;
|
106
|
+
font-size:12.0pt;
|
107
|
+
font-family:"Times New Roman";
|
108
|
+
mso-fareast-font-family:"Times New Roman";}
|
109
|
+
@page Section1
|
110
|
+
{size:8.5in 11.0in;
|
111
|
+
margin:1.0in 1.25in 1.0in 1.25in;
|
112
|
+
mso-header-margin:.5in;
|
113
|
+
mso-footer-margin:.5in;
|
114
|
+
mso-paper-source:0;}
|
115
|
+
div.Section1
|
116
|
+
{page:Section1;}
|
117
|
+
-->
|
118
|
+
</style><!--[if gte mso 10]>
|
119
|
+
<style>
|
120
|
+
/* Style Definitions */
|
121
|
+
table.MsoNormalTable
|
122
|
+
{mso-style-name:"Table Normal";
|
123
|
+
mso-tstyle-rowband-size:0;
|
124
|
+
mso-tstyle-colband-size:0;
|
125
|
+
mso-style-noshow:yes;
|
126
|
+
mso-style-parent:"";
|
127
|
+
mso-padding-alt:0in 5.4pt 0in 5.4pt;
|
128
|
+
mso-para-margin:0in;
|
129
|
+
mso-para-margin-bottom:.0001pt;
|
130
|
+
mso-pagination:widow-orphan;
|
131
|
+
font-size:10.0pt;
|
132
|
+
font-family:"Times New Roman";
|
133
|
+
mso-ansi-language:#0400;
|
134
|
+
mso-fareast-language:#0400;
|
135
|
+
mso-bidi-language:#0400;}
|
136
|
+
</style>
|
137
|
+
<![endif]-->
|
138
|
+
|
139
|
+
<p class="MsoNormal">Foo <b style="">BOLD<o:p></o:p></b></p>
|
140
|
+
EOHTML
|
141
|
+
|
142
|
+
whitewashed = Dryopteris.whitewash(html)
|
143
|
+
assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed
|
144
|
+
end
|
145
|
+
|
66
146
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mdalessio-dryopteris
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Bryan Helmkamp
|
@@ -15,6 +15,7 @@ default_executable:
|
|
15
15
|
dependencies:
|
16
16
|
- !ruby/object:Gem::Dependency
|
17
17
|
name: nokogiri
|
18
|
+
type: :runtime
|
18
19
|
version_requirement:
|
19
20
|
version_requirements: !ruby/object:Gem::Requirement
|
20
21
|
requirements:
|