mdalessio-dryopteris 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.markdown CHANGED
@@ -51,6 +51,19 @@ The returned string will contain exactly one (1) well-formed HTML document, with
51
51
 
52
52
  Coolness: <tt>dangerous\_html\_document</tt> can be a string OR an IO object (a file, or a socket, or ...). Which makes it particularly easy to sanitize large numbers of docs.
53
53
 
54
+ ### Whitewashing HTML
55
+
56
+ Other times, you may want to allow a user to submit HTML, and remove all styling, attributes and invalid HTML tags. I like to call this "whitewashing", since it's putting a new layer of paint on top of the user's HTML input to make it look nice.
57
+
58
+ One use case for this feature is to clean up HTML that was cut-and-pasted from Microsoft(tm) Word into a WYSIWYG editor/textarea. Microsoft's editor is famous for injecting all kinds of cruft into its HTML output. Who needs that? Certainly not me.
59
+
60
+ whitewashed_html = Dryopteris.whitewash(ugly_microsoft_html_snippet)
61
+
62
+ Please note that whitewashing implicitly also sanitizes your HTML, as it uses the same HTML tag whitelist as <tt>sanitize()</tt>. Its implementation is:
63
+
64
+ 1. unless the tag is on the whitelist, remove it from the document
65
+ 2. if the tag has an XML namespace on it, remove it from the document
66
+ 3. remove all attributes from the node
54
67
 
55
68
  Standing on the Shoulders of Giants
56
69
  -----
@@ -18,6 +18,19 @@ module Dryopteris
18
18
  body_element.inner_text
19
19
  end
20
20
 
21
+ def whitewash(string_or_io, encoding=nil)
22
+ return nil if string_or_io.nil?
23
+ return "" if string_or_io.strip.size == 0
24
+
25
+ doc = Nokogiri::HTML.parse(string_or_io, nil, encoding)
26
+ body = doc.xpath("/html/body").first
27
+ return "" if body.nil?
28
+ body.children.each do |node|
29
+ traverse_conditionally_top_down(node, :whitewash_node)
30
+ end
31
+ body.children.map { |x| x.to_xml }.join
32
+ end
33
+
21
34
  def sanitize(string, encoding=nil)
22
35
  return nil if string.nil?
23
36
  return "" if string.strip.size == 0
@@ -46,6 +59,7 @@ module Dryopteris
46
59
  end
47
60
 
48
61
  private
62
+
49
63
  def traverse_conditionally_top_down(node, method_name)
50
64
  return if send(method_name, node)
51
65
  node.children.each {|j| traverse_conditionally_top_down(j, method_name)}
@@ -91,6 +105,32 @@ module Dryopteris
91
105
  end
92
106
 
93
107
 
108
+ def whitewash_node(node)
109
+ case node.type
110
+ when 1 # Nokogiri::XML::Node::ELEMENT_NODE
111
+ if HashedWhiteList::ALLOWED_ELEMENTS[node.name]
112
+ node.attributes.each { |attr| node.remove_attribute(attr.first) }
113
+ has_no_namespaces = true
114
+ begin
115
+ has_no_namespaces = node.namespaces.empty?
116
+ rescue
117
+ # older versions of nokogiri raise an exception when there
118
+ # is a namespace on the node that is not declared with an href.
119
+ # see http://github.com/tenderlove/nokogiri/commit/395d7971304e1489e92c494b9c50609f4b4c4ab0
120
+ has_no_namespaces = false
121
+ end
122
+ return false if has_no_namespaces
123
+ end
124
+ when 3 # Nokogiri::XML::Node::TEXT_NODE
125
+ return false
126
+ when 4 # Nokogiri::XML::Node::CDATA_SECTION_NODE
127
+ return false
128
+ end
129
+ node.remove
130
+ return true
131
+ end
132
+
133
+
94
134
  # this is lifted nearly verbatim from html5
95
135
  def sanitize_css(style)
96
136
  # disallow urls
data/test/test_basic.rb CHANGED
@@ -62,5 +62,85 @@ class TestBasic < Test::Unit::TestCase
62
62
  def test_fragment_in_p_tag_plus_stuff
63
63
  assert_equal "<p>This fragment is in a p.</p>foo<strong>bar</strong>", Dryopteris.sanitize("<p>This fragment is in a p.</p>foo<strong>bar</strong>")
64
64
  end
65
+
66
+ def test_fragment_with_text_nodes_leading_and_trailing
67
+ assert_equal "text<p>fragment</p>text", Dryopteris.sanitize("text<p>fragment</p>text")
68
+ end
65
69
 
70
+ def test_fragment_with_body_tags
71
+ # ignore second open body tag, use first close body tag, ignore everything after that
72
+ assert_equal "textfragment", Dryopteris.sanitize("text<body>fragment</body>text")
73
+ end
74
+
75
+ def test_whitewash_on_microsofty_markup
76
+ html = <<-EOHTML
77
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8"><meta name="ProgId" content="Word.Document"><meta name="Generator" content="Microsoft Word 11"><meta name="Originator" content="Microsoft Word 11"><link rel="File-List" href="file:///C:%5CDOCUME%7E1%5CNICOLE%7E1%5CLOCALS%7E1%5CTemp%5Cmsohtml1%5C01%5Cclip_filelist.xml"><!--[if gte mso 9]><xml>
78
+ <w:WordDocument>
79
+ <w:View>Normal</w:View>
80
+ <w:Zoom>0</w:Zoom>
81
+ <w:PunctuationKerning/>
82
+ <w:ValidateAgainstSchemas/>
83
+ <w:SaveIfXMLInvalid>false</w:SaveIfXMLInvalid>
84
+ <w:IgnoreMixedContent>false</w:IgnoreMixedContent>
85
+ <w:AlwaysShowPlaceholderText>false</w:AlwaysShowPlaceholderText>
86
+ <w:Compatibility>
87
+ <w:BreakWrappedTables/>
88
+ <w:SnapToGridInCell/>
89
+ <w:WrapTextWithPunct/>
90
+ <w:UseAsianBreakRules/>
91
+ <w:DontGrowAutofit/>
92
+ </w:Compatibility>
93
+ <w:BrowserLevel>MicrosoftInternetExplorer4</w:BrowserLevel>
94
+ </w:WordDocument>
95
+ </xml><![endif]--><!--[if gte mso 9]><xml>
96
+ <w:LatentStyles DefLockedState="false" LatentStyleCount="156">
97
+ </w:LatentStyles>
98
+ </xml><![endif]--><style>
99
+ <!--
100
+ /* Style Definitions */
101
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
102
+ {mso-style-parent:"";
103
+ margin:0in;
104
+ margin-bottom:.0001pt;
105
+ mso-pagination:widow-orphan;
106
+ font-size:12.0pt;
107
+ font-family:"Times New Roman";
108
+ mso-fareast-font-family:"Times New Roman";}
109
+ @page Section1
110
+ {size:8.5in 11.0in;
111
+ margin:1.0in 1.25in 1.0in 1.25in;
112
+ mso-header-margin:.5in;
113
+ mso-footer-margin:.5in;
114
+ mso-paper-source:0;}
115
+ div.Section1
116
+ {page:Section1;}
117
+ -->
118
+ </style><!--[if gte mso 10]>
119
+ <style>
120
+ /* Style Definitions */
121
+ table.MsoNormalTable
122
+ {mso-style-name:"Table Normal";
123
+ mso-tstyle-rowband-size:0;
124
+ mso-tstyle-colband-size:0;
125
+ mso-style-noshow:yes;
126
+ mso-style-parent:"";
127
+ mso-padding-alt:0in 5.4pt 0in 5.4pt;
128
+ mso-para-margin:0in;
129
+ mso-para-margin-bottom:.0001pt;
130
+ mso-pagination:widow-orphan;
131
+ font-size:10.0pt;
132
+ font-family:"Times New Roman";
133
+ mso-ansi-language:#0400;
134
+ mso-fareast-language:#0400;
135
+ mso-bidi-language:#0400;}
136
+ </style>
137
+ <![endif]-->
138
+
139
+ <p class="MsoNormal">Foo <b style="">BOLD<o:p></o:p></b></p>
140
+ EOHTML
141
+
142
+ whitewashed = Dryopteris.whitewash(html)
143
+ assert_equal "<p>Foo <b>BOLD</b></p>", whitewashed
144
+ end
145
+
66
146
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mdalessio-dryopteris
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Bryan Helmkamp
@@ -15,6 +15,7 @@ default_executable:
15
15
  dependencies:
16
16
  - !ruby/object:Gem::Dependency
17
17
  name: nokogiri
18
+ type: :runtime
18
19
  version_requirement:
19
20
  version_requirements: !ruby/object:Gem::Requirement
20
21
  requirements: