webget_ramp 1.7.1.2 → 1.7.1.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -66,6 +66,69 @@ module XML
66
66
  }
67
67
  end
68
68
 
69
+
70
+ # Sanitize dirty xml by removing unprintables, bad tags,
71
+ # comments, and generally anything else we might need
72
+ # to enable the XML parser to handle a dirty document.
73
+ #
74
+ # ==Example
75
+ # s="<foo a=b c=d><!--comment-->Hello<!-[if bar]>Microsoft<![endif]>World</foo>"
76
+ # XML.strip_all(s) => "<foo>HelloWorld</foo>"
77
+ #
78
+ # This method calls these in order:
79
+ # - XML.strip_unprintables
80
+ # - XML.strip_microsoft
81
+ # - XML.strip_comments
82
+ # - XML.strip_attributes
83
+
84
+ def XML.strip_all(xml_text)
85
+ return XML.strip_attributes(XML.strip_comments(XML.strip_microsoft(XML.strip_unprintables(xml_text))))
86
+ end
87
+
88
+
89
+ # Strip out all attributes from the xml text's tags.
90
+ #
91
+ # ==Example
92
+ # s="<foo a=b c=d e=f>Hello</foo>"
93
+ # XML.strip_attributes(s) => "<foo>Hello</foo>"
94
+
95
+ def XML.strip_attributes(xml_text)
96
+ return xml_text.gsub(/<(\/?\w+).*?>/im){"<#{$1}>"} # delete attributes
97
+ end
98
+
99
+
100
+ # Strip out all comments from the xml text.
101
+ #
102
+ # ==Example
103
+ # s="Hello<!--comment-->World"
104
+ # XML.strip_comments(s) => "HelloWorld"
105
+
106
+ def XML.strip_comments(xml_text)
107
+ return xml_text.gsub(/<!.*?>/im,'')
108
+ end
109
+
110
+
111
+ # Strip out all microsoft proprietary codes.
112
+ #
113
+ # ==Example
114
+ # s="Hello<!-[if foo]>Microsoft<![endif]->World"
115
+ # XML.strip_microsoft(s) => "HelloWorld"
116
+
117
+ def XML.strip_microsoft(xml_text)
118
+ return xml_text.gsub(/<!-*\[if\b.*?<!\[endif\]-*>/im,'')
119
+ end
120
+
121
+
122
+ # Strip out all unprintable characters from the input string.
123
+ #
124
+ # ==Example
125
+ # s="Hello\XXXWorld" # where XXX is unprintable
126
+ # XML.strip_unprintables(s) => "HelloWorld"
127
+
128
+ def XML.strip_unprintables(xml_text)
129
+ return xml_text.gsub(/[^[:print:]]/, "")
130
+ end
131
+
69
132
  end
70
133
 
71
134
 
data/lib/webget_ramp.rb CHANGED
@@ -176,6 +176,11 @@ Extensions that help debug Ruby programs.
176
176
  == XML
177
177
 
178
178
  * (class) load_dir: specify one or more directory patterns and pass each XML file in the matching directories to a block; see [Dir#glob](http://www.ruby-doc.org/core/classes/Dir.html#M002347) for pattern details.
179
+ * (class) strip_all: delete extraneous junk from an XML text string, typically for sanitizing input
180
+ * (class) strip_attributes: delete all attributes from an XML text string
181
+ * (class) strip_comments: delete all comments from an XML text string
182
+ * (class) strip_microsoft: delete all proprietary Microsoft code from an XML text string
183
+ * (class) strip_unprintables: delete all unprintable characters from an XML text string
179
184
 
180
185
 
181
186
  == YAML
@@ -185,7 +190,8 @@ Extensions that help debug Ruby programs.
185
190
 
186
191
  == Changes
187
192
 
188
- - 1.7.1.1 Update gems: Gemcutter, Ruby 1.9.1, JRuby sqlite3
193
+ - 1.7.1.3 Add XML#strip_xxx
194
+ - 1.7.1.2 Update gems: Gemcutter, Ruby 1.9.1, JRuby sqlite3
189
195
  - 1.7.1.0 Add XML attributes methods #
190
196
  - 1.7.0.9 Add Enumerable #hash_by, #index_by
191
197
  - 1.7.0.7 Add Array#to_tsv, String#split_tsv, improve Array#to_csv
@@ -46,5 +46,48 @@ class XMLTest < Test::Unit::TestCase
46
46
  assert_equal(expect,actual)
47
47
  end
48
48
 
49
+ def test_strip_all
50
+ s="<foo a=b c=d><!--comment-->Hello<!-[if bar]>Microsoft<![endif]>World</foo>"
51
+ expect="<foo>HelloWorld</foo>"
52
+ actual=XML.strip_all(s)
53
+ assert_equal(expect,actual)
54
+ end
55
+
56
+ def strip_attributes
57
+ s="<foo a=b c=d e=f>Hello</foo>"
58
+ expect="<foo>Hello</foo>"
59
+ actual=XML.strip_attributes(s)
60
+ assert_equal(expect,actual)
61
+ end
62
+
63
+ def test_strip_comments
64
+ s="Hello<!--comment-->World"
65
+ expect="HelloWorld"
66
+ actual=XML.strip_comments(s)
67
+ assert_equal(expect,actual)
68
+ end
69
+
70
+ def test_strip_microsoft
71
+ s="Hello<!-[if foo]>Microsoft<![endif]->World"
72
+ expect="HelloWorld"
73
+ actual=XML.strip_microsoft(s)
74
+ assert_equal(expect,actual)
75
+ end
76
+
77
+ def test_strip_unprintables
78
+ s="HelloWorld" #TODO create test that has unprintables
79
+ expect="HelloWorld"
80
+ actual=XML.strip_unprintables(s)
81
+ assert_equal(expect,actual)
82
+ end
83
+
84
+ def test_strip_msword
85
+ clean=File.open(File.join(MYDIR,"xml_test_msword_clean.html"),"rb")
86
+ dirty=File.open(File.join(MYDIR,"xml_test_msword_dirty.html"),"rb")
87
+ expect=clean.read
88
+ actual=XML.strip_all(dirty.read)
89
+ assert_equal(expect,actual)
90
+ end
91
+
49
92
  end
50
93
 
@@ -0,0 +1 @@
1
+ <html><head><meta><meta><meta><meta><meta><meta><link><title>Foo</title><style></style></head><body><div><table> <tr> <td> <p>Foo</p> </td> <td> <p>HTML</p> </td> <td> <p>af<b>s</b></p> </td> </tr> <tr> <td> <p><o></o></p> </td> <td> <h1>Bold</h1> </td> <td> <p><o></o></p> </td> </tr> <tr> <td> <p><o></o></p> </td> <td> <p>Fas785932517</p> </td> <td> <p>asf</p> </td> </tr> <tr> <td> <p><o></o></p> </td> <td> <p><o></o></p> </td> <td> <p><o></o></p> </td> </tr></table><p><o></o></p></div></body></html>
@@ -0,0 +1,148 @@
1
+ <html xmlns:o="urn:schemas-microsoft-com:office:office"
2
+ xmlns:w="urn:schemas-microsoft-com:office:word"
3
+ xmlns="http://www.w3.org/TR/REC-html40">
4
+
5
+ <head>
6
+ <meta name=Title content=Foo>
7
+ <meta name=Keywords content="">
8
+ <meta http-equiv=Content-Type content="text/html; charset=macintosh">
9
+ <meta name=ProgId content=Word.Document>
10
+ <meta name=Generator content="Microsoft Word 10">
11
+ <meta name=Originator content="Microsoft Word 10">
12
+ <link rel=File-List href="Foo_files/filelist.xml">
13
+ <title>Foo</title>
14
+ <!--[if gte mso 9]><xml>
15
+ <o:DocumentProperties>
16
+ <o:Template>Normal</o:Template>
17
+ <o:LastAuthor>Staff</o:LastAuthor>
18
+ <o:Revision>1</o:Revision>
19
+ <o:Created>2009-10-22T23:51:00Z</o:Created>
20
+ <o:LastSaved>2009-10-22T23:53:00Z</o:LastSaved>
21
+ <o:Pages>1</o:Pages>
22
+ <o:Company>WestEd</o:Company>
23
+ <o:Lines>1</o:Lines>
24
+ <o:Paragraphs>1</o:Paragraphs>
25
+ <o:Version>10.262</o:Version>
26
+ </o:DocumentProperties>
27
+ </xml><![endif]--><!--[if gte mso 9]><xml>
28
+ <w:WordDocument>
29
+ <w:DisplayHorizontalDrawingGridEvery>0</w:DisplayHorizontalDrawingGridEvery>
30
+ <w:DisplayVerticalDrawingGridEvery>0</w:DisplayVerticalDrawingGridEvery>
31
+ <w:UseMarginsForDrawingGridOrigin/>
32
+ </w:WordDocument>
33
+ </xml><![endif]-->
34
+ <style>
35
+ <!--
36
+ /* Style Definitions */
37
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
38
+ {mso-style-parent:"";
39
+ margin:0in;
40
+ margin-bottom:.0001pt;
41
+ mso-pagination:widow-orphan;
42
+ font-size:12.0pt;
43
+ font-family:Times;}
44
+ h1
45
+ {mso-style-next:Normal;
46
+ margin:0in;
47
+ margin-bottom:.0001pt;
48
+ mso-pagination:widow-orphan;
49
+ page-break-after:avoid;
50
+ mso-outline-level:1;
51
+ font-size:12.0pt;
52
+ font-family:Times;
53
+ mso-font-kerning:0pt;}
54
+ @page Section1
55
+ {size:8.5in 11.0in;
56
+ margin:1.0in 1.25in 1.0in 1.25in;
57
+ mso-header-margin:.5in;
58
+ mso-footer-margin:.5in;
59
+ mso-paper-source:0;}
60
+ div.Section1
61
+ {page:Section1;}
62
+ -->
63
+ </style>
64
+ </head>
65
+
66
+ <body bgcolor=white lang=EN-US style='tab-interval:.5in'>
67
+
68
+ <div class=Section1>
69
+
70
+ <table border=1 cellspacing=0 cellpadding=0 style='border-collapse:collapse;
71
+ border:none;mso-border-alt:solid windowtext .5pt;mso-padding-alt:0in 5.4pt 0in 5.4pt'>
72
+ <tr>
73
+ <td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
74
+ padding:0in 5.4pt 0in 5.4pt'>
75
+ <p class=MsoNormal>Foo</p>
76
+ </td>
77
+ <td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
78
+ border-left:none;mso-border-left-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
79
+ <p class=MsoNormal>HTML</p>
80
+ </td>
81
+ <td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
82
+ border-left:none;mso-border-left-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
83
+ <p class=MsoNormal>af<b>s</b></p>
84
+ </td>
85
+ </tr>
86
+ <tr>
87
+ <td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
88
+ border-top:none;mso-border-top-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
89
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
90
+ </td>
91
+ <td width=148 valign=top style='width:2.05in;border-top:none;border-left:
92
+ none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
93
+ mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
94
+ padding:0in 5.4pt 0in 5.4pt'>
95
+ <h1>Bold</h1>
96
+ </td>
97
+ <td width=148 valign=top style='width:2.05in;border-top:none;border-left:
98
+ none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
99
+ mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
100
+ padding:0in 5.4pt 0in 5.4pt'>
101
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
102
+ </td>
103
+ </tr>
104
+ <tr>
105
+ <td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
106
+ border-top:none;mso-border-top-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
107
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
108
+ </td>
109
+ <td width=148 valign=top style='width:2.05in;border-top:none;border-left:
110
+ none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
111
+ mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
112
+ padding:0in 5.4pt 0in 5.4pt'>
113
+ <p class=MsoNormal>Fas785932517</p>
114
+ </td>
115
+ <td width=148 valign=top style='width:2.05in;border-top:none;border-left:
116
+ none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
117
+ mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
118
+ padding:0in 5.4pt 0in 5.4pt'>
119
+ <p class=MsoNormal>�asf�</p>
120
+ </td>
121
+ </tr>
122
+ <tr>
123
+ <td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
124
+ border-top:none;mso-border-top-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
125
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
126
+ </td>
127
+ <td width=148 valign=top style='width:2.05in;border-top:none;border-left:
128
+ none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
129
+ mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
130
+ padding:0in 5.4pt 0in 5.4pt'>
131
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
132
+ </td>
133
+ <td width=148 valign=top style='width:2.05in;border-top:none;border-left:
134
+ none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
135
+ mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
136
+ padding:0in 5.4pt 0in 5.4pt'>
137
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
138
+ </td>
139
+ </tr>
140
+ </table>
141
+
142
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
143
+
144
+ </div>
145
+
146
+ </body>
147
+
148
+ </html>
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget_ramp
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.7.1.2
4
+ version: 1.7.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - WebGet
@@ -32,7 +32,7 @@ cert_chain:
32
32
  DXnLFY0cVuBnNDMOOFl8vk1qIcZjcTovhzgcixpG6Uk5qmUsKHRLQf4oQJx7TfLK
33
33
  -----END CERTIFICATE-----
34
34
 
35
- date: 2009-10-27 00:00:00 -07:00
35
+ date: 2009-10-29 00:00:00 -07:00
36
36
  default_executable:
37
37
  dependencies: []
38
38
 
@@ -68,6 +68,8 @@ files:
68
68
  - lib/webget_ramp/xml.rb
69
69
  - lib/webget_ramp/yaml.rb
70
70
  - test/webget_ramp/io_test.txt
71
+ - test/webget_ramp/xml_test_msword_clean.html
72
+ - test/webget_ramp/xml_test_msword_dirty.html
71
73
  - test/webget_ramp/xml_test_1.xml
72
74
  - test/webget_ramp/xml_test_2.xml
73
75
  - test/webget_ramp/yaml_test_1.yml
metadata.gz.sig CHANGED
Binary file