webget_ramp 1.7.1.2 → 1.7.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -66,6 +66,69 @@ module XML
66
66
  }
67
67
  end
68
68
 
69
+
70
+ # Sanitize dirty xml by removing unprintables, bad tags,
71
+ # comments, and generally anything else we might need
72
+ # to enable the XML parser to handle a dirty document.
73
+ #
74
+ # ==Example
75
+ # s="<foo a=b c=d><!--comment-->Hello<!-[if bar]>Microsoft<![endif]>World</foo>"
76
+ # XML.strip_all(s) => "<foo>HelloWorld</foo>"
77
+ #
78
+ # This method calls these in order:
79
+ # - XML.strip_unprintables
80
+ # - XML.strip_microsoft
81
+ # - XML.strip_comments
82
+ # - XML.strip_attributes
83
+
84
+ def XML.strip_all(xml_text)
85
+ return XML.strip_attributes(XML.strip_comments(XML.strip_microsoft(XML.strip_unprintables(xml_text))))
86
+ end
87
+
88
+
89
+ # Strip out all attributes from the xml text's tags.
90
+ #
91
+ # ==Example
92
+ # s="<foo a=b c=d e=f>Hello</foo>"
93
+ # XML.strip_attributes(s) => "<foo>Hello</foo>"
94
+
95
+ def XML.strip_attributes(xml_text)
96
+ return xml_text.gsub(/<(\/?\w+).*?>/im){"<#{$1}>"} # delete attributes
97
+ end
98
+
99
+
100
+ # Strip out all comments from the xml text.
101
+ #
102
+ # ==Example
103
+ # s="Hello<!--comment-->World"
104
+ # XML.strip_comments(s) => "HelloWorld"
105
+
106
+ def XML.strip_comments(xml_text)
107
+ return xml_text.gsub(/<!.*?>/im,'')
108
+ end
109
+
110
+
111
+ # Strip out all microsoft proprietary codes.
112
+ #
113
+ # ==Example
114
+ # s="Hello<!-[if foo]>Microsoft<![endif]->World"
115
+ # XML.strip_microsoft(s) => "HelloWorld"
116
+
117
+ def XML.strip_microsoft(xml_text)
118
+ return xml_text.gsub(/<!-*\[if\b.*?<!\[endif\]-*>/im,'')
119
+ end
120
+
121
+
122
+ # Strip out all unprintable characters from the input string.
123
+ #
124
+ # ==Example
125
+ # s="Hello\XXXWorld" # where XXX is unprintable
126
+ # XML.strip_unprintables(s) => "HelloWorld"
127
+
128
+ def XML.strip_unprintables(xml_text)
129
+ return xml_text.gsub(/[^[:print:]]/, "")
130
+ end
131
+
69
132
  end
70
133
 
71
134
 
data/lib/webget_ramp.rb CHANGED
@@ -176,6 +176,11 @@ Extensions that help debug Ruby programs.
176
176
  == XML
177
177
 
178
178
  * (class) load_dir: specify one or more directory patterns and pass each XML file in the matching directories to a block; see [Dir#glob](http://www.ruby-doc.org/core/classes/Dir.html#M002347) for pattern details.
179
+ * (class) strip_all: delete extraneous junk from an XML text string, typically for sanitizing input
180
+ * (class) strip_attributes: delete all attributes from an XML text string
181
+ * (class) strip_comments: delete all comments from an XML text string
182
+ * (class) strip_microsoft: delete all proprietary Microsoft code from an XML text string
183
+ * (class) strip_unprintables: delete all unprintable characters from an XML text string
179
184
 
180
185
 
181
186
  == YAML
@@ -185,7 +190,8 @@ Extensions that help debug Ruby programs.
185
190
 
186
191
  == Changes
187
192
 
188
- - 1.7.1.1 Update gems: Gemcutter, Ruby 1.9.1, JRuby sqlite3
193
+ - 1.7.1.3 Add XML#strip_xxx
194
+ - 1.7.1.2 Update gems: Gemcutter, Ruby 1.9.1, JRuby sqlite3
189
195
  - 1.7.1.0 Add XML attributes methods #
190
196
  - 1.7.0.9 Add Enumerable #hash_by, #index_by
191
197
  - 1.7.0.7 Add Array#to_tsv, String#split_tsv, improve Array#to_csv
@@ -46,5 +46,48 @@ class XMLTest < Test::Unit::TestCase
46
46
  assert_equal(expect,actual)
47
47
  end
48
48
 
49
+ def test_strip_all
50
+ s="<foo a=b c=d><!--comment-->Hello<!-[if bar]>Microsoft<![endif]>World</foo>"
51
+ expect="<foo>HelloWorld</foo>"
52
+ actual=XML.strip_all(s)
53
+ assert_equal(expect,actual)
54
+ end
55
+
56
+ def strip_attributes
57
+ s="<foo a=b c=d e=f>Hello</foo>"
58
+ expect="<foo>Hello</foo>"
59
+ actual=XML.strip_attributes(s)
60
+ assert_equal(expect,actual)
61
+ end
62
+
63
+ def test_strip_comments
64
+ s="Hello<!--comment-->World"
65
+ expect="HelloWorld"
66
+ actual=XML.strip_comments(s)
67
+ assert_equal(expect,actual)
68
+ end
69
+
70
+ def test_strip_microsoft
71
+ s="Hello<!-[if foo]>Microsoft<![endif]->World"
72
+ expect="HelloWorld"
73
+ actual=XML.strip_microsoft(s)
74
+ assert_equal(expect,actual)
75
+ end
76
+
77
+ def test_strip_unprintables
78
+ s="HelloWorld" #TODO create test that has unprintables
79
+ expect="HelloWorld"
80
+ actual=XML.strip_unprintables(s)
81
+ assert_equal(expect,actual)
82
+ end
83
+
84
+ def test_strip_msword
85
+ clean=File.open(File.join(MYDIR,"xml_test_msword_clean.html"),"rb")
86
+ dirty=File.open(File.join(MYDIR,"xml_test_msword_dirty.html"),"rb")
87
+ expect=clean.read
88
+ actual=XML.strip_all(dirty.read)
89
+ assert_equal(expect,actual)
90
+ end
91
+
49
92
  end
50
93
 
@@ -0,0 +1 @@
1
+ <html><head><meta><meta><meta><meta><meta><meta><link><title>Foo</title><style></style></head><body><div><table> <tr> <td> <p>Foo</p> </td> <td> <p>HTML</p> </td> <td> <p>af<b>s</b></p> </td> </tr> <tr> <td> <p><o></o></p> </td> <td> <h1>Bold</h1> </td> <td> <p><o></o></p> </td> </tr> <tr> <td> <p><o></o></p> </td> <td> <p>Fas785932517</p> </td> <td> <p>asf</p> </td> </tr> <tr> <td> <p><o></o></p> </td> <td> <p><o></o></p> </td> <td> <p><o></o></p> </td> </tr></table><p><o></o></p></div></body></html>
@@ -0,0 +1,148 @@
1
+ <html xmlns:o="urn:schemas-microsoft-com:office:office"
2
+ xmlns:w="urn:schemas-microsoft-com:office:word"
3
+ xmlns="http://www.w3.org/TR/REC-html40">
4
+
5
+ <head>
6
+ <meta name=Title content=Foo>
7
+ <meta name=Keywords content="">
8
+ <meta http-equiv=Content-Type content="text/html; charset=macintosh">
9
+ <meta name=ProgId content=Word.Document>
10
+ <meta name=Generator content="Microsoft Word 10">
11
+ <meta name=Originator content="Microsoft Word 10">
12
+ <link rel=File-List href="Foo_files/filelist.xml">
13
+ <title>Foo</title>
14
+ <!--[if gte mso 9]><xml>
15
+ <o:DocumentProperties>
16
+ <o:Template>Normal</o:Template>
17
+ <o:LastAuthor>Staff</o:LastAuthor>
18
+ <o:Revision>1</o:Revision>
19
+ <o:Created>2009-10-22T23:51:00Z</o:Created>
20
+ <o:LastSaved>2009-10-22T23:53:00Z</o:LastSaved>
21
+ <o:Pages>1</o:Pages>
22
+ <o:Company>WestEd</o:Company>
23
+ <o:Lines>1</o:Lines>
24
+ <o:Paragraphs>1</o:Paragraphs>
25
+ <o:Version>10.262</o:Version>
26
+ </o:DocumentProperties>
27
+ </xml><![endif]--><!--[if gte mso 9]><xml>
28
+ <w:WordDocument>
29
+ <w:DisplayHorizontalDrawingGridEvery>0</w:DisplayHorizontalDrawingGridEvery>
30
+ <w:DisplayVerticalDrawingGridEvery>0</w:DisplayVerticalDrawingGridEvery>
31
+ <w:UseMarginsForDrawingGridOrigin/>
32
+ </w:WordDocument>
33
+ </xml><![endif]-->
34
+ <style>
35
+ <!--
36
+ /* Style Definitions */
37
+ p.MsoNormal, li.MsoNormal, div.MsoNormal
38
+ {mso-style-parent:"";
39
+ margin:0in;
40
+ margin-bottom:.0001pt;
41
+ mso-pagination:widow-orphan;
42
+ font-size:12.0pt;
43
+ font-family:Times;}
44
+ h1
45
+ {mso-style-next:Normal;
46
+ margin:0in;
47
+ margin-bottom:.0001pt;
48
+ mso-pagination:widow-orphan;
49
+ page-break-after:avoid;
50
+ mso-outline-level:1;
51
+ font-size:12.0pt;
52
+ font-family:Times;
53
+ mso-font-kerning:0pt;}
54
+ @page Section1
55
+ {size:8.5in 11.0in;
56
+ margin:1.0in 1.25in 1.0in 1.25in;
57
+ mso-header-margin:.5in;
58
+ mso-footer-margin:.5in;
59
+ mso-paper-source:0;}
60
+ div.Section1
61
+ {page:Section1;}
62
+ -->
63
+ </style>
64
+ </head>
65
+
66
+ <body bgcolor=white lang=EN-US style='tab-interval:.5in'>
67
+
68
+ <div class=Section1>
69
+
70
+ <table border=1 cellspacing=0 cellpadding=0 style='border-collapse:collapse;
71
+ border:none;mso-border-alt:solid windowtext .5pt;mso-padding-alt:0in 5.4pt 0in 5.4pt'>
72
+ <tr>
73
+ <td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
74
+ padding:0in 5.4pt 0in 5.4pt'>
75
+ <p class=MsoNormal>Foo</p>
76
+ </td>
77
+ <td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
78
+ border-left:none;mso-border-left-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
79
+ <p class=MsoNormal>HTML</p>
80
+ </td>
81
+ <td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
82
+ border-left:none;mso-border-left-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
83
+ <p class=MsoNormal>af<b>s</b></p>
84
+ </td>
85
+ </tr>
86
+ <tr>
87
+ <td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
88
+ border-top:none;mso-border-top-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
89
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
90
+ </td>
91
+ <td width=148 valign=top style='width:2.05in;border-top:none;border-left:
92
+ none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
93
+ mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
94
+ padding:0in 5.4pt 0in 5.4pt'>
95
+ <h1>Bold</h1>
96
+ </td>
97
+ <td width=148 valign=top style='width:2.05in;border-top:none;border-left:
98
+ none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
99
+ mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
100
+ padding:0in 5.4pt 0in 5.4pt'>
101
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
102
+ </td>
103
+ </tr>
104
+ <tr>
105
+ <td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
106
+ border-top:none;mso-border-top-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
107
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
108
+ </td>
109
+ <td width=148 valign=top style='width:2.05in;border-top:none;border-left:
110
+ none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
111
+ mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
112
+ padding:0in 5.4pt 0in 5.4pt'>
113
+ <p class=MsoNormal>Fas785932517</p>
114
+ </td>
115
+ <td width=148 valign=top style='width:2.05in;border-top:none;border-left:
116
+ none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
117
+ mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
118
+ padding:0in 5.4pt 0in 5.4pt'>
119
+ <p class=MsoNormal>�asf�</p>
120
+ </td>
121
+ </tr>
122
+ <tr>
123
+ <td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
124
+ border-top:none;mso-border-top-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
125
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
126
+ </td>
127
+ <td width=148 valign=top style='width:2.05in;border-top:none;border-left:
128
+ none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
129
+ mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
130
+ padding:0in 5.4pt 0in 5.4pt'>
131
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
132
+ </td>
133
+ <td width=148 valign=top style='width:2.05in;border-top:none;border-left:
134
+ none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
135
+ mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
136
+ padding:0in 5.4pt 0in 5.4pt'>
137
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
138
+ </td>
139
+ </tr>
140
+ </table>
141
+
142
+ <p class=MsoNormal><![if !supportEmptyParas]>&nbsp;<![endif]><o:p></o:p></p>
143
+
144
+ </div>
145
+
146
+ </body>
147
+
148
+ </html>
data.tar.gz.sig CHANGED
Binary file
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: webget_ramp
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.7.1.2
4
+ version: 1.7.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - WebGet
@@ -32,7 +32,7 @@ cert_chain:
32
32
  DXnLFY0cVuBnNDMOOFl8vk1qIcZjcTovhzgcixpG6Uk5qmUsKHRLQf4oQJx7TfLK
33
33
  -----END CERTIFICATE-----
34
34
 
35
- date: 2009-10-27 00:00:00 -07:00
35
+ date: 2009-10-29 00:00:00 -07:00
36
36
  default_executable:
37
37
  dependencies: []
38
38
 
@@ -68,6 +68,8 @@ files:
68
68
  - lib/webget_ramp/xml.rb
69
69
  - lib/webget_ramp/yaml.rb
70
70
  - test/webget_ramp/io_test.txt
71
+ - test/webget_ramp/xml_test_msword_clean.html
72
+ - test/webget_ramp/xml_test_msword_dirty.html
71
73
  - test/webget_ramp/xml_test_1.xml
72
74
  - test/webget_ramp/xml_test_2.xml
73
75
  - test/webget_ramp/yaml_test_1.yml
metadata.gz.sig CHANGED
Binary file