webget_ramp 1.7.1.2 → 1.7.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/webget_ramp/xml.rb +63 -0
- data/lib/webget_ramp.rb +7 -1
- data/test/webget_ramp/xml_test.rb +43 -0
- data/test/webget_ramp/xml_test_msword_clean.html +1 -0
- data/test/webget_ramp/xml_test_msword_dirty.html +148 -0
- data.tar.gz.sig +0 -0
- metadata +4 -2
- metadata.gz.sig +0 -0
data/lib/webget_ramp/xml.rb
CHANGED
@@ -66,6 +66,69 @@ module XML
|
|
66
66
|
}
|
67
67
|
end
|
68
68
|
|
69
|
+
|
70
|
+
# Sanitize dirty xml by removing unprintables, bad tags,
|
71
|
+
# comments, and generally anything else we might need
|
72
|
+
# to enable the XML parser to handle a dirty document.
|
73
|
+
#
|
74
|
+
# ==Example
|
75
|
+
# s="<foo a=b c=d><!--comment-->Hello<!-[if bar]>Microsoft<![endif]>World</foo>"
|
76
|
+
# XML.strip_all(s) => "<foo>HelloWorld</foo>"
|
77
|
+
#
|
78
|
+
# This method calls these in order:
|
79
|
+
# - XML.strip_unprintables
|
80
|
+
# - XML.strip_microsoft
|
81
|
+
# - XML.strip_comments
|
82
|
+
# - XML.strip_attributes
|
83
|
+
|
84
|
+
def XML.strip_all(xml_text)
|
85
|
+
return XML.strip_attributes(XML.strip_comments(XML.strip_microsoft(XML.strip_unprintables(xml_text))))
|
86
|
+
end
|
87
|
+
|
88
|
+
|
89
|
+
# Strip out all attributes from the xml text's tags.
|
90
|
+
#
|
91
|
+
# ==Example
|
92
|
+
# s="<foo a=b c=d e=f>Hello</foo>"
|
93
|
+
# XML.strip_attributes(s) => "<foo>Hello</foo>"
|
94
|
+
|
95
|
+
def XML.strip_attributes(xml_text)
|
96
|
+
return xml_text.gsub(/<(\/?\w+).*?>/im){"<#{$1}>"} # delete attributes
|
97
|
+
end
|
98
|
+
|
99
|
+
|
100
|
+
# Strip out all comments from the xml text.
|
101
|
+
#
|
102
|
+
# ==Example
|
103
|
+
# s="Hello<!--comment-->World"
|
104
|
+
# XML.strip_comments(s) => "HelloWorld"
|
105
|
+
|
106
|
+
def XML.strip_comments(xml_text)
|
107
|
+
return xml_text.gsub(/<!.*?>/im,'')
|
108
|
+
end
|
109
|
+
|
110
|
+
|
111
|
+
# Strip out all microsoft proprietary codes.
|
112
|
+
#
|
113
|
+
# ==Example
|
114
|
+
# s="Hello<!-[if foo]>Microsoft<![endif]->World"
|
115
|
+
# XML.strip_microsoft(s) => "HelloWorld"
|
116
|
+
|
117
|
+
def XML.strip_microsoft(xml_text)
|
118
|
+
return xml_text.gsub(/<!-*\[if\b.*?<!\[endif\]-*>/im,'')
|
119
|
+
end
|
120
|
+
|
121
|
+
|
122
|
+
# Strip out all unprintable characters from the input string.
|
123
|
+
#
|
124
|
+
# ==Example
|
125
|
+
# s="Hello\XXXWorld" # where XXX is unprintable
|
126
|
+
# XML.strip_unprintables(s) => "HelloWorld"
|
127
|
+
|
128
|
+
def XML.strip_unprintables(xml_text)
|
129
|
+
return xml_text.gsub(/[^[:print:]]/, "")
|
130
|
+
end
|
131
|
+
|
69
132
|
end
|
70
133
|
|
71
134
|
|
data/lib/webget_ramp.rb
CHANGED
@@ -176,6 +176,11 @@ Extensions that help debug Ruby programs.
|
|
176
176
|
== XML
|
177
177
|
|
178
178
|
* (class) load_dir: specify one or more directory patterns and pass each XML file in the matching directories to a block; see [Dir#glob](http://www.ruby-doc.org/core/classes/Dir.html#M002347) for pattern details.
|
179
|
+
* (class) strip_all: delete extraneous junk from an XML text string, typically for sanitizing input
|
180
|
+
* (class) strip_attributes: delete all attributes from an XML text string
|
181
|
+
* (class) strip_comments: delete all comments from an XML text string
|
182
|
+
* (class) strip_microsoft: delete all proprietary Microsoft code from an XML text string
|
183
|
+
* (class) strip_unprintables: delete all unprintable characters from an XML text string
|
179
184
|
|
180
185
|
|
181
186
|
== YAML
|
@@ -185,7 +190,8 @@ Extensions that help debug Ruby programs.
|
|
185
190
|
|
186
191
|
== Changes
|
187
192
|
|
188
|
-
- 1.7.1.
|
193
|
+
- 1.7.1.3 Add XML#strip_xxx
|
194
|
+
- 1.7.1.2 Update gems: Gemcutter, Ruby 1.9.1, JRuby sqlite3
|
189
195
|
- 1.7.1.0 Add XML attributes methods #
|
190
196
|
- 1.7.0.9 Add Enumerable #hash_by, #index_by
|
191
197
|
- 1.7.0.7 Add Array#to_tsv, String#split_tsv, improve Array#to_csv
|
@@ -46,5 +46,48 @@ class XMLTest < Test::Unit::TestCase
|
|
46
46
|
assert_equal(expect,actual)
|
47
47
|
end
|
48
48
|
|
49
|
+
def test_strip_all
|
50
|
+
s="<foo a=b c=d><!--comment-->Hello<!-[if bar]>Microsoft<![endif]>World</foo>"
|
51
|
+
expect="<foo>HelloWorld</foo>"
|
52
|
+
actual=XML.strip_all(s)
|
53
|
+
assert_equal(expect,actual)
|
54
|
+
end
|
55
|
+
|
56
|
+
def strip_attributes
|
57
|
+
s="<foo a=b c=d e=f>Hello</foo>"
|
58
|
+
expect="<foo>Hello</foo>"
|
59
|
+
actual=XML.strip_attributes(s)
|
60
|
+
assert_equal(expect,actual)
|
61
|
+
end
|
62
|
+
|
63
|
+
def test_strip_comments
|
64
|
+
s="Hello<!--comment-->World"
|
65
|
+
expect="HelloWorld"
|
66
|
+
actual=XML.strip_comments(s)
|
67
|
+
assert_equal(expect,actual)
|
68
|
+
end
|
69
|
+
|
70
|
+
def test_strip_microsoft
|
71
|
+
s="Hello<!-[if foo]>Microsoft<![endif]->World"
|
72
|
+
expect="HelloWorld"
|
73
|
+
actual=XML.strip_microsoft(s)
|
74
|
+
assert_equal(expect,actual)
|
75
|
+
end
|
76
|
+
|
77
|
+
def test_strip_unprintables
|
78
|
+
s="HelloWorld" #TODO create test that has unprintables
|
79
|
+
expect="HelloWorld"
|
80
|
+
actual=XML.strip_unprintables(s)
|
81
|
+
assert_equal(expect,actual)
|
82
|
+
end
|
83
|
+
|
84
|
+
def test_strip_msword
|
85
|
+
clean=File.open(File.join(MYDIR,"xml_test_msword_clean.html"),"rb")
|
86
|
+
dirty=File.open(File.join(MYDIR,"xml_test_msword_dirty.html"),"rb")
|
87
|
+
expect=clean.read
|
88
|
+
actual=XML.strip_all(dirty.read)
|
89
|
+
assert_equal(expect,actual)
|
90
|
+
end
|
91
|
+
|
49
92
|
end
|
50
93
|
|
@@ -0,0 +1 @@
|
|
1
|
+
<html><head><meta><meta><meta><meta><meta><meta><link><title>Foo</title><style></style></head><body><div><table> <tr> <td> <p>Foo</p> </td> <td> <p>HTML</p> </td> <td> <p>af<b>s</b></p> </td> </tr> <tr> <td> <p><o></o></p> </td> <td> <h1>Bold</h1> </td> <td> <p><o></o></p> </td> </tr> <tr> <td> <p><o></o></p> </td> <td> <p>Fas785932517</p> </td> <td> <p>asf</p> </td> </tr> <tr> <td> <p><o></o></p> </td> <td> <p><o></o></p> </td> <td> <p><o></o></p> </td> </tr></table><p><o></o></p></div></body></html>
|
@@ -0,0 +1,148 @@
|
|
1
|
+
<html xmlns:o="urn:schemas-microsoft-com:office:office"
|
2
|
+
xmlns:w="urn:schemas-microsoft-com:office:word"
|
3
|
+
xmlns="http://www.w3.org/TR/REC-html40">
|
4
|
+
|
5
|
+
<head>
|
6
|
+
<meta name=Title content=Foo>
|
7
|
+
<meta name=Keywords content="">
|
8
|
+
<meta http-equiv=Content-Type content="text/html; charset=macintosh">
|
9
|
+
<meta name=ProgId content=Word.Document>
|
10
|
+
<meta name=Generator content="Microsoft Word 10">
|
11
|
+
<meta name=Originator content="Microsoft Word 10">
|
12
|
+
<link rel=File-List href="Foo_files/filelist.xml">
|
13
|
+
<title>Foo</title>
|
14
|
+
<!--[if gte mso 9]><xml>
|
15
|
+
<o:DocumentProperties>
|
16
|
+
<o:Template>Normal</o:Template>
|
17
|
+
<o:LastAuthor>Staff</o:LastAuthor>
|
18
|
+
<o:Revision>1</o:Revision>
|
19
|
+
<o:Created>2009-10-22T23:51:00Z</o:Created>
|
20
|
+
<o:LastSaved>2009-10-22T23:53:00Z</o:LastSaved>
|
21
|
+
<o:Pages>1</o:Pages>
|
22
|
+
<o:Company>WestEd</o:Company>
|
23
|
+
<o:Lines>1</o:Lines>
|
24
|
+
<o:Paragraphs>1</o:Paragraphs>
|
25
|
+
<o:Version>10.262</o:Version>
|
26
|
+
</o:DocumentProperties>
|
27
|
+
</xml><![endif]--><!--[if gte mso 9]><xml>
|
28
|
+
<w:WordDocument>
|
29
|
+
<w:DisplayHorizontalDrawingGridEvery>0</w:DisplayHorizontalDrawingGridEvery>
|
30
|
+
<w:DisplayVerticalDrawingGridEvery>0</w:DisplayVerticalDrawingGridEvery>
|
31
|
+
<w:UseMarginsForDrawingGridOrigin/>
|
32
|
+
</w:WordDocument>
|
33
|
+
</xml><![endif]-->
|
34
|
+
<style>
|
35
|
+
<!--
|
36
|
+
/* Style Definitions */
|
37
|
+
p.MsoNormal, li.MsoNormal, div.MsoNormal
|
38
|
+
{mso-style-parent:"";
|
39
|
+
margin:0in;
|
40
|
+
margin-bottom:.0001pt;
|
41
|
+
mso-pagination:widow-orphan;
|
42
|
+
font-size:12.0pt;
|
43
|
+
font-family:Times;}
|
44
|
+
h1
|
45
|
+
{mso-style-next:Normal;
|
46
|
+
margin:0in;
|
47
|
+
margin-bottom:.0001pt;
|
48
|
+
mso-pagination:widow-orphan;
|
49
|
+
page-break-after:avoid;
|
50
|
+
mso-outline-level:1;
|
51
|
+
font-size:12.0pt;
|
52
|
+
font-family:Times;
|
53
|
+
mso-font-kerning:0pt;}
|
54
|
+
@page Section1
|
55
|
+
{size:8.5in 11.0in;
|
56
|
+
margin:1.0in 1.25in 1.0in 1.25in;
|
57
|
+
mso-header-margin:.5in;
|
58
|
+
mso-footer-margin:.5in;
|
59
|
+
mso-paper-source:0;}
|
60
|
+
div.Section1
|
61
|
+
{page:Section1;}
|
62
|
+
-->
|
63
|
+
</style>
|
64
|
+
</head>
|
65
|
+
|
66
|
+
<body bgcolor=white lang=EN-US style='tab-interval:.5in'>
|
67
|
+
|
68
|
+
<div class=Section1>
|
69
|
+
|
70
|
+
<table border=1 cellspacing=0 cellpadding=0 style='border-collapse:collapse;
|
71
|
+
border:none;mso-border-alt:solid windowtext .5pt;mso-padding-alt:0in 5.4pt 0in 5.4pt'>
|
72
|
+
<tr>
|
73
|
+
<td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
|
74
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
75
|
+
<p class=MsoNormal>Foo</p>
|
76
|
+
</td>
|
77
|
+
<td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
|
78
|
+
border-left:none;mso-border-left-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
|
79
|
+
<p class=MsoNormal>HTML</p>
|
80
|
+
</td>
|
81
|
+
<td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
|
82
|
+
border-left:none;mso-border-left-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
|
83
|
+
<p class=MsoNormal>af<b>s</b></p>
|
84
|
+
</td>
|
85
|
+
</tr>
|
86
|
+
<tr>
|
87
|
+
<td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
|
88
|
+
border-top:none;mso-border-top-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
|
89
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
90
|
+
</td>
|
91
|
+
<td width=148 valign=top style='width:2.05in;border-top:none;border-left:
|
92
|
+
none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
|
93
|
+
mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
|
94
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
95
|
+
<h1>Bold</h1>
|
96
|
+
</td>
|
97
|
+
<td width=148 valign=top style='width:2.05in;border-top:none;border-left:
|
98
|
+
none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
|
99
|
+
mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
|
100
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
101
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
102
|
+
</td>
|
103
|
+
</tr>
|
104
|
+
<tr>
|
105
|
+
<td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
|
106
|
+
border-top:none;mso-border-top-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
|
107
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
108
|
+
</td>
|
109
|
+
<td width=148 valign=top style='width:2.05in;border-top:none;border-left:
|
110
|
+
none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
|
111
|
+
mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
|
112
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
113
|
+
<p class=MsoNormal>Fas785932517</p>
|
114
|
+
</td>
|
115
|
+
<td width=148 valign=top style='width:2.05in;border-top:none;border-left:
|
116
|
+
none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
|
117
|
+
mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
|
118
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
119
|
+
<p class=MsoNormal>�asf�</p>
|
120
|
+
</td>
|
121
|
+
</tr>
|
122
|
+
<tr>
|
123
|
+
<td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
|
124
|
+
border-top:none;mso-border-top-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
|
125
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
126
|
+
</td>
|
127
|
+
<td width=148 valign=top style='width:2.05in;border-top:none;border-left:
|
128
|
+
none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
|
129
|
+
mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
|
130
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
131
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
132
|
+
</td>
|
133
|
+
<td width=148 valign=top style='width:2.05in;border-top:none;border-left:
|
134
|
+
none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
|
135
|
+
mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
|
136
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
137
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
138
|
+
</td>
|
139
|
+
</tr>
|
140
|
+
</table>
|
141
|
+
|
142
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
143
|
+
|
144
|
+
</div>
|
145
|
+
|
146
|
+
</body>
|
147
|
+
|
148
|
+
</html>
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webget_ramp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.7.1.2
|
4
|
+
version: 1.7.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- WebGet
|
@@ -32,7 +32,7 @@ cert_chain:
|
|
32
32
|
DXnLFY0cVuBnNDMOOFl8vk1qIcZjcTovhzgcixpG6Uk5qmUsKHRLQf4oQJx7TfLK
|
33
33
|
-----END CERTIFICATE-----
|
34
34
|
|
35
|
-
date: 2009-10-
|
35
|
+
date: 2009-10-29 00:00:00 -07:00
|
36
36
|
default_executable:
|
37
37
|
dependencies: []
|
38
38
|
|
@@ -68,6 +68,8 @@ files:
|
|
68
68
|
- lib/webget_ramp/xml.rb
|
69
69
|
- lib/webget_ramp/yaml.rb
|
70
70
|
- test/webget_ramp/io_test.txt
|
71
|
+
- test/webget_ramp/xml_test_msword_clean.html
|
72
|
+
- test/webget_ramp/xml_test_msword_dirty.html
|
71
73
|
- test/webget_ramp/xml_test_1.xml
|
72
74
|
- test/webget_ramp/xml_test_2.xml
|
73
75
|
- test/webget_ramp/yaml_test_1.yml
|
metadata.gz.sig
CHANGED
Binary file
|