webget_ramp 1.7.1.2 → 1.7.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/webget_ramp/xml.rb +63 -0
- data/lib/webget_ramp.rb +7 -1
- data/test/webget_ramp/xml_test.rb +43 -0
- data/test/webget_ramp/xml_test_msword_clean.html +1 -0
- data/test/webget_ramp/xml_test_msword_dirty.html +148 -0
- data.tar.gz.sig +0 -0
- metadata +4 -2
- metadata.gz.sig +0 -0
data/lib/webget_ramp/xml.rb
CHANGED
@@ -66,6 +66,69 @@ module XML
|
|
66
66
|
}
|
67
67
|
end
|
68
68
|
|
69
|
+
|
70
|
+
# Sanitize dirty XML text so that an XML parser can handle it.
#
# The text is passed through each of the strip methods, in this order:
# - XML.strip_unprintables
# - XML.strip_microsoft
# - XML.strip_comments
# - XML.strip_attributes
#
# ==Example
#   s="<foo a=b c=d><!--comment-->Hello<!-[if bar]>Microsoft<![endif]>World</foo>"
#   XML.strip_all(s) => "<foo>HelloWorld</foo>"
def XML.strip_all(xml_text)
  printable = XML.strip_unprintables(xml_text)
  without_microsoft = XML.strip_microsoft(printable)
  without_comments = XML.strip_comments(without_microsoft)
  XML.strip_attributes(without_comments)
end
|
87
|
+
|
88
|
+
|
89
|
+
# Reduce every tag in the xml text to just its leading name, which
# removes all of the tag's attributes.
#
# Only the leading word characters of the tag name (plus the optional
# closing slash in front of it) are kept; everything else up to the
# next ">" is discarded. As a consequence a namespaced tag such as
# "<o:p>" comes back as "<o>", and the trailing slash of a
# self-closing tag is dropped as well.
#
# ==Example
#   s="<foo a=b c=d e=f>Hello</foo>"
#   XML.strip_attributes(s) => "<foo>Hello</foo>"
def XML.strip_attributes(xml_text)
  xml_text.gsub(/<(\/?\w+).*?>/im) do
    "<#{Regexp.last_match(1)}>"
  end
end
|
98
|
+
|
99
|
+
|
100
|
+
# Strip out all comments from the xml text.
#
# A well-formed comment is removed as a whole, from "<!--" up to the
# first "-->", even when the comment body contains ">" characters
# (for example commented-out markup). Any other "<!...>" construct,
# such as a DOCTYPE declaration or a stray conditional marker, is
# removed up to its first ">".
#
# ==Example
#   s="Hello<!--comment-->World"
#   XML.strip_comments(s) => "HelloWorld"
#
#   s="Hello<!-- <b>hidden</b> -->World"
#   XML.strip_comments(s) => "HelloWorld"
def XML.strip_comments(xml_text)
  # The comment alternative must come first: otherwise "<!.*?>" would stop
  # at the first ">" inside the comment and leave the remainder behind.
  return xml_text.gsub(/<!--.*?-->|<!.*?>/m,'')
end
|
109
|
+
|
110
|
+
|
111
|
+
# Remove Microsoft conditional sections from the xml text.
#
# A section starts at "<!", any number of dashes, and "[if", and runs
# through the nearest following "<![endif]", any number of dashes,
# and ">". The markers and everything between them are deleted.
#
# ==Example
#   s="Hello<!-[if foo]>Microsoft<![endif]->World"
#   XML.strip_microsoft(s) => "HelloWorld"
def XML.strip_microsoft(xml_text)
  conditional_section = /<!-*\[if\b.*?<!\[endif\]-*>/im
  xml_text.gsub(conditional_section, '')
end
|
120
|
+
|
121
|
+
|
122
|
+
# Delete every character that is not in the POSIX "print" class
# from the input string; control characters such as line breaks
# and tabs are removed along with any other unprintables.
#
# ==Example
#   s="Hello\XXXWorld"  # where XXX is unprintable
#   XML.strip_unprintables(s) => "HelloWorld"
def XML.strip_unprintables(xml_text)
  unprintable = /[^[:print:]]/
  xml_text.gsub(unprintable, "")
end
|
131
|
+
|
69
132
|
end
|
70
133
|
|
71
134
|
|
data/lib/webget_ramp.rb
CHANGED
@@ -176,6 +176,11 @@ Extensions that help debug Ruby programs.
|
|
176
176
|
== XML
|
177
177
|
|
178
178
|
* (class) load_dir: specify a one or more directory patterns and pass each XML file in the matching directories to a block; see [Dir#glob](http://www.ruby-doc.org/core/classes/Dir.html#M002347) for pattern details.
|
179
|
+
* (class) strip_all: delete extraneous junk from an XML text string, typically for sanitizing input
|
180
|
+
* (class) strip_attributes: delete all attributes from an XML text string
|
181
|
+
* (class) strip_comments: delete all comments from an XML text string
|
182
|
+
* (class) strip_microsoft: delete all proprietary Microsoft code from an XML text string
|
183
|
+
* (class) strip_unprintables: delete all unprintable characters from an XML text string
|
179
184
|
|
180
185
|
|
181
186
|
== YAML
|
@@ -185,7 +190,8 @@ Extensions that help debug Ruby programs.
|
|
185
190
|
|
186
191
|
== Changes
|
187
192
|
|
188
|
-
- 1.7.1.
|
193
|
+
- 1.7.1.3 Add XML#strip_xxx
|
194
|
+
- 1.7.1.2 Update gems: Gemcutter, Ruby 1.9.1, JRuby sqlite3
|
189
195
|
- 1.7.1.0 Add XML attributes methods #
|
190
196
|
- 1.7.0.9 Add Enumerable #hash_by, #index_by
|
191
197
|
- 1.7.0.7 Add Array#to_tsv, String#split_tsv, improve Array#to_csv
|
@@ -46,5 +46,48 @@ class XMLTest < Test::Unit::TestCase
|
|
46
46
|
assert_equal(expect,actual)
|
47
47
|
end
|
48
48
|
|
49
|
+
# XML.strip_all must remove attributes, comments, and Microsoft
# conditional sections from a single dirty string.
def test_strip_all
  dirty = "<foo a=b c=d><!--comment-->Hello<!-[if bar]>Microsoft<![endif]>World</foo>"
  assert_equal("<foo>HelloWorld</foo>", XML.strip_all(dirty))
end
|
55
|
+
|
56
|
+
# XML.strip_attributes must leave only the bare tag names.
#
# The method name needs the "test_" prefix; without it Test::Unit does
# not pick the method up and the assertion is never executed.
def test_strip_attributes
  s="<foo a=b c=d e=f>Hello</foo>"
  expect="<foo>Hello</foo>"
  actual=XML.strip_attributes(s)
  assert_equal(expect,actual)
end
|
62
|
+
|
63
|
+
# XML.strip_comments must delete a comment and keep the text around it.
def test_strip_comments
  commented = "Hello<!--comment-->World"
  assert_equal("HelloWorld", XML.strip_comments(commented))
end
|
69
|
+
|
70
|
+
# XML.strip_microsoft must delete a conditional section, markers included.
def test_strip_microsoft
  conditional = "Hello<!-[if foo]>Microsoft<![endif]->World"
  assert_equal("HelloWorld", XML.strip_microsoft(conditional))
end
|
76
|
+
|
77
|
+
# XML.strip_unprintables must delete control characters and leave
# printable text alone.
def test_strip_unprintables
  # \000 (NUL), \007 (BEL), \033 (ESC), newline and tab are all outside
  # the POSIX "print" class, so each of them must be removed.
  s="Hello\000\007\033\n\tWorld"
  expect="HelloWorld"
  actual=XML.strip_unprintables(s)
  assert_equal(expect,actual)

  # Text that is already printable must pass through unchanged.
  assert_equal("Hello World", XML.strip_unprintables("Hello World"))
end
|
83
|
+
|
84
|
+
# XML.strip_all applied to the dirty Microsoft Word fixture must
# reproduce the clean fixture exactly.
def test_strip_msword
  # Block form of File.open closes each handle as soon as it has been
  # read, instead of leaving both files open for the rest of the run.
  expect=File.open(File.join(MYDIR,"xml_test_msword_clean.html"),"rb"){|f| f.read}
  dirty=File.open(File.join(MYDIR,"xml_test_msword_dirty.html"),"rb"){|f| f.read}
  actual=XML.strip_all(dirty)
  assert_equal(expect,actual)
end
|
91
|
+
|
49
92
|
end
|
50
93
|
|
@@ -0,0 +1 @@
|
|
1
|
+
<html><head><meta><meta><meta><meta><meta><meta><link><title>Foo</title><style></style></head><body><div><table> <tr> <td> <p>Foo</p> </td> <td> <p>HTML</p> </td> <td> <p>af<b>s</b></p> </td> </tr> <tr> <td> <p><o></o></p> </td> <td> <h1>Bold</h1> </td> <td> <p><o></o></p> </td> </tr> <tr> <td> <p><o></o></p> </td> <td> <p>Fas785932517</p> </td> <td> <p>asf</p> </td> </tr> <tr> <td> <p><o></o></p> </td> <td> <p><o></o></p> </td> <td> <p><o></o></p> </td> </tr></table><p><o></o></p></div></body></html>
|
@@ -0,0 +1,148 @@
|
|
1
|
+
<html xmlns:o="urn:schemas-microsoft-com:office:office"
|
2
|
+
xmlns:w="urn:schemas-microsoft-com:office:word"
|
3
|
+
xmlns="http://www.w3.org/TR/REC-html40">
|
4
|
+
|
5
|
+
<head>
|
6
|
+
<meta name=Title content=Foo>
|
7
|
+
<meta name=Keywords content="">
|
8
|
+
<meta http-equiv=Content-Type content="text/html; charset=macintosh">
|
9
|
+
<meta name=ProgId content=Word.Document>
|
10
|
+
<meta name=Generator content="Microsoft Word 10">
|
11
|
+
<meta name=Originator content="Microsoft Word 10">
|
12
|
+
<link rel=File-List href="Foo_files/filelist.xml">
|
13
|
+
<title>Foo</title>
|
14
|
+
<!--[if gte mso 9]><xml>
|
15
|
+
<o:DocumentProperties>
|
16
|
+
<o:Template>Normal</o:Template>
|
17
|
+
<o:LastAuthor>Staff</o:LastAuthor>
|
18
|
+
<o:Revision>1</o:Revision>
|
19
|
+
<o:Created>2009-10-22T23:51:00Z</o:Created>
|
20
|
+
<o:LastSaved>2009-10-22T23:53:00Z</o:LastSaved>
|
21
|
+
<o:Pages>1</o:Pages>
|
22
|
+
<o:Company>WestEd</o:Company>
|
23
|
+
<o:Lines>1</o:Lines>
|
24
|
+
<o:Paragraphs>1</o:Paragraphs>
|
25
|
+
<o:Version>10.262</o:Version>
|
26
|
+
</o:DocumentProperties>
|
27
|
+
</xml><![endif]--><!--[if gte mso 9]><xml>
|
28
|
+
<w:WordDocument>
|
29
|
+
<w:DisplayHorizontalDrawingGridEvery>0</w:DisplayHorizontalDrawingGridEvery>
|
30
|
+
<w:DisplayVerticalDrawingGridEvery>0</w:DisplayVerticalDrawingGridEvery>
|
31
|
+
<w:UseMarginsForDrawingGridOrigin/>
|
32
|
+
</w:WordDocument>
|
33
|
+
</xml><![endif]-->
|
34
|
+
<style>
|
35
|
+
<!--
|
36
|
+
/* Style Definitions */
|
37
|
+
p.MsoNormal, li.MsoNormal, div.MsoNormal
|
38
|
+
{mso-style-parent:"";
|
39
|
+
margin:0in;
|
40
|
+
margin-bottom:.0001pt;
|
41
|
+
mso-pagination:widow-orphan;
|
42
|
+
font-size:12.0pt;
|
43
|
+
font-family:Times;}
|
44
|
+
h1
|
45
|
+
{mso-style-next:Normal;
|
46
|
+
margin:0in;
|
47
|
+
margin-bottom:.0001pt;
|
48
|
+
mso-pagination:widow-orphan;
|
49
|
+
page-break-after:avoid;
|
50
|
+
mso-outline-level:1;
|
51
|
+
font-size:12.0pt;
|
52
|
+
font-family:Times;
|
53
|
+
mso-font-kerning:0pt;}
|
54
|
+
@page Section1
|
55
|
+
{size:8.5in 11.0in;
|
56
|
+
margin:1.0in 1.25in 1.0in 1.25in;
|
57
|
+
mso-header-margin:.5in;
|
58
|
+
mso-footer-margin:.5in;
|
59
|
+
mso-paper-source:0;}
|
60
|
+
div.Section1
|
61
|
+
{page:Section1;}
|
62
|
+
-->
|
63
|
+
</style>
|
64
|
+
</head>
|
65
|
+
|
66
|
+
<body bgcolor=white lang=EN-US style='tab-interval:.5in'>
|
67
|
+
|
68
|
+
<div class=Section1>
|
69
|
+
|
70
|
+
<table border=1 cellspacing=0 cellpadding=0 style='border-collapse:collapse;
|
71
|
+
border:none;mso-border-alt:solid windowtext .5pt;mso-padding-alt:0in 5.4pt 0in 5.4pt'>
|
72
|
+
<tr>
|
73
|
+
<td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
|
74
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
75
|
+
<p class=MsoNormal>Foo</p>
|
76
|
+
</td>
|
77
|
+
<td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
|
78
|
+
border-left:none;mso-border-left-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
|
79
|
+
<p class=MsoNormal>HTML</p>
|
80
|
+
</td>
|
81
|
+
<td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
|
82
|
+
border-left:none;mso-border-left-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
|
83
|
+
<p class=MsoNormal>af<b>s</b></p>
|
84
|
+
</td>
|
85
|
+
</tr>
|
86
|
+
<tr>
|
87
|
+
<td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
|
88
|
+
border-top:none;mso-border-top-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
|
89
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
90
|
+
</td>
|
91
|
+
<td width=148 valign=top style='width:2.05in;border-top:none;border-left:
|
92
|
+
none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
|
93
|
+
mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
|
94
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
95
|
+
<h1>Bold</h1>
|
96
|
+
</td>
|
97
|
+
<td width=148 valign=top style='width:2.05in;border-top:none;border-left:
|
98
|
+
none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
|
99
|
+
mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
|
100
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
101
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
102
|
+
</td>
|
103
|
+
</tr>
|
104
|
+
<tr>
|
105
|
+
<td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
|
106
|
+
border-top:none;mso-border-top-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
|
107
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
108
|
+
</td>
|
109
|
+
<td width=148 valign=top style='width:2.05in;border-top:none;border-left:
|
110
|
+
none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
|
111
|
+
mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
|
112
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
113
|
+
<p class=MsoNormal>Fas785932517</p>
|
114
|
+
</td>
|
115
|
+
<td width=148 valign=top style='width:2.05in;border-top:none;border-left:
|
116
|
+
none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
|
117
|
+
mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
|
118
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
119
|
+
<p class=MsoNormal>�asf�</p>
|
120
|
+
</td>
|
121
|
+
</tr>
|
122
|
+
<tr>
|
123
|
+
<td width=148 valign=top style='width:2.05in;border:solid windowtext .5pt;
|
124
|
+
border-top:none;mso-border-top-alt:solid windowtext .5pt;padding:0in 5.4pt 0in 5.4pt'>
|
125
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
126
|
+
</td>
|
127
|
+
<td width=148 valign=top style='width:2.05in;border-top:none;border-left:
|
128
|
+
none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
|
129
|
+
mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
|
130
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
131
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
132
|
+
</td>
|
133
|
+
<td width=148 valign=top style='width:2.05in;border-top:none;border-left:
|
134
|
+
none;border-bottom:solid windowtext .5pt;border-right:solid windowtext .5pt;
|
135
|
+
mso-border-top-alt:solid windowtext .5pt;mso-border-left-alt:solid windowtext .5pt;
|
136
|
+
padding:0in 5.4pt 0in 5.4pt'>
|
137
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
138
|
+
</td>
|
139
|
+
</tr>
|
140
|
+
</table>
|
141
|
+
|
142
|
+
<p class=MsoNormal><![if !supportEmptyParas]> <![endif]><o:p></o:p></p>
|
143
|
+
|
144
|
+
</div>
|
145
|
+
|
146
|
+
</body>
|
147
|
+
|
148
|
+
</html>
|
data.tar.gz.sig
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: webget_ramp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.7.1.
|
4
|
+
version: 1.7.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- WebGet
|
@@ -32,7 +32,7 @@ cert_chain:
|
|
32
32
|
DXnLFY0cVuBnNDMOOFl8vk1qIcZjcTovhzgcixpG6Uk5qmUsKHRLQf4oQJx7TfLK
|
33
33
|
-----END CERTIFICATE-----
|
34
34
|
|
35
|
-
date: 2009-10-
|
35
|
+
date: 2009-10-29 00:00:00 -07:00
|
36
36
|
default_executable:
|
37
37
|
dependencies: []
|
38
38
|
|
@@ -68,6 +68,8 @@ files:
|
|
68
68
|
- lib/webget_ramp/xml.rb
|
69
69
|
- lib/webget_ramp/yaml.rb
|
70
70
|
- test/webget_ramp/io_test.txt
|
71
|
+
- test/webget_ramp/xml_test_msword_clean.html
|
72
|
+
- test/webget_ramp/xml_test_msword_dirty.html
|
71
73
|
- test/webget_ramp/xml_test_1.xml
|
72
74
|
- test/webget_ramp/xml_test_2.xml
|
73
75
|
- test/webget_ramp/yaml_test_1.yml
|
metadata.gz.sig
CHANGED
Binary file
|