rfeedparser 0.9.8 → 0.9.9
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rfeedparser.rb +170 -3345
- data/lib/rfeedparser/aliases.rb +432 -0
- data/lib/rfeedparser/better_attributelist.rb +41 -0
- data/lib/rfeedparser/better_sgmlparser.rb +264 -0
- data/lib/rfeedparser/encoding_helpers.rb +257 -0
- data/lib/rfeedparser/feedparserdict.rb +93 -0
- data/lib/rfeedparser/forgiving_uri.rb +93 -0
- data/lib/rfeedparser/markup_helpers.rb +73 -0
- data/lib/rfeedparser/parser_mixin.rb +1235 -0
- data/lib/rfeedparser/parsers.rb +177 -0
- data/lib/rfeedparser/scrub.rb +207 -0
- data/lib/rfeedparser/time_helpers.rb +408 -0
- data/tests/rfeedparsertest.rb +3 -1
- metadata +3271 -3250
@@ -0,0 +1,264 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
|
4
|
+
# Raised by BetterSGMLParser#error when the parser is handed input it cannot
# process. It inherits directly from Exception, so a bare `rescue` (which only
# covers StandardError) does not catch it; rescue it by name.
class BetterSGMLParserError < Exception
end
|
5
|
+
# SGML/HTML tokenizer built on HTML::SGMLParser.
#
# It re-declares the scanning regexps and re-implements the main scan loop
# (goahead) and the parse_* helpers, following the Python SGMLParser /
# feedparser.py patterns. Parsed pieces are reported through the handle_* and
# finish_* callbacks, most of which are expected to come from the superclass
# or from subclasses.
class BetterSGMLParser < HTML::SGMLParser
  # Replaced Tagfind and Charref Regexps with the ones in feedparser.py
  # This makes things work.
  Interesting = /[&<]/u
  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|![^<>]*)?', 64)
  # 64 is the unicode flag

  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
  Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u

  # NOTE(review): both short-tag patterns start with a literal apostrophe, so
  # they cannot match at a "<". That effectively disables the SGML short-tag
  # branch in parse_starttag -- confirm whether this is intentional.
  Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
  Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
  Endtagopen = /<\//u # Matching the Python SGMLParser
  Endbracket = /[<>]/u
  Declopen = /<!/u
  Piopenbegin = /^<\?/u
  Piclose = />/u

  Commentopen = /<!--/u
  Commentclose = /--\s*>/u
  Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
  Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
                            '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
                            64)
  Endtagfind = /\s*\/\s*>/u

  # Simply forwards to the superclass constructor.
  def initialize(verbose=false)
    super(verbose)
  end

  # Simply forwards to the superclass implementation.
  def feed(*args)
    super(*args)
  end

  # Internal -- scan as much of @rawdata as possible, dispatching text, tags,
  # comments, processing instructions, declarations and entity/char references
  # to the corresponding parse_*/handle_* methods.
  #
  # _end:: when true, anything left unparsed at the end of the buffer is
  #        flushed through handle_data instead of being kept.
  #
  # On exit @rawdata holds only the part of the buffer that was not consumed.
  def goahead(_end)
    rawdata = @rawdata # woo, utf-8 magic
    i = 0
    n = rawdata.length
    while i < n
      if @nomoretags
        # handle_data_range does nothing more than set a "Range" that is never used. wtf?
        handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
        i = n
        break
      end

      # Everything up to the next "&" or "<" is plain character data.
      j = rawdata.index(Interesting, i)
      j = n unless j
      handle_data(rawdata[i...j]) if i < j
      i = j
      break if (i == n)

      if rawdata[i..i] == '<'
        # Starttagopen is not declared in this class; presumably inherited
        # from HTML::SGMLParser.
        if rawdata.index(Starttagopen,i) == i
          if @literal
            handle_data(rawdata[i..i])
            i = i+1
            next
          end
          k = parse_starttag(i)
          break unless k # incomplete tag: wait for more data
          i = k
          next
        end
        if rawdata.index(Endtagopen,i) == i
          k = parse_endtag(i)
          break unless k
          i = k
          @literal = false
          next
        end
        if @literal
          if n > (i+1)
            handle_data("<")
            i = i+1
          else
            # incomplete
            break
          end
          next
        end
        if rawdata.index(Commentopen,i) == i
          k = parse_comment(i)
          break unless k
          i = k
          next
        end
        if rawdata.index(Piopenbegin,i) == i # Like Piopen but anchored with ^
          # parse_pi returns a length (not an absolute index), hence "+=".
          k = parse_pi(i)
          break unless k
          i += k
          next
        end
        if rawdata.index(Declopen,i) == i
          # This is some sort of declaration; in "HTML as
          # deployed," this should only be the document type
          # declaration ("<!DOCTYPE html...>").
          k = parse_declaration(i)
          break unless k
          i = k
          next
        end
      elsif rawdata[i..i] == '&'
        if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
          handle_data(rawdata[i..i])
          i += 1
          next
        end

        # the Char must come first as its #=~ method is the only one that is UTF-8 safe
        ni,match = index_match(rawdata, Charref, i)
        if ni and ni == i
          handle_charref(match[1]) # first capture group: the reference body
          i += match[0].length     # skip the whole match ...
          # ... but the pattern also swallowed one terminating character;
          # give it back unless it was the ";" that closes the reference.
          i -= 1 unless rawdata[i-1..i-1] == ";"
          next
        end
        ni,match = index_match(rawdata, Entityref, i)
        if ni and ni == i
          handle_entityref(match[1])
          i += match[0].length
          i -= 1 unless rawdata[i-1..i-1] == ";"
          next
        end
      else
        error('neither < nor & ??')
      end

      # We get here only if incomplete matches but
      # nothing else
      ni,match = index_match(rawdata,Incomplete,i)
      # NOTE(review): this compares against 0, whereas every other check in
      # this loop compares against i. For i > 0 the character is therefore
      # always emitted as data. Left unchanged because callers may rely on the
      # trailing text being emitted immediately -- confirm before changing.
      unless ni and ni == 0
        handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
        i += 1
        next
      end
      j = ni + match[0].length
      break if j == n # Really incomplete
      handle_data(rawdata[i...j])
      i = j
    end # end while

    if _end and i < n
      handle_data(rawdata[i...n])
      i = n
    end

    @rawdata = rawdata[i..-1]
    # @offset += i # FIXME BUGME another unused variable in SGMLParser?
  end


  # Internal -- parse a processing instruction starting at index i.
  # Calls handle_pi with the text between "<?" and the closing ">".
  # Returns the number of characters consumed (a length relative to i),
  # or nil if the instruction is not terminated yet.
  def parse_pi(i)
    rawdata = @rawdata
    if rawdata[i...i+2] != '<?'
      error("unexpected call to parse_pi()")
    end
    ni,match = index_match(rawdata,Piclose,i+2)
    return nil unless match
    j = ni
    handle_pi(rawdata[i+2...j])
    j = (j + match[0].length)
    return j-i
  end

  # Internal -- parse a comment starting at index i.
  # Calls handle_comment with the text between "<!--" and the closing marker.
  # Returns the absolute index just past the closing marker, or nil if the
  # comment is not terminated yet.
  def parse_comment(i)
    rawdata = @rawdata
    if rawdata[i...i+4] != "<!--"
      error("unexpected call to parse_comment()")
    end
    ni,match = index_match(rawdata, Commentclose,i)
    return nil unless match
    handle_comment(rawdata[i+4..(ni-1)])
    return ni+match[0].length
  end


  # Internal -- parse a start tag beginning at index i.
  # Collects [name, value] attribute pairs (names downcased), records the raw
  # tag text in @_starttag_text and hands the tag to finish_starttag.
  # Returns the absolute index just past the tag, or nil if the tag is not
  # complete yet.
  def parse_starttag(i)
    @_starttag_text = nil
    start_pos = i
    rawdata = @rawdata
    ni,match = index_match(rawdata,Shorttagopen,i)
    if ni == i
      # SGML shorthand: <tag/data/ == <tag>data</tag>
      # XXX Can data contain &... (entity or char refs)?
      # XXX Can data contain < or > (tag characters)?
      # XXX Can there be whitespace before the first /?
      # NOTE(review): presumably unreachable while Shorttagopen starts with
      # an apostrophe; this branch is kept as found.
      k,match = index_match(rawdata,Shorttag,i)
      return nil unless match
      tag, data = match[1], match[2]
      @_starttag_text = "<#{tag}/"
      tag.downcase!
      second_end = rawdata.index(Shorttagopen,k)
      finish_shorttag(tag, data)
      @_starttag_text = rawdata[start_pos...second_end+1]
      return k
    end

    j = rawdata.index(Endbracket, i+1)
    return nil unless j
    attrsd = []
    if rawdata[i...i+2] == '<>'
      # SGML shorthand: <> == <last open tag seen>
      k = j
      tag = @lasttag
    else
      ni,match = index_match(rawdata,Tagfind,i+1)
      unless match
        error('unexpected call to parse_starttag')
      end
      k = ni+match[0].length+1
      tag = match[0].downcase
      @lasttag = tag
    end

    while k < j
      break if rawdata.index(Endtagfind, k) == k
      ni,match = index_match(rawdata,Attrfind,k)
      # The attribute must start exactly at k (as with an anchored match in
      # Python's sgmllib). A later hit would make "k += matched_length" land
      # in the wrong place and could pull in text from beyond this tag.
      break unless ni == k
      matched_length = match[0].length
      attrname, rest, attrvalue = match[1],match[2],match[3]
      if rest.nil? or rest.empty?
        attrvalue = '' # was: = attrname # Why the change?
      else
        # Strip one pair of matching quotes. Compared as one-character
        # strings so the result is the same whether String#[] with an
        # integer returns a character code or a string.
        quote = attrvalue[0..0]
        if (quote == "'" || quote == '"') && attrvalue[-1..-1] == quote
          attrvalue = attrvalue[1...-1]
        end
      end
      attrsd << [attrname.downcase, attrvalue]
      k += matched_length
    end
    if rawdata[j..j] == ">"
      j += 1
    end
    @_starttag_text = rawdata[start_pos...j]
    finish_starttag(tag, attrsd)
    return j
  end

  # Internal -- parse an end tag beginning at index i ("</").
  # Passes the stripped, downcased tag name to finish_endtag.
  # Returns the absolute index just past the tag, or nil if no closing
  # bracket has been seen yet.
  def parse_endtag(i)
    rawdata = @rawdata
    j, match = index_match(rawdata, /[<>]/,i+1)
    return nil unless j
    tag = rawdata[i+2...j].strip.downcase
    if rawdata[j..j] == ">"
      j += 1
    end
    finish_endtag(tag)
    return j
  end

  def output
    # Return processed HTML as a single string
    return @pieces.map{|p| p.to_s}.join
  end

  # Abort parsing by raising BetterSGMLParserError with the given message.
  def error(message)
    raise BetterSGMLParserError.new(message)
  end

  # Hook for processing instructions; does nothing by default.
  def handle_pi(text)
  end

  # Hook for declarations; does nothing by default.
  def handle_decl(text)
  end
end
|
263
|
+
|
264
|
+
|
@@ -0,0 +1,257 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
|
3
|
+
# Encoding-related helpers: charset conversion through Iconv, regexp
# searching with an offset, and detection of a feed's character encoding from
# HTTP headers, byte-order marks and the XML declaration.
module FeedParserUtilities

  # Takes a single string and converts it from the encoding in
  # from_encoding to unicode.
  def unicode(data, from_encoding)
    uconvert(data, from_encoding, 'unicode')
  end

  # Converts data from from_encoding to to_encoding (default 'utf-8') using
  # Iconv. Both names are first looked up in Encoding_Aliases and used
  # unchanged when no alias is registered. Conversion errors raised by Iconv
  # propagate to the caller.
  def uconvert(data, from_encoding, to_encoding = 'utf-8')
    from_encoding = Encoding_Aliases[from_encoding] || from_encoding
    to_encoding = Encoding_Aliases[to_encoding] || to_encoding
    Iconv.iconv(to_encoding, from_encoding, data)[0]
  end

  # Searches stri for regexp starting at offset.
  # Returns [index, match_data] for the first hit, or [nil, nil] when there
  # is none. The match data comes from re-matching the regexp against the
  # substring that begins at the found index.
  def index_match(stri,regexp, offset)
    i = stri.index(regexp, offset)

    return nil, nil unless i

    full = stri[i..-1].match(regexp)
    return i, full
  end

  # Converts s from "ebcdic-cp-be" to "iso88591" via Iconv.
  def _ebcdic_to_ascii(s)
    return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
  end

  # Get the character encoding of the XML document.
  #
  # feed::     object whose #meta returns the HTTP headers as a hash-like
  #            object; anything that raises NoMethodError here is treated as
  #            "no headers available".
  # xml_data:: the raw document.
  #
  # Returns [true_encoding, http_encoding, xml_encoding,
  # sniffed_xml_encoding, acceptable_content_type].
  def getCharacterEncoding(feed, xml_data)
    $stderr << "In getCharacterEncoding\n" if $debug
    sniffed_xml_encoding = nil
    xml_encoding = nil
    true_encoding = nil
    begin
      http_headers = feed.meta
      http_content_type = feed.meta['content-type'].split(';')[0]
      encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
      http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
      http_encoding = nil if http_encoding.empty?
      # FIXME Open-Uri returns iso8859-1 if there is no charset header,
      # but that doesn't pass the tests. Open-Uri claims its following
      # the right RFC. Are they wrong or do we need to change the tests?
    rescue NoMethodError
      # NOTE(review): this also fires when #meta exists but has no
      # 'content-type' entry (nil.split), which discards the headers that
      # were just read -- confirm that is intended.
      http_headers = {}
      http_content_type = nil
      http_encoding = nil
    end

    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    begin
      if xml_data[0..3] == "\x4c\x6f\xa7\x94"
        # EBCDIC
        xml_data = _ebcdic_to_ascii(xml_data)
      elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
        # UTF-16BE
        sniffed_xml_encoding = 'utf-16be'
        xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
      elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
        # UTF-16BE with BOM
        sniffed_xml_encoding = 'utf-16be'
        xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
      elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
        # UTF-16LE
        sniffed_xml_encoding = 'utf-16le'
        xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
      elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
        # UTF-16LE with BOM
        sniffed_xml_encoding = 'utf-16le'
        xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
      elsif xml_data[0..3] == "\x00\x00\x00\x3c"
        # UTF-32BE
        sniffed_xml_encoding = 'utf-32be'
        xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
      elsif xml_data[0..3] == "\x3c\x00\x00\x00"
        # UTF-32LE
        sniffed_xml_encoding = 'utf-32le'
        xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
      elsif xml_data[0..3] == "\x00\x00\xfe\xff"
        # UTF-32BE with BOM
        sniffed_xml_encoding = 'utf-32be'
        xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
      elsif xml_data[0..3] == "\xff\xfe\x00\x00"
        # UTF-32LE with BOM
        sniffed_xml_encoding = 'utf-32le'
        xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
      elsif xml_data[0..2] == "\xef\xbb\xbf"
        # UTF-8 with BOM
        sniffed_xml_encoding = 'utf-8'
        xml_data = xml_data[3..-1]
      else
        # ASCII-compatible
      end
      xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
    rescue
      # Any failure while sniffing/converting means "no usable declaration".
      xml_encoding_match = nil
    end

    if xml_encoding_match
      xml_encoding = xml_encoding_match[1].downcase
      # Multi-byte encoding names declared in the document are replaced by
      # whatever was actually sniffed from the leading bytes.
      xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
      if sniffed_xml_encoding and xencodings.include?(xml_encoding)
        xml_encoding = sniffed_xml_encoding
      end
    end

    acceptable_content_type = false
    application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
    text_content_types = ['text/xml', 'text/xml-external-parsed-entity']

    if application_content_types.include?(http_content_type) or (/^application\// =~ http_content_type and /\+xml$/ =~ http_content_type)
      acceptable_content_type = true
      true_encoding = http_encoding || xml_encoding || 'utf-8'
    elsif text_content_types.include?(http_content_type) or (/^text\// =~ http_content_type and /\+xml$/ =~ http_content_type)
      acceptable_content_type = true
      true_encoding = http_encoding || 'us-ascii'
    elsif /^text\// =~ http_content_type
      true_encoding = http_encoding || 'us-ascii'
    elsif http_headers and not http_headers.empty? and not http_headers.has_key?('content-type')
      true_encoding = xml_encoding || 'iso-8859-1'
    else
      true_encoding = xml_encoding || 'utf-8'
    end
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
  end

  # Changes an XML data stream on the fly to specify a new encoding.
  #
  # data::     a raw sequence of bytes (not Unicode) that is presumed to be
  #            in encoding already
  # encoding:: a string recognized by encodings.aliases
  #
  # A leading byte-order mark is stripped and overrides encoding. The data is
  # then converted to UTF-8 and its XML declaration is replaced (or one is
  # prepended) so that it declares utf-8. Returns the converted string.
  # Conversion errors raised by uconvert propagate to the caller.
  def toUTF8(data, encoding)
    $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
    # NOTE we must use double quotes when dealing with \x encodings!
    if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
      if $debug
        $stderr << "stripping BOM\n"
        if encoding != 'utf-16be'
          $stderr << "trying utf-16be instead\n"
        end
      end
      encoding = 'utf-16be'
      data = data[2..-1]
    elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
      if $debug
        $stderr << "stripping BOM\n"
        $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
      end
      encoding = 'utf-16le'
      data = data[2..-1]
    elsif (data[0..2] == "\xef\xbb\xbf")
      if $debug
        $stderr << "stripping BOM\n"
        $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
      end
      encoding = 'utf-8'
      data = data[3..-1]
    elsif (data[0..3] == "\x00\x00\xfe\xff")
      if $debug
        $stderr << "stripping BOM\n"
        if encoding != 'utf-32be'
          $stderr << "trying utf-32be instead\n"
        end
      end
      encoding = 'utf-32be'
      data = data[4..-1]
    elsif (data[0..3] == "\xff\xfe\x00\x00")
      if $debug
        $stderr << "stripping BOM\n"
        if encoding != 'utf-32le'
          $stderr << "trying utf-32le instead\n"
        end
      end
      encoding = 'utf-32le'
      data = data[4..-1]
    end

    # Let a failed conversion raise its own error. Swallowing it here left
    # newdata nil, logged a bogus success message and then failed further
    # down with an unrelated TypeError.
    newdata = uconvert(data, encoding, 'utf-8')
    $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug

    declmatch = /^<\?xml[^>]*?>/
    newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
    if declmatch =~ newdata
      newdata.sub!(declmatch, newdecl)
    else
      newdata = newdecl + "\n" + newdata
    end
    return newdata
  end

end
|
196
|
+
|
197
|
+
# http://intertwingly.net/stories/2005/09/28/xchar.rb
|
198
|
+
# Lookup tables used when turning code points into XML-safe text.
module XChar
  # Maps Windows-1252 byte values in the 128..159 range to the Unicode code
  # points they stand for.
  # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
  CP1252 = {
    128 => 8364, # euro sign
    130 => 8218, # single low-9 quotation mark
    131 =>  402, # latin small letter f with hook
    132 => 8222, # double low-9 quotation mark
    133 => 8230, # horizontal ellipsis
    134 => 8224, # dagger
    135 => 8225, # double dagger
    136 =>  710, # modifier letter circumflex accent
    137 => 8240, # per mille sign
    138 =>  352, # latin capital letter s with caron
    139 => 8249, # single left-pointing angle quotation mark
    140 =>  338, # latin capital ligature oe
    142 =>  381, # latin capital letter z with caron
    145 => 8216, # left single quotation mark
    146 => 8217, # right single quotation mark
    147 => 8220, # left double quotation mark
    148 => 8221, # right double quotation mark
    149 => 8226, # bullet
    150 => 8211, # en dash
    151 => 8212, # em dash
    152 =>  732, # small tilde
    153 => 8482, # trade mark sign
    154 =>  353, # latin small letter s with caron
    155 => 8250, # single right-pointing angle quotation mark
    156 =>  339, # latin small ligature oe
    158 =>  382, # latin small letter z with caron
    159 =>  376} # latin capital letter y with diaeresis

  # Characters that must be written as entity references in XML character
  # data. The values are the escaped forms; mapping each character to itself
  # would leave the output unescaped.
  # http://www.w3.org/TR/REC-xml/#dt-chardata
  PREDEFINED = {
    38 => '&amp;', # ampersand
    60 => '&lt;',  # left angle bracket
    62 => '&gt;'}  # right angle bracket

  # Code points that are legal in an XML document.
  # http://www.w3.org/TR/REC-xml/#charsets
  VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
           (0xE000..0xFFFD), (0x10000..0x10FFFF)]
end
|
239
|
+
|
240
|
+
# Reopens Integer rather than Fixnum: on older Rubies Fixnum is a subclass of
# Integer and inherits the method, while on newer Rubies the Fixnum constant
# no longer exists and reopening it would create an unrelated class.
class Integer
  # xml escaped version of chr
  #
  # Treats the receiver as a character code and returns its XML-safe text:
  # * codes listed in XChar::CP1252 are first replaced by their mapped value;
  # * codes outside XChar::VALID are replaced by 42 ("*");
  # * codes in XChar::PREDEFINED yield the string stored there;
  # * other codes below 128 yield the character itself;
  # * everything else yields a decimal character reference ("&#NNN;").
  def xchr
    n = XChar::CP1252[self] || self
    n = 42 unless XChar::VALID.find {|range| range.include? n}
    XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
  end
end
|
248
|
+
|
249
|
+
class String
  # Keeps the current String#index reachable under the name old_index.
  alias :old_index :index

  # Returns an XML-safe copy of the string, built by calling xchr on every
  # code unit. The string is first decoded as UTF-8; if that raises (for
  # example on malformed UTF-8), it is processed byte by byte instead.
  def to_xs
    unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
  rescue
    unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
  end
end
|
257
|
+
|