rfeedparser 0.9.931 → 0.9.940
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rfeedparser.rb +143 -58
- data/lib/rfeedparser/aliases.rb +1 -1
- data/lib/rfeedparser/better_attributelist.rb +11 -11
- data/lib/rfeedparser/better_sgmlparser.rb +1 -1
- data/lib/rfeedparser/encoding_helpers.rb +120 -127
- data/lib/rfeedparser/feedparserdict.rb +30 -20
- data/lib/rfeedparser/forgiving_uri.rb +9 -7
- data/lib/rfeedparser/markup_helpers.rb +11 -14
- data/lib/rfeedparser/parser_mixin.rb +16 -11
- data/lib/rfeedparser/parsers.rb +1 -2
- data/lib/rfeedparser/scrub.rb +95 -90
- data/lib/rfeedparser/time_helpers.rb +379 -379
- data/lib/rfeedparser/utilities.rb +23 -0
- data/tests/rfeedparser_test_helper.rb +262 -0
- data/tests/rfeedparserserver.rb +3 -109
- data/tests/rfeedparsertest.rb +6 -165
- data/tests/rfponly/http/200.xml +30 -0
- data/tests/rfponly/http/220.xml +28 -0
- data/tests/rfponly/http/300.xml +8 -0
- data/tests/rfponly/http/300.xml_redirect +25 -0
- data/tests/rfponly/http/301.xml +8 -0
- data/tests/rfponly/http/301.xml_redirect +25 -0
- data/tests/rfponly/http/302.xml +8 -0
- data/tests/rfponly/http/302.xml_redirect +25 -0
- data/tests/rfponly/http/307.xml +8 -0
- data/tests/rfponly/http/307.xml_redirect +25 -0
- data/tests/rfponly/http/320.xml +8 -0
- data/tests/rfponly/http/320.xml_redirect +25 -0
- data/tests/rfponly/http/400.xml +7 -0
- data/tests/rfponly/http/404.xml +7 -0
- data/tests/rfponly/http/410.xml +7 -0
- data/tests/rfponly/http/420.xml +7 -0
- data/tests/rfponly/http/500.xml +7 -0
- data/tests/rfponly/http/520.xml +7 -0
- data/tests/rfponly/http/etag.xml +28 -0
- data/tests/rfponly/http/lastmodified.xml +29 -0
- data/tests/rfponly/wellformed/date/feed_modified_with_negative_numeric_timezone.xml +9 -0
- data/tests/rfponly/wellformed/date/feed_modified_with_positive_numeric_timezone.xml +9 -0
- data/tests/rfponly/wellformed/scrub/hpricot_self_closing_tag_workaround.xml +11 -0
- metadata +31 -3
data/lib/rfeedparser.rb
CHANGED
@@ -19,11 +19,14 @@ require 'rubygems'
|
|
19
19
|
require 'base64'
|
20
20
|
require 'iconv'
|
21
21
|
|
22
|
+
gem 'hpricot', "=0.6"
|
23
|
+
require 'hpricot'
|
22
24
|
gem 'character-encodings', ">=0.2.0"
|
23
25
|
gem 'htmltools', ">=1.10"
|
24
26
|
gem 'htmlentities', ">=4.0.0"
|
25
27
|
gem 'activesupport', ">=1.4.1"
|
26
28
|
gem 'rchardet', ">=1.0"
|
29
|
+
|
27
30
|
require 'xml/saxdriver' # calling expat through the xmlparser gem
|
28
31
|
|
29
32
|
require 'rchardet'
|
@@ -40,23 +43,21 @@ $debug = false
|
|
40
43
|
$compatible = true
|
41
44
|
|
42
45
|
$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
|
46
|
+
require 'rfeedparser/utilities'
|
43
47
|
require 'rfeedparser/forgiving_uri'
|
44
|
-
require 'rfeedparser/aliases'
|
45
|
-
require 'rfeedparser/encoding_helpers'
|
46
48
|
require 'rfeedparser/better_sgmlparser'
|
47
49
|
require 'rfeedparser/better_attributelist'
|
48
|
-
require 'rfeedparser/scrub'
|
49
|
-
require 'rfeedparser/time_helpers'
|
50
50
|
require 'rfeedparser/feedparserdict'
|
51
51
|
require 'rfeedparser/parser_mixin'
|
52
52
|
require 'rfeedparser/parsers'
|
53
|
-
require 'rfeedparser/markup_helpers'
|
54
53
|
|
55
|
-
|
54
|
+
|
56
55
|
|
57
56
|
|
58
57
|
module FeedParser
|
59
|
-
|
58
|
+
extend FeedParserUtilities
|
59
|
+
|
60
|
+
Version = "0.9.940"
|
60
61
|
|
61
62
|
License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
|
62
63
|
|
@@ -81,18 +82,19 @@ module FeedParser
|
|
81
82
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
82
83
|
POSSIBILITY OF SUCH DAMAGE."""
|
83
84
|
|
84
|
-
|
85
|
-
|
85
|
+
Translator_From_Python_To_Ruby = "Jeff Hodges <http://somethingsimilar.com>"
|
86
|
+
Author = "Mark Pilgrim <http://diveintomark.org/>"
|
86
87
|
Contributors = [ "Jason Diamond <http://injektilo.org/>",
|
87
88
|
"John Beimler <http://john.beimler.org/>",
|
88
89
|
"Fazal Majid <http://www.majid.info/mylos/weblog/>",
|
89
90
|
"Aaron Swartz <http://aaronsw.com/>",
|
90
|
-
"Kevin Marks <http://epeus.blogspot.com/>"
|
91
|
+
"Kevin Marks <http://epeus.blogspot.com/>",
|
92
|
+
"Jesse Newland <http://jnewland.com/>"
|
91
93
|
]
|
92
94
|
# HTTP "User-Agent" header to send to servers when downloading feeds.
|
93
95
|
# If you are embedding feedparser in a larger application, you should
|
94
96
|
# change this to your application name and URL.
|
95
|
-
USER_AGENT = "
|
97
|
+
USER_AGENT = "rFeedParser/#{Version} +http://rfeedparser.rubyforge.org/"
|
96
98
|
|
97
99
|
# HTTP "Accept" header to send to servers when downloading feeds. If you don't
|
98
100
|
# want to send an Accept header, set this to None.
|
@@ -141,60 +143,139 @@ module FeedParser
|
|
141
143
|
'hotrss' => 'Hot RSS'
|
142
144
|
}
|
143
145
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
146
|
+
# Accepted in options: :agent, :modified, :etag, and :referrer
|
147
|
+
def open_resource(url_file_stream_or_string, options)
|
148
|
+
options[:handlers] ||= []
|
149
|
+
|
150
|
+
if url_file_stream_or_string.respond_to?(:read)
|
151
|
+
return url_file_stream_or_string
|
152
|
+
|
153
|
+
elsif url_file_stream_or_string == '-'
|
154
|
+
return $stdin
|
155
|
+
end
|
156
|
+
|
157
|
+
# open-uri freaks out if there's leading spaces.
|
158
|
+
url_file_stream_or_string.strip!
|
159
|
+
|
160
|
+
|
161
|
+
furi = ForgivingURI.parse(url_file_stream_or_string)
|
162
|
+
if furi && ['http','https','ftp'].include?(furi.scheme)
|
163
|
+
auth = nil
|
164
|
+
|
165
|
+
if furi.host && furi.password
|
166
|
+
auth = Base64::encode64("#{furi.user}:#{furi.password}").strip
|
167
|
+
furi.password = nil
|
168
|
+
url_file_stream_or_string = furi.to_s
|
169
|
+
end
|
170
|
+
|
171
|
+
req_headers = {}
|
172
|
+
req_headers["User-Agent"] = options[:agent] || USER_AGENT
|
173
|
+
req_headers["If-None-Match"] = options[:etag] if options[:etag]
|
174
|
+
|
175
|
+
if options[:modified]
|
176
|
+
if options[:modified].is_a?(String)
|
177
|
+
req_headers["If-Modified-Since"] = parse_date(options[:modified]).httpdate
|
178
|
+
elsif options[:modified].is_a?(Time)
|
179
|
+
req_headers["If-Modified-Since"] = options[:modified].httpdate
|
180
|
+
elsif options[:modified].is_a?(Array)
|
181
|
+
req_headers["If-Modified-Since"] = py2rtime(options[:modified]).httpdate
|
182
|
+
end
|
183
|
+
end
|
184
|
+
|
185
|
+
req_headers["Referer"] = options[:referrer] if options[:referrer]
|
186
|
+
req_headers["Accept-encoding"] = 'gzip, deflate' # FIXME make tests
|
187
|
+
req_headers["Authorization"] = "Basic #{auth}" if auth
|
188
|
+
req_headers['Accept'] = ACCEPT_HEADER if ACCEPT_HEADER
|
189
|
+
req_headers['A-IM'] = 'feed' # RFC 3229 support
|
190
|
+
|
191
|
+
begin
|
192
|
+
return open(url_file_stream_or_string, req_headers)
|
193
|
+
rescue OpenURI::HTTPError => e
|
194
|
+
return e.io
|
195
|
+
rescue
|
196
|
+
end
|
197
|
+
end
|
198
|
+
|
199
|
+
# try to open with native open function (if url_file_stream_or_string is a filename)
|
200
|
+
begin
|
201
|
+
return open(url_file_stream_or_string)
|
202
|
+
rescue
|
203
|
+
end
|
204
|
+
# treat url_file_stream_or_string as string
|
205
|
+
return StringIO.new(url_file_stream_or_string.to_s)
|
206
|
+
end
|
207
|
+
module_function(:open_resource)
|
208
|
+
|
209
|
+
# Parse a feed from a URL, file, stream or string
|
210
|
+
def parse(url_file_stream_or_string, options = {})
|
211
|
+
|
212
|
+
|
213
|
+
# Use the default compatibility if compatible is nil
|
214
|
+
$compatible = options[:compatible].nil? ? $compatible : options[:compatible]
|
215
|
+
|
148
216
|
strictklass = options[:strict] || StrictFeedParser
|
149
217
|
looseklass = options[:loose] || LooseFeedParser
|
218
|
+
options[:handlers] = options[:handlers] || []
|
219
|
+
|
150
220
|
result = FeedParserDict.new
|
151
221
|
result['feed'] = FeedParserDict.new
|
152
222
|
result['entries'] = []
|
153
|
-
|
154
|
-
options[:modified] = Time.parse(options[:modified]).utc.rfc2822
|
155
|
-
# FIXME this ignores all of our time parsing work. Does it matter?
|
156
|
-
end
|
223
|
+
|
157
224
|
result['bozo'] = false
|
158
|
-
|
159
|
-
if handlers.class != Array # FIXME why does this happen?
|
160
|
-
handlers = [handlers]
|
161
|
-
end
|
162
|
-
|
225
|
+
|
163
226
|
begin
|
164
|
-
|
165
|
-
if [nil, "file"].include? parsed_furi.scheme
|
166
|
-
$stderr << "Opening local file #{furi}\n" if $debug
|
167
|
-
f = open(parsed_furi.path) # OpenURI doesn't behave well when passing HTTP options to a file.
|
168
|
-
else
|
169
|
-
# And when you do pass them, make sure they aren't just nil (this still true?)
|
170
|
-
newd = {}
|
171
|
-
newd["If-None-Match"] = options[:etag] unless options[:etag].nil?
|
172
|
-
newd["If-Modified-Since"] = options[:modified] unless options[:modified].nil?
|
173
|
-
newd["User-Agent"] = (options[:agent] || USER_AGENT).to_s
|
174
|
-
newd["Referer"] = options[:referrer] unless options[:referrer].nil?
|
175
|
-
newd["Content-Location"] = options[:content_location] unless options[:content_location].nil?
|
176
|
-
newd["Content-Language"] = options[:content_language] unless options[:content_language].nil?
|
177
|
-
newd["Content-type"] = options[:content_type] unless options[:content_type].nil?
|
178
|
-
|
179
|
-
f = open(furi, newd)
|
180
|
-
end
|
181
|
-
|
227
|
+
f = open_resource(url_file_stream_or_string, options)
|
182
228
|
data = f.read
|
183
|
-
f.close
|
184
229
|
rescue => e
|
185
|
-
$stderr << "Rescued in parse: "+e.to_s+"\n" if $debug # My addition
|
186
230
|
result['bozo'] = true
|
187
231
|
result['bozo_exception'] = e
|
188
232
|
data = ''
|
189
233
|
f = nil
|
190
234
|
end
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
235
|
+
|
236
|
+
if f and !data.blank? and f.respond_to?(:meta)
|
237
|
+
# if feed is gzip-compressed, decompress it
|
238
|
+
if f.meta['content-encoding'] == 'gzip'
|
239
|
+
begin
|
240
|
+
gz = Zlib::GzipReader.new(StringIO.new(data))
|
241
|
+
data = gz.read
|
242
|
+
gz.close
|
243
|
+
rescue => e
|
244
|
+
# Some feeds claim to be gzipped but they're not, so
|
245
|
+
# we get garbage. Ideally, we should re-request the
|
246
|
+
# feed without the 'Accept-encoding: gzip' header,
|
247
|
+
# but we don't.
|
248
|
+
result['bozo'] = true
|
249
|
+
result['bozo_exception'] = e
|
250
|
+
data = ''
|
251
|
+
end
|
252
|
+
elsif f.meta['content-encoding'] == 'deflate'
|
253
|
+
begin
|
254
|
+
data = Zlib::Deflate.inflate(data)
|
255
|
+
rescue => e
|
256
|
+
result['bozo'] = true
|
257
|
+
result['bozo_exception'] = e
|
258
|
+
data = ''
|
259
|
+
end
|
197
260
|
end
|
261
|
+
end
|
262
|
+
|
263
|
+
if f.respond_to?(:meta)
|
264
|
+
result['etag'] = f.meta['etag']
|
265
|
+
result['modified_time'] = parse_date(f.meta['last-modified'])
|
266
|
+
result['modified'] = extract_tuple(result['modified_time'])
|
267
|
+
result['headers'] = f.meta
|
268
|
+
end
|
269
|
+
|
270
|
+
# FIXME open-uri does not return a non-nil base_uri in its HTTPErrors.
|
271
|
+
if f.respond_to?(:base_uri)
|
272
|
+
result['href'] = f.base_uri.to_s # URI => String
|
273
|
+
result['status'] = '200'
|
274
|
+
end
|
275
|
+
|
276
|
+
if f.respond_to?(:status)
|
277
|
+
result['status'] = f.status[0]
|
278
|
+
end
|
198
279
|
|
199
280
|
|
200
281
|
# there are four encodings to keep track of:
|
@@ -204,7 +285,7 @@ module FeedParser
|
|
204
285
|
# - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
|
205
286
|
http_headers = result['headers'] || {}
|
206
287
|
result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type =
|
207
|
-
|
288
|
+
getCharacterEncoding(http_headers,data)
|
208
289
|
|
209
290
|
if not http_headers.blank? and not acceptable_content_type
|
210
291
|
unless http_headers['content-type'].nil?
|
@@ -215,7 +296,7 @@ module FeedParser
|
|
215
296
|
result['bozo'] = true
|
216
297
|
result['bozo_exception'] = NonXMLContentType.new(bozo_message) # I get to care about this, cuz Mark says I should.
|
217
298
|
end
|
218
|
-
result['version'], data =
|
299
|
+
result['version'], data = stripDoctype(data)
|
219
300
|
|
220
301
|
baseuri = http_headers['content-location'] || result['href']
|
221
302
|
baselang = http_headers['content-language']
|
@@ -244,7 +325,7 @@ module FeedParser
|
|
244
325
|
next if tried_encodings.include? proposed_encoding
|
245
326
|
tried_encodings << proposed_encoding
|
246
327
|
begin
|
247
|
-
data =
|
328
|
+
data = toUTF8(data, proposed_encoding)
|
248
329
|
known_encoding = use_strict_parser = true
|
249
330
|
break
|
250
331
|
rescue
|
@@ -256,7 +337,7 @@ module FeedParser
|
|
256
337
|
proposed_encoding = CharDet.detect(data)['encoding']
|
257
338
|
if proposed_encoding and not tried_encodings.include?proposed_encoding
|
258
339
|
tried_encodings << proposed_encoding
|
259
|
-
data =
|
340
|
+
data = toUTF8(data, proposed_encoding)
|
260
341
|
known_encoding = use_strict_parser = true
|
261
342
|
end
|
262
343
|
rescue
|
@@ -270,7 +351,7 @@ module FeedParser
|
|
270
351
|
begin
|
271
352
|
proposed_encoding = 'utf-8'
|
272
353
|
tried_encodings << proposed_encoding
|
273
|
-
data =
|
354
|
+
data = toUTF8(data, proposed_encoding)
|
274
355
|
known_encoding = use_strict_parser = true
|
275
356
|
rescue
|
276
357
|
end
|
@@ -280,7 +361,7 @@ module FeedParser
|
|
280
361
|
begin
|
281
362
|
proposed_encoding = 'windows-1252'
|
282
363
|
tried_encodings << proposed_encoding
|
283
|
-
data =
|
364
|
+
data = toUTF8(data, proposed_encoding)
|
284
365
|
known_encoding = use_strict_parser = true
|
285
366
|
rescue
|
286
367
|
end
|
@@ -292,7 +373,7 @@ module FeedParser
|
|
292
373
|
# begin
|
293
374
|
# proposed_encoding = 'iso-8859-2'
|
294
375
|
# tried_encodings << proposed_encoding
|
295
|
-
# data =
|
376
|
+
# data = toUTF8(data, proposed_encoding)
|
296
377
|
# known_encoding = use_strict_parser = true
|
297
378
|
# rescue
|
298
379
|
# end
|
@@ -334,9 +415,9 @@ module FeedParser
|
|
334
415
|
end
|
335
416
|
end
|
336
417
|
if not use_strict_parser
|
418
|
+
$stderr << "Using LooseFeed\n\n" if $debug
|
337
419
|
feedparser = looseklass.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
|
338
420
|
feedparser.parse(data)
|
339
|
-
$stderr << "Using LooseFeed\n\n" if $debug
|
340
421
|
end
|
341
422
|
result['feed'] = feedparser.feeddata
|
342
423
|
result['entries'] = feedparser.entries
|
@@ -347,6 +428,10 @@ module FeedParser
|
|
347
428
|
module_function(:parse)
|
348
429
|
end # End FeedParser module
|
349
430
|
|
431
|
+
def rfp(url_file_stream_or_string, options={})
|
432
|
+
FeedParser.parse(url_file_stream_or_string, options)
|
433
|
+
end
|
434
|
+
|
350
435
|
class Serializer
|
351
436
|
def initialize(results)
|
352
437
|
@results = results
|
data/lib/rfeedparser/aliases.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
# Add some helper methods to make AttributeList (all of those damn attrs
|
4
4
|
# and attrsD used by StrictFeedParser) act more like a Hash.
|
@@ -8,31 +8,31 @@ module XML
|
|
8
8
|
module SAX
|
9
9
|
module AttributeList # in xml/sax.rb
|
10
10
|
def [](key)
|
11
|
-
|
11
|
+
getValue(key)
|
12
12
|
end
|
13
13
|
|
14
14
|
def each(&blk)
|
15
|
-
|
15
|
+
(0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
|
16
16
|
end
|
17
17
|
|
18
18
|
def each_key(&blk)
|
19
|
-
|
19
|
+
(0...getLength).each{|pos| yield getName(pos) }
|
20
20
|
end
|
21
21
|
|
22
22
|
def each_value(&blk)
|
23
|
-
|
23
|
+
(0...getLength).each{|pos| yield getValue(pos) }
|
24
24
|
end
|
25
25
|
|
26
26
|
def to_a # Rather use collect? grep for to_a.collect
|
27
|
-
|
28
|
-
|
29
|
-
|
27
|
+
l = []
|
28
|
+
each{|k,v| l << [k,v]}
|
29
|
+
return l
|
30
30
|
end
|
31
31
|
|
32
32
|
def to_s
|
33
|
-
|
34
|
-
|
35
|
-
|
33
|
+
l = []
|
34
|
+
each{|k,v| l << "#{k} => #{v}"}
|
35
|
+
"{ "+l.join(", ")+" }"
|
36
36
|
end
|
37
37
|
end
|
38
38
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
module FeedParserUtilities
|
4
4
|
|
@@ -26,73 +26,68 @@ module FeedParserUtilities
|
|
26
26
|
def _ebcdic_to_ascii(s)
|
27
27
|
return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
|
28
28
|
end
|
29
|
-
|
30
|
-
def getCharacterEncoding(
|
29
|
+
|
30
|
+
def getCharacterEncoding(http_headers, xml_data)
|
31
31
|
# Get the character encoding of the XML document
|
32
32
|
$stderr << "In getCharacterEncoding\n" if $debug
|
33
33
|
sniffed_xml_encoding = nil
|
34
34
|
xml_encoding = nil
|
35
35
|
true_encoding = nil
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
http_encoding = nil if http_encoding.empty?
|
36
|
+
|
37
|
+
http_content_type, charset = http_headers['content-type'].to_s.split(';',2)
|
38
|
+
encoding_regexp = /\s*charset\s*=\s*(?:"|')?(.*?)(?:"|')?\s*$/
|
39
|
+
http_encoding = charset.to_s.scan(encoding_regexp).flatten[0]
|
40
|
+
http_encoding = nil if http_encoding.blank?
|
42
41
|
# FIXME Open-Uri returns iso8859-1 if there is no charset header,
|
43
42
|
# but that doesn't pass the tests. Open-Uri claims its following
|
44
43
|
# the right RFC. Are they wrong or do we need to change the tests?
|
45
|
-
|
46
|
-
http_headers = {}
|
47
|
-
http_content_type = nil
|
48
|
-
http_encoding = nil
|
49
|
-
end
|
44
|
+
|
50
45
|
# Must sniff for non-ASCII-compatible character encodings before
|
51
46
|
# searching for XML declaration. This heuristic is defined in
|
52
47
|
# section F of the XML specification:
|
53
48
|
# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
|
54
49
|
begin
|
55
50
|
if xml_data[0..3] == "\x4c\x6f\xa7\x94"
|
56
|
-
|
57
|
-
|
51
|
+
# EBCDIC
|
52
|
+
xml_data = __ebcdic_to_ascii(xml_data)
|
58
53
|
elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
|
59
|
-
|
60
|
-
|
61
|
-
|
54
|
+
# UTF-16BE
|
55
|
+
sniffed_xml_encoding = 'utf-16be'
|
56
|
+
xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
|
62
57
|
elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
|
63
|
-
|
64
|
-
|
65
|
-
|
58
|
+
# UTF-16BE with BOM
|
59
|
+
sniffed_xml_encoding = 'utf-16be'
|
60
|
+
xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
|
66
61
|
elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
|
67
|
-
|
68
|
-
|
69
|
-
|
62
|
+
# UTF-16LE
|
63
|
+
sniffed_xml_encoding = 'utf-16le'
|
64
|
+
xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
|
70
65
|
elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
|
71
|
-
|
72
|
-
|
73
|
-
|
66
|
+
# UTF-16LE with BOM
|
67
|
+
sniffed_xml_encoding = 'utf-16le'
|
68
|
+
xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
|
74
69
|
elsif xml_data[0..3] == "\x00\x00\x00\x3c"
|
75
|
-
|
76
|
-
|
77
|
-
|
70
|
+
# UTF-32BE
|
71
|
+
sniffed_xml_encoding = 'utf-32be'
|
72
|
+
xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
|
78
73
|
elsif xml_data[0..3] == "\x3c\x00\x00\x00"
|
79
|
-
|
80
|
-
|
81
|
-
|
74
|
+
# UTF-32LE
|
75
|
+
sniffed_xml_encoding = 'utf-32le'
|
76
|
+
xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
|
82
77
|
elsif xml_data[0..3] == "\x00\x00\xfe\xff"
|
83
|
-
|
84
|
-
|
85
|
-
|
78
|
+
# UTF-32BE with BOM
|
79
|
+
sniffed_xml_encoding = 'utf-32be'
|
80
|
+
xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
|
86
81
|
elsif xml_data[0..3] == "\xff\xfe\x00\x00"
|
87
|
-
|
88
|
-
|
89
|
-
|
82
|
+
# UTF-32LE with BOM
|
83
|
+
sniffed_xml_encoding = 'utf-32le'
|
84
|
+
xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
|
90
85
|
elsif xml_data[0..2] == "\xef\xbb\xbf"
|
91
|
-
|
92
|
-
|
93
|
-
|
86
|
+
# UTF-8 with BOM
|
87
|
+
sniffed_xml_encoding = 'utf-8'
|
88
|
+
xml_data = xml_data[3..-1]
|
94
89
|
else
|
95
|
-
|
90
|
+
# ASCII-compatible
|
96
91
|
end
|
97
92
|
xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
|
98
93
|
rescue
|
@@ -102,7 +97,7 @@ module FeedParserUtilities
|
|
102
97
|
xml_encoding = xml_encoding_match[1].downcase
|
103
98
|
xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
|
104
99
|
if sniffed_xml_encoding and xencodings.include?xml_encoding
|
105
|
-
|
100
|
+
xml_encoding = sniffed_xml_encoding
|
106
101
|
end
|
107
102
|
end
|
108
103
|
|
@@ -125,54 +120,48 @@ module FeedParserUtilities
|
|
125
120
|
end
|
126
121
|
return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
|
127
122
|
end
|
128
|
-
|
123
|
+
|
129
124
|
def toUTF8(data, encoding)
|
130
|
-
=begin
|
131
|
-
Changes an XML data stream on the fly to specify a new encoding
|
132
|
-
|
133
|
-
data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
|
134
|
-
encoding is a string recognized by encodings.aliases
|
135
|
-
=end
|
136
125
|
$stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
|
137
126
|
# NOTE we must use double quotes when dealing with \x encodings!
|
138
127
|
if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
|
139
128
|
if $debug
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
129
|
+
$stderr << "stripping BOM\n"
|
130
|
+
if encoding != 'utf-16be'
|
131
|
+
$stderr << "string utf-16be instead\n"
|
132
|
+
end
|
144
133
|
end
|
145
134
|
encoding = 'utf-16be'
|
146
135
|
data = data[2..-1]
|
147
136
|
elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
|
148
137
|
if $debug
|
149
|
-
|
150
|
-
|
138
|
+
$stderr << "stripping BOM\n"
|
139
|
+
$stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
|
151
140
|
end
|
152
141
|
encoding = 'utf-16le'
|
153
142
|
data = data[2..-1]
|
154
143
|
elsif (data[0..2] == "\xef\xbb\xbf")
|
155
144
|
if $debug
|
156
|
-
|
157
|
-
|
145
|
+
$stderr << "stripping BOM\n"
|
146
|
+
$stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
|
158
147
|
end
|
159
148
|
encoding = 'utf-8'
|
160
149
|
data = data[3..-1]
|
161
150
|
elsif (data[0..3] == "\x00\x00\xfe\xff")
|
162
151
|
if $debug
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
152
|
+
$stderr << "stripping BOM\n"
|
153
|
+
if encoding != 'utf-32be'
|
154
|
+
$stderr << "trying utf-32be instead\n"
|
155
|
+
end
|
167
156
|
end
|
168
157
|
encoding = 'utf-32be'
|
169
158
|
data = data[4..-1]
|
170
159
|
elsif (data[0..3] == "\xff\xfe\x00\x00")
|
171
160
|
if $debug
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
161
|
+
$stderr << "stripping BOM\n"
|
162
|
+
if encoding != 'utf-32le'
|
163
|
+
$stderr << "trying utf-32le instead\n"
|
164
|
+
end
|
176
165
|
end
|
177
166
|
encoding = 'utf-32le'
|
178
167
|
data = data[4..-1]
|
@@ -184,75 +173,79 @@ module FeedParserUtilities
|
|
184
173
|
end
|
185
174
|
$stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
|
186
175
|
declmatch = /^<\?xml[^>]*?>/
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
176
|
+
newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
|
177
|
+
if declmatch =~ newdata
|
178
|
+
newdata.sub!(declmatch, newdecl)
|
179
|
+
else
|
180
|
+
newdata = newdecl + "\n" + newdata
|
181
|
+
end
|
193
182
|
return newdata
|
194
183
|
end
|
195
|
-
|
184
|
+
|
196
185
|
end
|
197
186
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
187
|
+
unless defined?(Builder::XChar)
|
188
|
+
# http://intertwingly.net/stories/2005/09/28/xchar.rb
|
189
|
+
module XChar
|
190
|
+
# http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
|
191
|
+
CP1252 = {
|
192
|
+
128 => 8364, # euro sign
|
193
|
+
130 => 8218, # single low-9 quotation mark
|
194
|
+
131 => 402, # latin small letter f with hook
|
195
|
+
132 => 8222, # double low-9 quotation mark
|
196
|
+
133 => 8230, # horizontal ellipsis
|
197
|
+
134 => 8224, # dagger
|
198
|
+
135 => 8225, # double dagger
|
199
|
+
136 => 710, # modifier letter circumflex accent
|
200
|
+
137 => 8240, # per mille sign
|
201
|
+
138 => 352, # latin capital letter s with caron
|
202
|
+
139 => 8249, # single left-pointing angle quotation mark
|
203
|
+
140 => 338, # latin capital ligature oe
|
204
|
+
142 => 381, # latin capital letter z with caron
|
205
|
+
145 => 8216, # left single quotation mark
|
206
|
+
146 => 8217, # right single quotation mark
|
207
|
+
147 => 8220, # left double quotation mark
|
208
|
+
148 => 8221, # right double quotation mark
|
209
|
+
149 => 8226, # bullet
|
210
|
+
150 => 8211, # en dash
|
211
|
+
151 => 8212, # em dash
|
212
|
+
152 => 732, # small tilde
|
213
|
+
153 => 8482, # trade mark sign
|
214
|
+
154 => 353, # latin small letter s with caron
|
215
|
+
155 => 8250, # single right-pointing angle quotation mark
|
216
|
+
156 => 339, # latin small ligature oe
|
217
|
+
158 => 382, # latin small letter z with caron
|
218
|
+
159 => 376 # latin capital letter y with diaeresis
|
219
|
+
}
|
230
220
|
# http://www.w3.org/TR/REC-xml/#dt-chardata
|
231
221
|
PREDEFINED = {
|
232
222
|
38 => '&', # ampersand
|
233
223
|
60 => '<', # left angle bracket
|
234
|
-
62 => '>'
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
end
|
224
|
+
62 => '>' # right angle bracket
|
225
|
+
}
|
226
|
+
# http://www.w3.org/TR/REC-xml/#charsets
|
227
|
+
VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
|
228
|
+
(0xE000..0xFFFD), (0x10000..0x10FFFF)]
|
229
|
+
end
|
240
230
|
|
241
|
-
class Fixnum
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
231
|
+
class Fixnum
|
232
|
+
# xml escaped version of chr
|
233
|
+
def xchr
|
234
|
+
n = XChar::CP1252[self] || self
|
235
|
+
case n when *XChar::VALID
|
236
|
+
XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
|
237
|
+
else
|
238
|
+
'*'
|
239
|
+
end
|
240
|
+
end
|
247
241
|
end
|
248
|
-
end
|
249
242
|
|
250
|
-
class String
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
243
|
+
class String
|
244
|
+
alias :old_index :index
|
245
|
+
def to_xs
|
246
|
+
unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
|
247
|
+
rescue
|
248
|
+
unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
|
249
|
+
end
|
256
250
|
end
|
257
251
|
end
|
258
|
-
|