rfeedparser 0.9.931 → 0.9.940
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rfeedparser.rb +143 -58
- data/lib/rfeedparser/aliases.rb +1 -1
- data/lib/rfeedparser/better_attributelist.rb +11 -11
- data/lib/rfeedparser/better_sgmlparser.rb +1 -1
- data/lib/rfeedparser/encoding_helpers.rb +120 -127
- data/lib/rfeedparser/feedparserdict.rb +30 -20
- data/lib/rfeedparser/forgiving_uri.rb +9 -7
- data/lib/rfeedparser/markup_helpers.rb +11 -14
- data/lib/rfeedparser/parser_mixin.rb +16 -11
- data/lib/rfeedparser/parsers.rb +1 -2
- data/lib/rfeedparser/scrub.rb +95 -90
- data/lib/rfeedparser/time_helpers.rb +379 -379
- data/lib/rfeedparser/utilities.rb +23 -0
- data/tests/rfeedparser_test_helper.rb +262 -0
- data/tests/rfeedparserserver.rb +3 -109
- data/tests/rfeedparsertest.rb +6 -165
- data/tests/rfponly/http/200.xml +30 -0
- data/tests/rfponly/http/220.xml +28 -0
- data/tests/rfponly/http/300.xml +8 -0
- data/tests/rfponly/http/300.xml_redirect +25 -0
- data/tests/rfponly/http/301.xml +8 -0
- data/tests/rfponly/http/301.xml_redirect +25 -0
- data/tests/rfponly/http/302.xml +8 -0
- data/tests/rfponly/http/302.xml_redirect +25 -0
- data/tests/rfponly/http/307.xml +8 -0
- data/tests/rfponly/http/307.xml_redirect +25 -0
- data/tests/rfponly/http/320.xml +8 -0
- data/tests/rfponly/http/320.xml_redirect +25 -0
- data/tests/rfponly/http/400.xml +7 -0
- data/tests/rfponly/http/404.xml +7 -0
- data/tests/rfponly/http/410.xml +7 -0
- data/tests/rfponly/http/420.xml +7 -0
- data/tests/rfponly/http/500.xml +7 -0
- data/tests/rfponly/http/520.xml +7 -0
- data/tests/rfponly/http/etag.xml +28 -0
- data/tests/rfponly/http/lastmodified.xml +29 -0
- data/tests/rfponly/wellformed/date/feed_modified_with_negative_numeric_timezone.xml +9 -0
- data/tests/rfponly/wellformed/date/feed_modified_with_positive_numeric_timezone.xml +9 -0
- data/tests/rfponly/wellformed/scrub/hpricot_self_closing_tag_workaround.xml +11 -0
- metadata +31 -3
data/lib/rfeedparser.rb
CHANGED
@@ -19,11 +19,14 @@ require 'rubygems'
|
|
19
19
|
require 'base64'
|
20
20
|
require 'iconv'
|
21
21
|
|
22
|
+
gem 'hpricot', "=0.6"
|
23
|
+
require 'hpricot'
|
22
24
|
gem 'character-encodings', ">=0.2.0"
|
23
25
|
gem 'htmltools', ">=1.10"
|
24
26
|
gem 'htmlentities', ">=4.0.0"
|
25
27
|
gem 'activesupport', ">=1.4.1"
|
26
28
|
gem 'rchardet', ">=1.0"
|
29
|
+
|
27
30
|
require 'xml/saxdriver' # calling expat through the xmlparser gem
|
28
31
|
|
29
32
|
require 'rchardet'
|
@@ -40,23 +43,21 @@ $debug = false
|
|
40
43
|
$compatible = true
|
41
44
|
|
42
45
|
$LOAD_PATH << File.expand_path(File.dirname(__FILE__))
|
46
|
+
require 'rfeedparser/utilities'
|
43
47
|
require 'rfeedparser/forgiving_uri'
|
44
|
-
require 'rfeedparser/aliases'
|
45
|
-
require 'rfeedparser/encoding_helpers'
|
46
48
|
require 'rfeedparser/better_sgmlparser'
|
47
49
|
require 'rfeedparser/better_attributelist'
|
48
|
-
require 'rfeedparser/scrub'
|
49
|
-
require 'rfeedparser/time_helpers'
|
50
50
|
require 'rfeedparser/feedparserdict'
|
51
51
|
require 'rfeedparser/parser_mixin'
|
52
52
|
require 'rfeedparser/parsers'
|
53
|
-
require 'rfeedparser/markup_helpers'
|
54
53
|
|
55
|
-
|
54
|
+
|
56
55
|
|
57
56
|
|
58
57
|
module FeedParser
|
59
|
-
|
58
|
+
extend FeedParserUtilities
|
59
|
+
|
60
|
+
Version = "0.9.940"
|
60
61
|
|
61
62
|
License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
|
62
63
|
|
@@ -81,18 +82,19 @@ module FeedParser
|
|
81
82
|
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
82
83
|
POSSIBILITY OF SUCH DAMAGE."""
|
83
84
|
|
84
|
-
|
85
|
-
|
85
|
+
Translator_From_Python_To_Ruby = "Jeff Hodges <http://somethingsimilar.com>"
|
86
|
+
Author = "Mark Pilgrim <http://diveintomark.org/>"
|
86
87
|
Contributors = [ "Jason Diamond <http://injektilo.org/>",
|
87
88
|
"John Beimler <http://john.beimler.org/>",
|
88
89
|
"Fazal Majid <http://www.majid.info/mylos/weblog/>",
|
89
90
|
"Aaron Swartz <http://aaronsw.com/>",
|
90
|
-
"Kevin Marks <http://epeus.blogspot.com/>"
|
91
|
+
"Kevin Marks <http://epeus.blogspot.com/>",
|
92
|
+
"Jesse Newland <http://jnewland.com/>"
|
91
93
|
]
|
92
94
|
# HTTP "User-Agent" header to send to servers when downloading feeds.
|
93
95
|
# If you are embedding feedparser in a larger application, you should
|
94
96
|
# change this to your application name and URL.
|
95
|
-
USER_AGENT = "
|
97
|
+
USER_AGENT = "rFeedParser/#{Version} +http://rfeedparser.rubyforge.org/"
|
96
98
|
|
97
99
|
# HTTP "Accept" header to send to servers when downloading feeds. If you don't
|
98
100
|
# want to send an Accept header, set this to None.
|
@@ -141,60 +143,139 @@ module FeedParser
|
|
141
143
|
'hotrss' => 'Hot RSS'
|
142
144
|
}
|
143
145
|
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
146
|
+
# Accepted in options: :agent, :modified, :etag, and :referrer
|
147
|
+
# Turn +url_file_stream_or_string+ into an IO-like object a feed can be read from.
#
# The argument is tried, in order, as:
# 1. an object that responds to +read+ (returned untouched);
# 2. the string '-' (returns $stdin);
# 3. an http/https/ftp URL, fetched through open-uri with conditional-GET,
#    authentication and content-negotiation headers;
# 4. the name of a local file;
# 5. literal feed content, wrapped in a StringIO.
#
# Accepted in options: :agent, :modified, :etag, and :referrer.
# options[:handlers] is defaulted to [] as a side effect.
#
# An OpenURI::HTTPError is not raised; the IO attached to the error is
# returned instead. Any other failure while fetching or opening falls through
# to the next strategy.
def open_resource(url_file_stream_or_string, options)
  options[:handlers] ||= []

  return url_file_stream_or_string if url_file_stream_or_string.respond_to?(:read)
  return $stdin if url_file_stream_or_string == '-'

  # open-uri freaks out if there are leading spaces. Work on a stripped copy
  # so the caller's string is never mutated (strip! would also fail on a
  # frozen string).
  source = url_file_stream_or_string.strip

  furi = ForgivingURI.parse(source)
  if furi && ['http', 'https', 'ftp'].include?(furi.scheme)
    auth = nil

    if furi.host && furi.password
      # Base64.encode64 wraps its output with a newline every 60 characters;
      # every newline has to go, not only the trailing one, or the
      # Authorization header is broken for long credentials.
      auth = Base64.encode64("#{furi.user}:#{furi.password}").delete("\n")
      furi.password = nil
      source = furi.to_s
    end

    req_headers = {}
    req_headers["User-Agent"] = options[:agent] || USER_AGENT
    req_headers["If-None-Match"] = options[:etag] if options[:etag]

    # :modified may be a date string, a Time, or an Array handled by py2rtime.
    modified = options[:modified]
    since = case modified
            when String then parse_date(modified)
            when Time   then modified
            when Array  then py2rtime(modified)
            end
    # No usable date (unsupported type, or a helper returned nil) simply means
    # no conditional header is sent.
    req_headers["If-Modified-Since"] = since.httpdate if since

    req_headers["Referer"] = options[:referrer] if options[:referrer]
    req_headers["Accept-encoding"] = 'gzip, deflate' # FIXME make tests
    req_headers["Authorization"] = "Basic #{auth}" if auth
    req_headers['Accept'] = ACCEPT_HEADER if ACCEPT_HEADER
    req_headers['A-IM'] = 'feed' # RFC 3229 support

    begin
      # Going through the parsed URI (open-uri's OpenRead#open) means the
      # caller-supplied string is never handed to Kernel#open.
      return URI.parse(source).open(req_headers)
    rescue OpenURI::HTTPError => e
      return e.io
    rescue
      # Deliberate best effort: fall through to the file/string strategies.
    end
  end

  # Try to open as a local file name. File.open is used rather than
  # Kernel#open, which would run a subprocess for input starting with "|".
  begin
    return File.open(source)
  rescue
    # Not a readable file: treat the input as the feed content itself.
  end

  StringIO.new(source.to_s)
end
|
207
|
+
module_function(:open_resource)
|
208
|
+
|
209
|
+
# Parse a feed from a URL, file, stream or string
|
210
|
+
def parse(url_file_stream_or_string, options = {})
|
211
|
+
|
212
|
+
|
213
|
+
# Use the default compatibility if compatible is nil
|
214
|
+
$compatible = options[:compatible].nil? ? $compatible : options[:compatible]
|
215
|
+
|
148
216
|
strictklass = options[:strict] || StrictFeedParser
|
149
217
|
looseklass = options[:loose] || LooseFeedParser
|
218
|
+
options[:handlers] = options[:handlers] || []
|
219
|
+
|
150
220
|
result = FeedParserDict.new
|
151
221
|
result['feed'] = FeedParserDict.new
|
152
222
|
result['entries'] = []
|
153
|
-
|
154
|
-
options[:modified] = Time.parse(options[:modified]).utc.rfc2822
|
155
|
-
# FIXME this ignores all of our time parsing work. Does it matter?
|
156
|
-
end
|
223
|
+
|
157
224
|
result['bozo'] = false
|
158
|
-
|
159
|
-
if handlers.class != Array # FIXME why does this happen?
|
160
|
-
handlers = [handlers]
|
161
|
-
end
|
162
|
-
|
225
|
+
|
163
226
|
begin
|
164
|
-
|
165
|
-
if [nil, "file"].include? parsed_furi.scheme
|
166
|
-
$stderr << "Opening local file #{furi}\n" if $debug
|
167
|
-
f = open(parsed_furi.path) # OpenURI doesn't behave well when passing HTTP options to a file.
|
168
|
-
else
|
169
|
-
# And when you do pass them, make sure they aren't just nil (this still true?)
|
170
|
-
newd = {}
|
171
|
-
newd["If-None-Match"] = options[:etag] unless options[:etag].nil?
|
172
|
-
newd["If-Modified-Since"] = options[:modified] unless options[:modified].nil?
|
173
|
-
newd["User-Agent"] = (options[:agent] || USER_AGENT).to_s
|
174
|
-
newd["Referer"] = options[:referrer] unless options[:referrer].nil?
|
175
|
-
newd["Content-Location"] = options[:content_location] unless options[:content_location].nil?
|
176
|
-
newd["Content-Language"] = options[:content_language] unless options[:content_language].nil?
|
177
|
-
newd["Content-type"] = options[:content_type] unless options[:content_type].nil?
|
178
|
-
|
179
|
-
f = open(furi, newd)
|
180
|
-
end
|
181
|
-
|
227
|
+
f = open_resource(url_file_stream_or_string, options)
|
182
228
|
data = f.read
|
183
|
-
f.close
|
184
229
|
rescue => e
|
185
|
-
$stderr << "Rescued in parse: "+e.to_s+"\n" if $debug # My addition
|
186
230
|
result['bozo'] = true
|
187
231
|
result['bozo_exception'] = e
|
188
232
|
data = ''
|
189
233
|
f = nil
|
190
234
|
end
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
235
|
+
|
236
|
+
if f and !data.blank? and f.respond_to?(:meta)
|
237
|
+
# if feed is gzip-compressed, decompress it
|
238
|
+
if f.meta['content-encoding'] == 'gzip'
|
239
|
+
begin
|
240
|
+
gz = Zlib::GzipReader.new(StringIO.new(data))
|
241
|
+
data = gz.read
|
242
|
+
gz.close
|
243
|
+
rescue => e
|
244
|
+
# Some feeds claim to be gzipped but they're not, so
|
245
|
+
# we get garbage. Ideally, we should re-request the
|
246
|
+
# feed without the 'Accept-encoding: gzip' header,
|
247
|
+
# but we don't.
|
248
|
+
result['bozo'] = true
|
249
|
+
result['bozo_exception'] = e
|
250
|
+
data = ''
|
251
|
+
end
|
252
|
+
elsif f.meta['content-encoding'] == 'deflate'
|
253
|
+
begin
|
254
|
+
data = Zlib::Deflate.inflate(data)
|
255
|
+
rescue => e
|
256
|
+
result['bozo'] = true
|
257
|
+
result['bozo_exception'] = e
|
258
|
+
data = ''
|
259
|
+
end
|
197
260
|
end
|
261
|
+
end
|
262
|
+
|
263
|
+
if f.respond_to?(:meta)
|
264
|
+
result['etag'] = f.meta['etag']
|
265
|
+
result['modified_time'] = parse_date(f.meta['last-modified'])
|
266
|
+
result['modified'] = extract_tuple(result['modified_time'])
|
267
|
+
result['headers'] = f.meta
|
268
|
+
end
|
269
|
+
|
270
|
+
# FIXME open-uri does not return a non-nil base_uri in its HTTPErrors.
|
271
|
+
if f.respond_to?(:base_uri)
|
272
|
+
result['href'] = f.base_uri.to_s # URI => String
|
273
|
+
result['status'] = '200'
|
274
|
+
end
|
275
|
+
|
276
|
+
if f.respond_to?(:status)
|
277
|
+
result['status'] = f.status[0]
|
278
|
+
end
|
198
279
|
|
199
280
|
|
200
281
|
# there are four encodings to keep track of:
|
@@ -204,7 +285,7 @@ module FeedParser
|
|
204
285
|
# - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
|
205
286
|
http_headers = result['headers'] || {}
|
206
287
|
result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type =
|
207
|
-
|
288
|
+
getCharacterEncoding(http_headers,data)
|
208
289
|
|
209
290
|
if not http_headers.blank? and not acceptable_content_type
|
210
291
|
unless http_headers['content-type'].nil?
|
@@ -215,7 +296,7 @@ module FeedParser
|
|
215
296
|
result['bozo'] = true
|
216
297
|
result['bozo_exception'] = NonXMLContentType.new(bozo_message) # I get to care about this, cuz Mark says I should.
|
217
298
|
end
|
218
|
-
result['version'], data =
|
299
|
+
result['version'], data = stripDoctype(data)
|
219
300
|
|
220
301
|
baseuri = http_headers['content-location'] || result['href']
|
221
302
|
baselang = http_headers['content-language']
|
@@ -244,7 +325,7 @@ module FeedParser
|
|
244
325
|
next if tried_encodings.include? proposed_encoding
|
245
326
|
tried_encodings << proposed_encoding
|
246
327
|
begin
|
247
|
-
data =
|
328
|
+
data = toUTF8(data, proposed_encoding)
|
248
329
|
known_encoding = use_strict_parser = true
|
249
330
|
break
|
250
331
|
rescue
|
@@ -256,7 +337,7 @@ module FeedParser
|
|
256
337
|
proposed_encoding = CharDet.detect(data)['encoding']
|
257
338
|
if proposed_encoding and not tried_encodings.include?proposed_encoding
|
258
339
|
tried_encodings << proposed_encoding
|
259
|
-
data =
|
340
|
+
data = toUTF8(data, proposed_encoding)
|
260
341
|
known_encoding = use_strict_parser = true
|
261
342
|
end
|
262
343
|
rescue
|
@@ -270,7 +351,7 @@ module FeedParser
|
|
270
351
|
begin
|
271
352
|
proposed_encoding = 'utf-8'
|
272
353
|
tried_encodings << proposed_encoding
|
273
|
-
data =
|
354
|
+
data = toUTF8(data, proposed_encoding)
|
274
355
|
known_encoding = use_strict_parser = true
|
275
356
|
rescue
|
276
357
|
end
|
@@ -280,7 +361,7 @@ module FeedParser
|
|
280
361
|
begin
|
281
362
|
proposed_encoding = 'windows-1252'
|
282
363
|
tried_encodings << proposed_encoding
|
283
|
-
data =
|
364
|
+
data = toUTF8(data, proposed_encoding)
|
284
365
|
known_encoding = use_strict_parser = true
|
285
366
|
rescue
|
286
367
|
end
|
@@ -292,7 +373,7 @@ module FeedParser
|
|
292
373
|
# begin
|
293
374
|
# proposed_encoding = 'iso-8859-2'
|
294
375
|
# tried_encodings << proposed_encoding
|
295
|
-
# data =
|
376
|
+
# data = toUTF8(data, proposed_encoding)
|
296
377
|
# known_encoding = use_strict_parser = true
|
297
378
|
# rescue
|
298
379
|
# end
|
@@ -334,9 +415,9 @@ module FeedParser
|
|
334
415
|
end
|
335
416
|
end
|
336
417
|
if not use_strict_parser
|
418
|
+
$stderr << "Using LooseFeed\n\n" if $debug
|
337
419
|
feedparser = looseklass.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
|
338
420
|
feedparser.parse(data)
|
339
|
-
$stderr << "Using LooseFeed\n\n" if $debug
|
340
421
|
end
|
341
422
|
result['feed'] = feedparser.feeddata
|
342
423
|
result['entries'] = feedparser.entries
|
@@ -347,6 +428,10 @@ module FeedParser
|
|
347
428
|
module_function(:parse)
|
348
429
|
end # End FeedParser module
|
349
430
|
|
431
|
+
# Top-level shorthand for FeedParser.parse: forwards both arguments unchanged
# and hands back whatever FeedParser.parse returns.
def rfp(url_file_stream_or_string, options = {})
  return FeedParser.parse(url_file_stream_or_string, options)
end
|
434
|
+
|
350
435
|
class Serializer
|
351
436
|
def initialize(results)
|
352
437
|
@results = results
|
data/lib/rfeedparser/better_attributelist.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
# Add some helper methods to make AttributeList (all of those damn attrs
|
4
4
|
# and attrsD used by StrictFeedParser) act more like a Hash.
|
@@ -8,31 +8,31 @@ module XML
|
|
8
8
|
module SAX
  # Helper methods that let an AttributeList (in xml/sax.rb) be used more
  # like a Hash. Everything is built on the including object's getLength,
  # getName(pos) and getValue(pos_or_name).
  module AttributeList
    # Value lookup; +key+ is passed straight to getValue.
    def [](key)
      getValue(key)
    end

    # Yields a [name, value] pair for every attribute, in position order.
    def each
      (0...getLength).each { |pos| yield [getName(pos), getValue(pos)] }
    end

    # Yields every attribute name, in position order.
    def each_key
      (0...getLength).each { |pos| yield getName(pos) }
    end

    # Yields every attribute value, in position order.
    def each_value
      (0...getLength).each { |pos| yield getValue(pos) }
    end

    # Array of [name, value] pairs, in position order.
    def to_a
      (0...getLength).map { |pos| [getName(pos), getValue(pos)] }
    end

    # Hash-like rendering, e.g. "{ href => x, rel => y }".
    def to_s
      rendered = to_a.map { |name, value| "#{name} => #{value}" }
      "{ " + rendered.join(", ") + " }"
    end
  end
end
|
data/lib/rfeedparser/encoding_helpers.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#!/usr/bin/ruby
|
1
|
+
#!/usr/bin/env ruby
|
2
2
|
|
3
3
|
module FeedParserUtilities
|
4
4
|
|
@@ -26,73 +26,68 @@ module FeedParserUtilities
|
|
26
26
|
# Runs +s+ through Iconv, converting from "ebcdic-cp-be" to "iso88591",
# and returns the first element of Iconv's result array.
def _ebcdic_to_ascii(s)
  converted = Iconv.iconv("iso88591", "ebcdic-cp-be", s)
  converted[0]
end
|
29
|
-
|
30
|
-
def getCharacterEncoding(
|
29
|
+
|
30
|
+
def getCharacterEncoding(http_headers, xml_data)
|
31
31
|
# Get the character encoding of the XML document
|
32
32
|
$stderr << "In getCharacterEncoding\n" if $debug
|
33
33
|
sniffed_xml_encoding = nil
|
34
34
|
xml_encoding = nil
|
35
35
|
true_encoding = nil
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
http_encoding = nil if http_encoding.empty?
|
36
|
+
|
37
|
+
http_content_type, charset = http_headers['content-type'].to_s.split(';',2)
|
38
|
+
encoding_regexp = /\s*charset\s*=\s*(?:"|')?(.*?)(?:"|')?\s*$/
|
39
|
+
http_encoding = charset.to_s.scan(encoding_regexp).flatten[0]
|
40
|
+
http_encoding = nil if http_encoding.blank?
|
42
41
|
# FIXME Open-Uri returns iso8859-1 if there is no charset header,
|
43
42
|
# but that doesn't pass the tests. Open-Uri claims its following
|
44
43
|
# the right RFC. Are they wrong or do we need to change the tests?
|
45
|
-
|
46
|
-
http_headers = {}
|
47
|
-
http_content_type = nil
|
48
|
-
http_encoding = nil
|
49
|
-
end
|
44
|
+
|
50
45
|
# Must sniff for non-ASCII-compatible character encodings before
|
51
46
|
# searching for XML declaration. This heuristic is defined in
|
52
47
|
# section F of the XML specification:
|
53
48
|
# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
|
54
49
|
begin
|
55
50
|
if xml_data[0..3] == "\x4c\x6f\xa7\x94"
|
56
|
-
|
57
|
-
|
51
|
+
# EBCDIC
|
52
|
+
xml_data = __ebcdic_to_ascii(xml_data)
|
58
53
|
elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
|
59
|
-
|
60
|
-
|
61
|
-
|
54
|
+
# UTF-16BE
|
55
|
+
sniffed_xml_encoding = 'utf-16be'
|
56
|
+
xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
|
62
57
|
elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
|
63
|
-
|
64
|
-
|
65
|
-
|
58
|
+
# UTF-16BE with BOM
|
59
|
+
sniffed_xml_encoding = 'utf-16be'
|
60
|
+
xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
|
66
61
|
elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
|
67
|
-
|
68
|
-
|
69
|
-
|
62
|
+
# UTF-16LE
|
63
|
+
sniffed_xml_encoding = 'utf-16le'
|
64
|
+
xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
|
70
65
|
elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
|
71
|
-
|
72
|
-
|
73
|
-
|
66
|
+
# UTF-16LE with BOM
|
67
|
+
sniffed_xml_encoding = 'utf-16le'
|
68
|
+
xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
|
74
69
|
elsif xml_data[0..3] == "\x00\x00\x00\x3c"
|
75
|
-
|
76
|
-
|
77
|
-
|
70
|
+
# UTF-32BE
|
71
|
+
sniffed_xml_encoding = 'utf-32be'
|
72
|
+
xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
|
78
73
|
elsif xml_data[0..3] == "\x3c\x00\x00\x00"
|
79
|
-
|
80
|
-
|
81
|
-
|
74
|
+
# UTF-32LE
|
75
|
+
sniffed_xml_encoding = 'utf-32le'
|
76
|
+
xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
|
82
77
|
elsif xml_data[0..3] == "\x00\x00\xfe\xff"
|
83
|
-
|
84
|
-
|
85
|
-
|
78
|
+
# UTF-32BE with BOM
|
79
|
+
sniffed_xml_encoding = 'utf-32be'
|
80
|
+
xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
|
86
81
|
elsif xml_data[0..3] == "\xff\xfe\x00\x00"
|
87
|
-
|
88
|
-
|
89
|
-
|
82
|
+
# UTF-32LE with BOM
|
83
|
+
sniffed_xml_encoding = 'utf-32le'
|
84
|
+
xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
|
90
85
|
elsif xml_data[0..2] == "\xef\xbb\xbf"
|
91
|
-
|
92
|
-
|
93
|
-
|
86
|
+
# UTF-8 with BOM
|
87
|
+
sniffed_xml_encoding = 'utf-8'
|
88
|
+
xml_data = xml_data[3..-1]
|
94
89
|
else
|
95
|
-
|
90
|
+
# ASCII-compatible
|
96
91
|
end
|
97
92
|
xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
|
98
93
|
rescue
|
@@ -102,7 +97,7 @@ module FeedParserUtilities
|
|
102
97
|
xml_encoding = xml_encoding_match[1].downcase
|
103
98
|
xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
|
104
99
|
if sniffed_xml_encoding and xencodings.include?xml_encoding
|
105
|
-
|
100
|
+
xml_encoding = sniffed_xml_encoding
|
106
101
|
end
|
107
102
|
end
|
108
103
|
|
@@ -125,54 +120,48 @@ module FeedParserUtilities
|
|
125
120
|
end
|
126
121
|
return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
|
127
122
|
end
|
128
|
-
|
123
|
+
|
129
124
|
def toUTF8(data, encoding)
|
130
|
-
=begin
|
131
|
-
Changes an XML data stream on the fly to specify a new encoding
|
132
|
-
|
133
|
-
data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
|
134
|
-
encoding is a string recognized by encodings.aliases
|
135
|
-
=end
|
136
125
|
$stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
|
137
126
|
# NOTE we must use double quotes when dealing with \x encodings!
|
138
127
|
if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
|
139
128
|
if $debug
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
129
|
+
$stderr << "stripping BOM\n"
|
130
|
+
if encoding != 'utf-16be'
|
131
|
+
$stderr << "string utf-16be instead\n"
|
132
|
+
end
|
144
133
|
end
|
145
134
|
encoding = 'utf-16be'
|
146
135
|
data = data[2..-1]
|
147
136
|
elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
|
148
137
|
if $debug
|
149
|
-
|
150
|
-
|
138
|
+
$stderr << "stripping BOM\n"
|
139
|
+
$stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
|
151
140
|
end
|
152
141
|
encoding = 'utf-16le'
|
153
142
|
data = data[2..-1]
|
154
143
|
elsif (data[0..2] == "\xef\xbb\xbf")
|
155
144
|
if $debug
|
156
|
-
|
157
|
-
|
145
|
+
$stderr << "stripping BOM\n"
|
146
|
+
$stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
|
158
147
|
end
|
159
148
|
encoding = 'utf-8'
|
160
149
|
data = data[3..-1]
|
161
150
|
elsif (data[0..3] == "\x00\x00\xfe\xff")
|
162
151
|
if $debug
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
152
|
+
$stderr << "stripping BOM\n"
|
153
|
+
if encoding != 'utf-32be'
|
154
|
+
$stderr << "trying utf-32be instead\n"
|
155
|
+
end
|
167
156
|
end
|
168
157
|
encoding = 'utf-32be'
|
169
158
|
data = data[4..-1]
|
170
159
|
elsif (data[0..3] == "\xff\xfe\x00\x00")
|
171
160
|
if $debug
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
161
|
+
$stderr << "stripping BOM\n"
|
162
|
+
if encoding != 'utf-32le'
|
163
|
+
$stderr << "trying utf-32le instead\n"
|
164
|
+
end
|
176
165
|
end
|
177
166
|
encoding = 'utf-32le'
|
178
167
|
data = data[4..-1]
|
@@ -184,75 +173,79 @@ module FeedParserUtilities
|
|
184
173
|
end
|
185
174
|
$stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
|
186
175
|
declmatch = /^<\?xml[^>]*?>/
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
176
|
+
newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
|
177
|
+
if declmatch =~ newdata
|
178
|
+
newdata.sub!(declmatch, newdecl)
|
179
|
+
else
|
180
|
+
newdata = newdecl + "\n" + newdata
|
181
|
+
end
|
193
182
|
return newdata
|
194
183
|
end
|
195
|
-
|
184
|
+
|
196
185
|
end
|
197
186
|
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
187
|
+
# XML character escaping helpers. Skipped entirely when Builder has already
# provided its own XChar implementation.
unless defined?(Builder::XChar)
  # http://intertwingly.net/stories/2005/09/28/xchar.rb
  module XChar
    # Windows-1252 bytes in the 128..159 range mapped to the Unicode code
    # points they stand for.
    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
    CP1252 = {
      128 => 8364, # euro sign
      130 => 8218, # single low-9 quotation mark
      131 => 402,  # latin small letter f with hook
      132 => 8222, # double low-9 quotation mark
      133 => 8230, # horizontal ellipsis
      134 => 8224, # dagger
      135 => 8225, # double dagger
      136 => 710,  # modifier letter circumflex accent
      137 => 8240, # per mille sign
      138 => 352,  # latin capital letter s with caron
      139 => 8249, # single left-pointing angle quotation mark
      140 => 338,  # latin capital ligature oe
      142 => 381,  # latin capital letter z with caron
      145 => 8216, # left single quotation mark
      146 => 8217, # right single quotation mark
      147 => 8220, # left double quotation mark
      148 => 8221, # right double quotation mark
      149 => 8226, # bullet
      150 => 8211, # en dash
      151 => 8212, # em dash
      152 => 732,  # small tilde
      153 => 8482, # trade mark sign
      154 => 353,  # latin small letter s with caron
      155 => 8250, # single right-pointing angle quotation mark
      156 => 339,  # latin small ligature oe
      158 => 382,  # latin small letter z with caron
      159 => 376   # latin capital letter y with diaeresis
    }

    # Characters that must be written as entities in XML character data.
    # The values have to be the entity text; mapping each character to itself
    # would make the table a no-op and leave markup characters unescaped.
    # http://www.w3.org/TR/REC-xml/#dt-chardata
    PREDEFINED = {
      38 => '&amp;', # ampersand
      60 => '&lt;',  # left angle bracket
      62 => '&gt;'   # right angle bracket
    }

    # Code points allowed in an XML document. The list is flat on purpose:
    # it is splatted into a `when`, and a nested Array would be compared by
    # equality and never match tab, line feed or carriage return.
    # http://www.w3.org/TR/REC-xml/#charsets
    VALID = [0x9, 0xA, 0xD, (0x20..0xD7FF),
             (0xE000..0xFFFD), (0x10000..0x10FFFF)]
  end

  # Defined on Integer rather than Fixnum: Integer is Fixnum's superclass on
  # old Rubies and the only integer class on Rubies where Fixnum is gone.
  class Integer
    # XML-escaped version of chr: remaps CP1252 code points, escapes the
    # predefined entities, writes other non-ASCII code points as numeric
    # character references, and returns '*' for code points XML forbids.
    def xchr
      n = XChar::CP1252[self] || self
      case n
      when *XChar::VALID
        XChar::PREDEFINED[n] || (n < 128 ? n.chr : "&##{n};")
      else
        '*'
      end
    end
  end

  class String
    alias :old_index :index

    # XML-escaped copy of the string. The bytes are read as UTF-8 first; if
    # that raises (malformed UTF-8), each byte is escaped on its own.
    def to_xs
      unpack('U*').map { |n| n.xchr }.join # ASCII, UTF-8
    rescue
      unpack('C*').map { |n| n.xchr }.join # ISO-8859-1, WIN-1252
    end
  end
end
|
258
|
-
|