rfeedparser 0.9.931 → 0.9.940

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40)
  1. data/lib/rfeedparser.rb +143 -58
  2. data/lib/rfeedparser/aliases.rb +1 -1
  3. data/lib/rfeedparser/better_attributelist.rb +11 -11
  4. data/lib/rfeedparser/better_sgmlparser.rb +1 -1
  5. data/lib/rfeedparser/encoding_helpers.rb +120 -127
  6. data/lib/rfeedparser/feedparserdict.rb +30 -20
  7. data/lib/rfeedparser/forgiving_uri.rb +9 -7
  8. data/lib/rfeedparser/markup_helpers.rb +11 -14
  9. data/lib/rfeedparser/parser_mixin.rb +16 -11
  10. data/lib/rfeedparser/parsers.rb +1 -2
  11. data/lib/rfeedparser/scrub.rb +95 -90
  12. data/lib/rfeedparser/time_helpers.rb +379 -379
  13. data/lib/rfeedparser/utilities.rb +23 -0
  14. data/tests/rfeedparser_test_helper.rb +262 -0
  15. data/tests/rfeedparserserver.rb +3 -109
  16. data/tests/rfeedparsertest.rb +6 -165
  17. data/tests/rfponly/http/200.xml +30 -0
  18. data/tests/rfponly/http/220.xml +28 -0
  19. data/tests/rfponly/http/300.xml +8 -0
  20. data/tests/rfponly/http/300.xml_redirect +25 -0
  21. data/tests/rfponly/http/301.xml +8 -0
  22. data/tests/rfponly/http/301.xml_redirect +25 -0
  23. data/tests/rfponly/http/302.xml +8 -0
  24. data/tests/rfponly/http/302.xml_redirect +25 -0
  25. data/tests/rfponly/http/307.xml +8 -0
  26. data/tests/rfponly/http/307.xml_redirect +25 -0
  27. data/tests/rfponly/http/320.xml +8 -0
  28. data/tests/rfponly/http/320.xml_redirect +25 -0
  29. data/tests/rfponly/http/400.xml +7 -0
  30. data/tests/rfponly/http/404.xml +7 -0
  31. data/tests/rfponly/http/410.xml +7 -0
  32. data/tests/rfponly/http/420.xml +7 -0
  33. data/tests/rfponly/http/500.xml +7 -0
  34. data/tests/rfponly/http/520.xml +7 -0
  35. data/tests/rfponly/http/etag.xml +28 -0
  36. data/tests/rfponly/http/lastmodified.xml +29 -0
  37. data/tests/rfponly/wellformed/date/feed_modified_with_negative_numeric_timezone.xml +9 -0
  38. data/tests/rfponly/wellformed/date/feed_modified_with_positive_numeric_timezone.xml +9 -0
  39. data/tests/rfponly/wellformed/scrub/hpricot_self_closing_tag_workaround.xml +11 -0
  40. metadata +31 -3
@@ -19,11 +19,14 @@ require 'rubygems'
19
19
  require 'base64'
20
20
  require 'iconv'
21
21
 
22
+ gem 'hpricot', "=0.6"
23
+ require 'hpricot'
22
24
  gem 'character-encodings', ">=0.2.0"
23
25
  gem 'htmltools', ">=1.10"
24
26
  gem 'htmlentities', ">=4.0.0"
25
27
  gem 'activesupport', ">=1.4.1"
26
28
  gem 'rchardet', ">=1.0"
29
+
27
30
  require 'xml/saxdriver' # calling expat through the xmlparser gem
28
31
 
29
32
  require 'rchardet'
@@ -40,23 +43,21 @@ $debug = false
40
43
  $compatible = true
41
44
 
42
45
  $LOAD_PATH << File.expand_path(File.dirname(__FILE__))
46
+ require 'rfeedparser/utilities'
43
47
  require 'rfeedparser/forgiving_uri'
44
- require 'rfeedparser/aliases'
45
- require 'rfeedparser/encoding_helpers'
46
48
  require 'rfeedparser/better_sgmlparser'
47
49
  require 'rfeedparser/better_attributelist'
48
- require 'rfeedparser/scrub'
49
- require 'rfeedparser/time_helpers'
50
50
  require 'rfeedparser/feedparserdict'
51
51
  require 'rfeedparser/parser_mixin'
52
52
  require 'rfeedparser/parsers'
53
- require 'rfeedparser/markup_helpers'
54
53
 
55
- include FeedParserUtilities
54
+
56
55
 
57
56
 
58
57
  module FeedParser
59
- Version = "0.9.931"
58
+ extend FeedParserUtilities
59
+
60
+ Version = "0.9.940"
60
61
 
61
62
  License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
62
63
 
@@ -81,18 +82,19 @@ module FeedParser
81
82
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
82
83
  POSSIBILITY OF SUCH DAMAGE."""
83
84
 
84
- Author = "Jeff Hodges <http://somethingsimilar.com>"
85
- Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
85
+ Translator_From_Python_To_Ruby = "Jeff Hodges <http://somethingsimilar.com>"
86
+ Author = "Mark Pilgrim <http://diveintomark.org/>"
86
87
  Contributors = [ "Jason Diamond <http://injektilo.org/>",
87
88
  "John Beimler <http://john.beimler.org/>",
88
89
  "Fazal Majid <http://www.majid.info/mylos/weblog/>",
89
90
  "Aaron Swartz <http://aaronsw.com/>",
90
- "Kevin Marks <http://epeus.blogspot.com/>"
91
+ "Kevin Marks <http://epeus.blogspot.com/>",
92
+ "Jesse Newland <http://jnewland.com/>"
91
93
  ]
92
94
  # HTTP "User-Agent" header to send to servers when downloading feeds.
93
95
  # If you are embedding feedparser in a larger application, you should
94
96
  # change this to your application name and URL.
95
- USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % @version
97
+ USER_AGENT = "rFeedParser/#{Version} +http://rfeedparser.rubyforge.org/"
96
98
 
97
99
  # HTTP "Accept" header to send to servers when downloading feeds. If you don't
98
100
  # want to send an Accept header, set this to None.
@@ -141,60 +143,139 @@ module FeedParser
141
143
  'hotrss' => 'Hot RSS'
142
144
  }
143
145
 
144
- def parse(furi, options = {})
145
- furi.strip!
146
- # Parse a feed from a URL, file, stream or string
147
- $compatible = options[:compatible].nil? ? $compatible : options[:compatible]# Use the default compatibility if compatible is nil
146
+ # Accepted in options: :agent, :modified, :etag, and :referrer
147
+ def open_resource(url_file_stream_or_string, options)
148
+ options[:handlers] ||= []
149
+
150
+ if url_file_stream_or_string.respond_to?(:read)
151
+ return url_file_stream_or_string
152
+
153
+ elsif url_file_stream_or_string == '-'
154
+ return $stdin
155
+ end
156
+
157
+ # open-uri freaks out if there's leading spaces.
158
+ url_file_stream_or_string.strip!
159
+
160
+
161
+ furi = ForgivingURI.parse(url_file_stream_or_string)
162
+ if furi && ['http','https','ftp'].include?(furi.scheme)
163
+ auth = nil
164
+
165
+ if furi.host && furi.password
166
+ auth = Base64::encode64("#{furi.user}:#{furi.password}").strip
167
+ furi.password = nil
168
+ url_file_stream_or_string = furi.to_s
169
+ end
170
+
171
+ req_headers = {}
172
+ req_headers["User-Agent"] = options[:agent] || USER_AGENT
173
+ req_headers["If-None-Match"] = options[:etag] if options[:etag]
174
+
175
+ if options[:modified]
176
+ if options[:modified].is_a?(String)
177
+ req_headers["If-Modified-Since"] = parse_date(options[:modified]).httpdate
178
+ elsif options[:modified].is_a?(Time)
179
+ req_headers["If-Modified-Since"] = options[:modified].httpdate
180
+ elsif options[:modified].is_a?(Array)
181
+ req_headers["If-Modified-Since"] = py2rtime(options[:modified]).httpdate
182
+ end
183
+ end
184
+
185
+ req_headers["Referer"] = options[:referrer] if options[:referrer]
186
+ req_headers["Accept-encoding"] = 'gzip, deflate' # FIXME make tests
187
+ req_headers["Authorization"] = "Basic #{auth}" if auth
188
+ req_headers['Accept'] = ACCEPT_HEADER if ACCEPT_HEADER
189
+ req_headers['A-IM'] = 'feed' # RFC 3229 support
190
+
191
+ begin
192
+ return open(url_file_stream_or_string, req_headers)
193
+ rescue OpenURI::HTTPError => e
194
+ return e.io
195
+ rescue
196
+ end
197
+ end
198
+
199
+ # try to open with native open function (if url_file_stream_or_string is a filename)
200
+ begin
201
+ return open(url_file_stream_or_string)
202
+ rescue
203
+ end
204
+ # treat url_file_stream_or_string as string
205
+ return StringIO.new(url_file_stream_or_string.to_s)
206
+ end
207
+ module_function(:open_resource)
208
+
209
+ # Parse a feed from a URL, file, stream or string
210
+ def parse(url_file_stream_or_string, options = {})
211
+
212
+
213
+ # Use the default compatibility if compatible is nil
214
+ $compatible = options[:compatible].nil? ? $compatible : options[:compatible]
215
+
148
216
  strictklass = options[:strict] || StrictFeedParser
149
217
  looseklass = options[:loose] || LooseFeedParser
218
+ options[:handlers] = options[:handlers] || []
219
+
150
220
  result = FeedParserDict.new
151
221
  result['feed'] = FeedParserDict.new
152
222
  result['entries'] = []
153
- if options[:modified]
154
- options[:modified] = Time.parse(options[:modified]).utc.rfc2822
155
- # FIXME this ignores all of our time parsing work. Does it matter?
156
- end
223
+
157
224
  result['bozo'] = false
158
- handlers = options[:handlers]
159
- if handlers.class != Array # FIXME why does this happen?
160
- handlers = [handlers]
161
- end
162
-
225
+
163
226
  begin
164
- parsed_furi = ForgivingURI.parse(furi)
165
- if [nil, "file"].include? parsed_furi.scheme
166
- $stderr << "Opening local file #{furi}\n" if $debug
167
- f = open(parsed_furi.path) # OpenURI doesn't behave well when passing HTTP options to a file.
168
- else
169
- # And when you do pass them, make sure they aren't just nil (this still true?)
170
- newd = {}
171
- newd["If-None-Match"] = options[:etag] unless options[:etag].nil?
172
- newd["If-Modified-Since"] = options[:modified] unless options[:modified].nil?
173
- newd["User-Agent"] = (options[:agent] || USER_AGENT).to_s
174
- newd["Referer"] = options[:referrer] unless options[:referrer].nil?
175
- newd["Content-Location"] = options[:content_location] unless options[:content_location].nil?
176
- newd["Content-Language"] = options[:content_language] unless options[:content_language].nil?
177
- newd["Content-type"] = options[:content_type] unless options[:content_type].nil?
178
-
179
- f = open(furi, newd)
180
- end
181
-
227
+ f = open_resource(url_file_stream_or_string, options)
182
228
  data = f.read
183
- f.close
184
229
  rescue => e
185
- $stderr << "Rescued in parse: "+e.to_s+"\n" if $debug # My addition
186
230
  result['bozo'] = true
187
231
  result['bozo_exception'] = e
188
232
  data = ''
189
233
  f = nil
190
234
  end
191
- if f.respond_to?(:meta)
192
- result['etag'] = f.meta['etag']
193
- result['modified'] = f.meta['modified']
194
- result['url'] = f.base_uri.to_s
195
- result['status'] = f.status[0] || 200
196
- result['headers'] = f.meta
235
+
236
+ if f and !data.blank? and f.respond_to?(:meta)
237
+ # if feed is gzip-compressed, decompress it
238
+ if f.meta['content-encoding'] == 'gzip'
239
+ begin
240
+ gz = Zlib::GzipReader.new(StringIO.new(data))
241
+ data = gz.read
242
+ gz.close
243
+ rescue => e
244
+ # Some feeds claim to be gzipped but they're not, so
245
+ # we get garbage. Ideally, we should re-request the
246
+ # feed without the 'Accept-encoding: gzip' header,
247
+ # but we don't.
248
+ result['bozo'] = true
249
+ result['bozo_exception'] = e
250
+ data = ''
251
+ end
252
+ elsif f.meta['content-encoding'] == 'deflate'
253
+ begin
254
+ data = Zlib::Deflate.inflate(data)
255
+ rescue => e
256
+ result['bozo'] = true
257
+ result['bozo_exception'] = e
258
+ data = ''
259
+ end
197
260
  end
261
+ end
262
+
263
+ if f.respond_to?(:meta)
264
+ result['etag'] = f.meta['etag']
265
+ result['modified_time'] = parse_date(f.meta['last-modified'])
266
+ result['modified'] = extract_tuple(result['modified_time'])
267
+ result['headers'] = f.meta
268
+ end
269
+
270
+ # FIXME open-uri does not return a non-nil base_uri in its HTTPErrors.
271
+ if f.respond_to?(:base_uri)
272
+ result['href'] = f.base_uri.to_s # URI => String
273
+ result['status'] = '200'
274
+ end
275
+
276
+ if f.respond_to?(:status)
277
+ result['status'] = f.status[0]
278
+ end
198
279
 
199
280
 
200
281
  # there are four encodings to keep track of:
@@ -204,7 +285,7 @@ module FeedParser
204
285
  # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
205
286
  http_headers = result['headers'] || {}
206
287
  result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type =
207
- self.getCharacterEncoding(f,data)
288
+ getCharacterEncoding(http_headers,data)
208
289
 
209
290
  if not http_headers.blank? and not acceptable_content_type
210
291
  unless http_headers['content-type'].nil?
@@ -215,7 +296,7 @@ module FeedParser
215
296
  result['bozo'] = true
216
297
  result['bozo_exception'] = NonXMLContentType.new(bozo_message) # I get to care about this, cuz Mark says I should.
217
298
  end
218
- result['version'], data = self.stripDoctype(data)
299
+ result['version'], data = stripDoctype(data)
219
300
 
220
301
  baseuri = http_headers['content-location'] || result['href']
221
302
  baselang = http_headers['content-language']
@@ -244,7 +325,7 @@ module FeedParser
244
325
  next if tried_encodings.include? proposed_encoding
245
326
  tried_encodings << proposed_encoding
246
327
  begin
247
- data = self.toUTF8(data, proposed_encoding)
328
+ data = toUTF8(data, proposed_encoding)
248
329
  known_encoding = use_strict_parser = true
249
330
  break
250
331
  rescue
@@ -256,7 +337,7 @@ module FeedParser
256
337
  proposed_encoding = CharDet.detect(data)['encoding']
257
338
  if proposed_encoding and not tried_encodings.include?proposed_encoding
258
339
  tried_encodings << proposed_encoding
259
- data = self.toUTF8(data, proposed_encoding)
340
+ data = toUTF8(data, proposed_encoding)
260
341
  known_encoding = use_strict_parser = true
261
342
  end
262
343
  rescue
@@ -270,7 +351,7 @@ module FeedParser
270
351
  begin
271
352
  proposed_encoding = 'utf-8'
272
353
  tried_encodings << proposed_encoding
273
- data = self.toUTF8(data, proposed_encoding)
354
+ data = toUTF8(data, proposed_encoding)
274
355
  known_encoding = use_strict_parser = true
275
356
  rescue
276
357
  end
@@ -280,7 +361,7 @@ module FeedParser
280
361
  begin
281
362
  proposed_encoding = 'windows-1252'
282
363
  tried_encodings << proposed_encoding
283
- data = self.toUTF8(data, proposed_encoding)
364
+ data = toUTF8(data, proposed_encoding)
284
365
  known_encoding = use_strict_parser = true
285
366
  rescue
286
367
  end
@@ -292,7 +373,7 @@ module FeedParser
292
373
  # begin
293
374
  # proposed_encoding = 'iso-8859-2'
294
375
  # tried_encodings << proposed_encoding
295
- # data = self.toUTF8(data, proposed_encoding)
376
+ # data = toUTF8(data, proposed_encoding)
296
377
  # known_encoding = use_strict_parser = true
297
378
  # rescue
298
379
  # end
@@ -334,9 +415,9 @@ module FeedParser
334
415
  end
335
416
  end
336
417
  if not use_strict_parser
418
+ $stderr << "Using LooseFeed\n\n" if $debug
337
419
  feedparser = looseklass.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
338
420
  feedparser.parse(data)
339
- $stderr << "Using LooseFeed\n\n" if $debug
340
421
  end
341
422
  result['feed'] = feedparser.feeddata
342
423
  result['entries'] = feedparser.entries
@@ -347,6 +428,10 @@ module FeedParser
347
428
  module_function(:parse)
348
429
  end # End FeedParser module
349
430
 
431
+ def rfp(url_file_stream_or_string, options={})
432
+ FeedParser.parse(url_file_stream_or_string, options)
433
+ end
434
+
350
435
  class Serializer
351
436
  def initialize(results)
352
437
  @results = results
@@ -1,4 +1,4 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
2
 
3
3
  module FeedParserUtilities
4
4
  # Adapted from python2.4's encodings/aliases.py
@@ -1,4 +1,4 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
2
 
3
3
  # Add some helper methods to make AttributeList (all of those damn attrs
4
4
  # and attrsD used by StrictFeedParser) act more like a Hash.
@@ -8,31 +8,31 @@ module XML
8
8
  module SAX
9
9
  module AttributeList # in xml/sax.rb
10
10
  def [](key)
11
- getValue(key)
11
+ getValue(key)
12
12
  end
13
13
 
14
14
  def each(&blk)
15
- (0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
15
+ (0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
16
16
  end
17
17
 
18
18
  def each_key(&blk)
19
- (0...getLength).each{|pos| yield getName(pos) }
19
+ (0...getLength).each{|pos| yield getName(pos) }
20
20
  end
21
21
 
22
22
  def each_value(&blk)
23
- (0...getLength).each{|pos| yield getValue(pos) }
23
+ (0...getLength).each{|pos| yield getValue(pos) }
24
24
  end
25
25
 
26
26
  def to_a # Rather use collect? grep for to_a.collect
27
- l = []
28
- each{|k,v| l << [k,v]}
29
- return l
27
+ l = []
28
+ each{|k,v| l << [k,v]}
29
+ return l
30
30
  end
31
31
 
32
32
  def to_s
33
- l = []
34
- each{|k,v| l << "#{k} => #{v}"}
35
- "{ "+l.join(", ")+" }"
33
+ l = []
34
+ each{|k,v| l << "#{k} => #{v}"}
35
+ "{ "+l.join(", ")+" }"
36
36
  end
37
37
  end
38
38
  end
@@ -1,4 +1,4 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
2
 
3
3
 
4
4
  class BetterSGMLParserError < Exception; end;
@@ -1,4 +1,4 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
2
 
3
3
  module FeedParserUtilities
4
4
 
@@ -26,73 +26,68 @@ module FeedParserUtilities
26
26
  def _ebcdic_to_ascii(s)
27
27
  return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
28
28
  end
29
-
30
- def getCharacterEncoding(feed, xml_data)
29
+
30
+ def getCharacterEncoding(http_headers, xml_data)
31
31
  # Get the character encoding of the XML document
32
32
  $stderr << "In getCharacterEncoding\n" if $debug
33
33
  sniffed_xml_encoding = nil
34
34
  xml_encoding = nil
35
35
  true_encoding = nil
36
- begin
37
- http_headers = feed.meta
38
- http_content_type = feed.meta['content-type'].split(';')[0]
39
- encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
40
- http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
41
- http_encoding = nil if http_encoding.empty?
36
+
37
+ http_content_type, charset = http_headers['content-type'].to_s.split(';',2)
38
+ encoding_regexp = /\s*charset\s*=\s*(?:"|')?(.*?)(?:"|')?\s*$/
39
+ http_encoding = charset.to_s.scan(encoding_regexp).flatten[0]
40
+ http_encoding = nil if http_encoding.blank?
42
41
  # FIXME Open-Uri returns iso8859-1 if there is no charset header,
43
42
  # but that doesn't pass the tests. Open-Uri claims its following
44
43
  # the right RFC. Are they wrong or do we need to change the tests?
45
- rescue NoMethodError
46
- http_headers = {}
47
- http_content_type = nil
48
- http_encoding = nil
49
- end
44
+
50
45
  # Must sniff for non-ASCII-compatible character encodings before
51
46
  # searching for XML declaration. This heuristic is defined in
52
47
  # section F of the XML specification:
53
48
  # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
54
49
  begin
55
50
  if xml_data[0..3] == "\x4c\x6f\xa7\x94"
56
- # EBCDIC
57
- xml_data = _ebcdic_to_ascii(xml_data)
51
+ # EBCDIC
52
+ xml_data = __ebcdic_to_ascii(xml_data)
58
53
  elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
59
- # UTF-16BE
60
- sniffed_xml_encoding = 'utf-16be'
61
- xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
54
+ # UTF-16BE
55
+ sniffed_xml_encoding = 'utf-16be'
56
+ xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
62
57
  elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
63
- # UTF-16BE with BOM
64
- sniffed_xml_encoding = 'utf-16be'
65
- xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
58
+ # UTF-16BE with BOM
59
+ sniffed_xml_encoding = 'utf-16be'
60
+ xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
66
61
  elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
67
- # UTF-16LE
68
- sniffed_xml_encoding = 'utf-16le'
69
- xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
62
+ # UTF-16LE
63
+ sniffed_xml_encoding = 'utf-16le'
64
+ xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
70
65
  elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
71
- # UTF-16LE with BOM
72
- sniffed_xml_encoding = 'utf-16le'
73
- xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
66
+ # UTF-16LE with BOM
67
+ sniffed_xml_encoding = 'utf-16le'
68
+ xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
74
69
  elsif xml_data[0..3] == "\x00\x00\x00\x3c"
75
- # UTF-32BE
76
- sniffed_xml_encoding = 'utf-32be'
77
- xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
70
+ # UTF-32BE
71
+ sniffed_xml_encoding = 'utf-32be'
72
+ xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
78
73
  elsif xml_data[0..3] == "\x3c\x00\x00\x00"
79
- # UTF-32LE
80
- sniffed_xml_encoding = 'utf-32le'
81
- xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
74
+ # UTF-32LE
75
+ sniffed_xml_encoding = 'utf-32le'
76
+ xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
82
77
  elsif xml_data[0..3] == "\x00\x00\xfe\xff"
83
- # UTF-32BE with BOM
84
- sniffed_xml_encoding = 'utf-32be'
85
- xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
78
+ # UTF-32BE with BOM
79
+ sniffed_xml_encoding = 'utf-32be'
80
+ xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
86
81
  elsif xml_data[0..3] == "\xff\xfe\x00\x00"
87
- # UTF-32LE with BOM
88
- sniffed_xml_encoding = 'utf-32le'
89
- xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
82
+ # UTF-32LE with BOM
83
+ sniffed_xml_encoding = 'utf-32le'
84
+ xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
90
85
  elsif xml_data[0..2] == "\xef\xbb\xbf"
91
- # UTF-8 with BOM
92
- sniffed_xml_encoding = 'utf-8'
93
- xml_data = xml_data[3..-1]
86
+ # UTF-8 with BOM
87
+ sniffed_xml_encoding = 'utf-8'
88
+ xml_data = xml_data[3..-1]
94
89
  else
95
- # ASCII-compatible
90
+ # ASCII-compatible
96
91
  end
97
92
  xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
98
93
  rescue
@@ -102,7 +97,7 @@ module FeedParserUtilities
102
97
  xml_encoding = xml_encoding_match[1].downcase
103
98
  xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
104
99
  if sniffed_xml_encoding and xencodings.include?xml_encoding
105
- xml_encoding = sniffed_xml_encoding
100
+ xml_encoding = sniffed_xml_encoding
106
101
  end
107
102
  end
108
103
 
@@ -125,54 +120,48 @@ module FeedParserUtilities
125
120
  end
126
121
  return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
127
122
  end
128
-
123
+
129
124
  def toUTF8(data, encoding)
130
- =begin
131
- Changes an XML data stream on the fly to specify a new encoding
132
-
133
- data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
134
- encoding is a string recognized by encodings.aliases
135
- =end
136
125
  $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
137
126
  # NOTE we must use double quotes when dealing with \x encodings!
138
127
  if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
139
128
  if $debug
140
- $stderr << "stripping BOM\n"
141
- if encoding != 'utf-16be'
142
- $stderr << "string utf-16be instead\n"
143
- end
129
+ $stderr << "stripping BOM\n"
130
+ if encoding != 'utf-16be'
131
+ $stderr << "string utf-16be instead\n"
132
+ end
144
133
  end
145
134
  encoding = 'utf-16be'
146
135
  data = data[2..-1]
147
136
  elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
148
137
  if $debug
149
- $stderr << "stripping BOM\n"
150
- $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
138
+ $stderr << "stripping BOM\n"
139
+ $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
151
140
  end
152
141
  encoding = 'utf-16le'
153
142
  data = data[2..-1]
154
143
  elsif (data[0..2] == "\xef\xbb\xbf")
155
144
  if $debug
156
- $stderr << "stripping BOM\n"
157
- $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
145
+ $stderr << "stripping BOM\n"
146
+ $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
158
147
  end
159
148
  encoding = 'utf-8'
160
149
  data = data[3..-1]
161
150
  elsif (data[0..3] == "\x00\x00\xfe\xff")
162
151
  if $debug
163
- $stderr << "stripping BOM\n"
164
- if encoding != 'utf-32be'
165
- $stderr << "trying utf-32be instead\n"
166
- end
152
+ $stderr << "stripping BOM\n"
153
+ if encoding != 'utf-32be'
154
+ $stderr << "trying utf-32be instead\n"
155
+ end
167
156
  end
168
157
  encoding = 'utf-32be'
169
158
  data = data[4..-1]
170
159
  elsif (data[0..3] == "\xff\xfe\x00\x00")
171
160
  if $debug
172
- $stderr << "stripping BOM\n"
173
- if encoding != 'utf-32le'
174
- $stderr << "trying utf-32le instead\n"
175
- end
161
+ $stderr << "stripping BOM\n"
162
+ if encoding != 'utf-32le'
163
+ $stderr << "trying utf-32le instead\n"
164
+ end
176
165
  end
177
166
  encoding = 'utf-32le'
178
167
  data = data[4..-1]
@@ -184,75 +173,79 @@ module FeedParserUtilities
184
173
  end
185
174
  $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
186
175
  declmatch = /^<\?xml[^>]*?>/
187
- newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
188
- if declmatch =~ newdata
189
- newdata.sub!(declmatch, newdecl)
190
- else
191
- newdata = newdecl + "\n" + newdata
192
- end
176
+ newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
177
+ if declmatch =~ newdata
178
+ newdata.sub!(declmatch, newdecl)
179
+ else
180
+ newdata = newdecl + "\n" + newdata
181
+ end
193
182
  return newdata
194
183
  end
195
-
184
+
196
185
  end
197
186
 
198
- # http://intertwingly.net/stories/2005/09/28/xchar.rb
199
- module XChar
200
- # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
201
- CP1252 = {
202
- 128 => 8364, # euro sign
203
- 130 => 8218, # single low-9 quotation mark
204
- 131 => 402, # latin small letter f with hook
205
- 132 => 8222, # double low-9 quotation mark
206
- 133 => 8230, # horizontal ellipsis
207
- 134 => 8224, # dagger
208
- 135 => 8225, # double dagger
209
- 136 => 710, # modifier letter circumflex accent
210
- 137 => 8240, # per mille sign
211
- 138 => 352, # latin capital letter s with caron
212
- 139 => 8249, # single left-pointing angle quotation mark
213
- 140 => 338, # latin capital ligature oe
214
- 142 => 381, # latin capital letter z with caron
215
- 145 => 8216, # left single quotation mark
216
- 146 => 8217, # right single quotation mark
217
- 147 => 8220, # left double quotation mark
218
- 148 => 8221, # right double quotation mark
219
- 149 => 8226, # bullet
220
- 150 => 8211, # en dash
221
- 151 => 8212, # em dash
222
- 152 => 732, # small tilde
223
- 153 => 8482, # trade mark sign
224
- 154 => 353, # latin small letter s with caron
225
- 155 => 8250, # single right-pointing angle quotation mark
226
- 156 => 339, # latin small ligature oe
227
- 158 => 382, # latin small letter z with caron
228
- 159 => 376} # latin capital letter y with diaeresis
229
-
187
+ unless defined?(Builder::XChar)
188
+ # http://intertwingly.net/stories/2005/09/28/xchar.rb
189
+ module XChar
190
+ # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
191
+ CP1252 = {
192
+ 128 => 8364, # euro sign
193
+ 130 => 8218, # single low-9 quotation mark
194
+ 131 => 402, # latin small letter f with hook
195
+ 132 => 8222, # double low-9 quotation mark
196
+ 133 => 8230, # horizontal ellipsis
197
+ 134 => 8224, # dagger
198
+ 135 => 8225, # double dagger
199
+ 136 => 710, # modifier letter circumflex accent
200
+ 137 => 8240, # per mille sign
201
+ 138 => 352, # latin capital letter s with caron
202
+ 139 => 8249, # single left-pointing angle quotation mark
203
+ 140 => 338, # latin capital ligature oe
204
+ 142 => 381, # latin capital letter z with caron
205
+ 145 => 8216, # left single quotation mark
206
+ 146 => 8217, # right single quotation mark
207
+ 147 => 8220, # left double quotation mark
208
+ 148 => 8221, # right double quotation mark
209
+ 149 => 8226, # bullet
210
+ 150 => 8211, # en dash
211
+ 151 => 8212, # em dash
212
+ 152 => 732, # small tilde
213
+ 153 => 8482, # trade mark sign
214
+ 154 => 353, # latin small letter s with caron
215
+ 155 => 8250, # single right-pointing angle quotation mark
216
+ 156 => 339, # latin small ligature oe
217
+ 158 => 382, # latin small letter z with caron
218
+ 159 => 376 # latin capital letter y with diaeresis
219
+ }
230
220
  # http://www.w3.org/TR/REC-xml/#dt-chardata
231
221
  PREDEFINED = {
232
222
  38 => '&amp;', # ampersand
233
223
  60 => '&lt;', # left angle bracket
234
- 62 => '&gt;'} # right angle bracket
235
-
236
- # http://www.w3.org/TR/REC-xml/#charsets
237
- VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
238
- (0xE000..0xFFFD), (0x10000..0x10FFFF)]
239
- end
224
+ 62 => '&gt;' # right angle bracket
225
+ }
226
+ # http://www.w3.org/TR/REC-xml/#charsets
227
+ VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
228
+ (0xE000..0xFFFD), (0x10000..0x10FFFF)]
229
+ end
240
230
 
241
- class Fixnum
242
- # xml escaped version of chr
243
- def xchr
244
- n = XChar::CP1252[self] || self
245
- n = 42 unless XChar::VALID.find {|range| range.include? n}
246
- XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
231
+ class Fixnum
232
+ # xml escaped version of chr
233
+ def xchr
234
+ n = XChar::CP1252[self] || self
235
+ case n when *XChar::VALID
236
+ XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
237
+ else
238
+ '*'
239
+ end
240
+ end
247
241
  end
248
- end
249
242
 
250
- class String
251
- alias :old_index :index
252
- def to_xs
253
- unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
254
- rescue
255
- unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
243
+ class String
244
+ alias :old_index :index
245
+ def to_xs
246
+ unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
247
+ rescue
248
+ unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
249
+ end
256
250
  end
257
251
  end
258
-