rfeedparser 0.9.931 → 0.9.940

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. data/lib/rfeedparser.rb +143 -58
  2. data/lib/rfeedparser/aliases.rb +1 -1
  3. data/lib/rfeedparser/better_attributelist.rb +11 -11
  4. data/lib/rfeedparser/better_sgmlparser.rb +1 -1
  5. data/lib/rfeedparser/encoding_helpers.rb +120 -127
  6. data/lib/rfeedparser/feedparserdict.rb +30 -20
  7. data/lib/rfeedparser/forgiving_uri.rb +9 -7
  8. data/lib/rfeedparser/markup_helpers.rb +11 -14
  9. data/lib/rfeedparser/parser_mixin.rb +16 -11
  10. data/lib/rfeedparser/parsers.rb +1 -2
  11. data/lib/rfeedparser/scrub.rb +95 -90
  12. data/lib/rfeedparser/time_helpers.rb +379 -379
  13. data/lib/rfeedparser/utilities.rb +23 -0
  14. data/tests/rfeedparser_test_helper.rb +262 -0
  15. data/tests/rfeedparserserver.rb +3 -109
  16. data/tests/rfeedparsertest.rb +6 -165
  17. data/tests/rfponly/http/200.xml +30 -0
  18. data/tests/rfponly/http/220.xml +28 -0
  19. data/tests/rfponly/http/300.xml +8 -0
  20. data/tests/rfponly/http/300.xml_redirect +25 -0
  21. data/tests/rfponly/http/301.xml +8 -0
  22. data/tests/rfponly/http/301.xml_redirect +25 -0
  23. data/tests/rfponly/http/302.xml +8 -0
  24. data/tests/rfponly/http/302.xml_redirect +25 -0
  25. data/tests/rfponly/http/307.xml +8 -0
  26. data/tests/rfponly/http/307.xml_redirect +25 -0
  27. data/tests/rfponly/http/320.xml +8 -0
  28. data/tests/rfponly/http/320.xml_redirect +25 -0
  29. data/tests/rfponly/http/400.xml +7 -0
  30. data/tests/rfponly/http/404.xml +7 -0
  31. data/tests/rfponly/http/410.xml +7 -0
  32. data/tests/rfponly/http/420.xml +7 -0
  33. data/tests/rfponly/http/500.xml +7 -0
  34. data/tests/rfponly/http/520.xml +7 -0
  35. data/tests/rfponly/http/etag.xml +28 -0
  36. data/tests/rfponly/http/lastmodified.xml +29 -0
  37. data/tests/rfponly/wellformed/date/feed_modified_with_negative_numeric_timezone.xml +9 -0
  38. data/tests/rfponly/wellformed/date/feed_modified_with_positive_numeric_timezone.xml +9 -0
  39. data/tests/rfponly/wellformed/scrub/hpricot_self_closing_tag_workaround.xml +11 -0
  40. metadata +31 -3
@@ -19,11 +19,14 @@ require 'rubygems'
19
19
  require 'base64'
20
20
  require 'iconv'
21
21
 
22
+ gem 'hpricot', "=0.6"
23
+ require 'hpricot'
22
24
  gem 'character-encodings', ">=0.2.0"
23
25
  gem 'htmltools', ">=1.10"
24
26
  gem 'htmlentities', ">=4.0.0"
25
27
  gem 'activesupport', ">=1.4.1"
26
28
  gem 'rchardet', ">=1.0"
29
+
27
30
  require 'xml/saxdriver' # calling expat through the xmlparser gem
28
31
 
29
32
  require 'rchardet'
@@ -40,23 +43,21 @@ $debug = false
40
43
  $compatible = true
41
44
 
42
45
  $LOAD_PATH << File.expand_path(File.dirname(__FILE__))
46
+ require 'rfeedparser/utilities'
43
47
  require 'rfeedparser/forgiving_uri'
44
- require 'rfeedparser/aliases'
45
- require 'rfeedparser/encoding_helpers'
46
48
  require 'rfeedparser/better_sgmlparser'
47
49
  require 'rfeedparser/better_attributelist'
48
- require 'rfeedparser/scrub'
49
- require 'rfeedparser/time_helpers'
50
50
  require 'rfeedparser/feedparserdict'
51
51
  require 'rfeedparser/parser_mixin'
52
52
  require 'rfeedparser/parsers'
53
- require 'rfeedparser/markup_helpers'
54
53
 
55
- include FeedParserUtilities
54
+
56
55
 
57
56
 
58
57
  module FeedParser
59
- Version = "0.9.931"
58
+ extend FeedParserUtilities
59
+
60
+ Version = "0.9.940"
60
61
 
61
62
  License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
62
63
 
@@ -81,18 +82,19 @@ module FeedParser
81
82
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
82
83
  POSSIBILITY OF SUCH DAMAGE."""
83
84
 
84
- Author = "Jeff Hodges <http://somethingsimilar.com>"
85
- Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
85
+ Translator_From_Python_To_Ruby = "Jeff Hodges <http://somethingsimilar.com>"
86
+ Author = "Mark Pilgrim <http://diveintomark.org/>"
86
87
  Contributors = [ "Jason Diamond <http://injektilo.org/>",
87
88
  "John Beimler <http://john.beimler.org/>",
88
89
  "Fazal Majid <http://www.majid.info/mylos/weblog/>",
89
90
  "Aaron Swartz <http://aaronsw.com/>",
90
- "Kevin Marks <http://epeus.blogspot.com/>"
91
+ "Kevin Marks <http://epeus.blogspot.com/>",
92
+ "Jesse Newland <http://jnewland.com/>"
91
93
  ]
92
94
  # HTTP "User-Agent" header to send to servers when downloading feeds.
93
95
  # If you are embedding feedparser in a larger application, you should
94
96
  # change this to your application name and URL.
95
- USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % @version
97
+ USER_AGENT = "rFeedParser/#{Version} +http://rfeedparser.rubyforge.org/"
96
98
 
97
99
  # HTTP "Accept" header to send to servers when downloading feeds. If you don't
98
100
  # want to send an Accept header, set this to None.
@@ -141,60 +143,139 @@ module FeedParser
141
143
  'hotrss' => 'Hot RSS'
142
144
  }
143
145
 
144
- def parse(furi, options = {})
145
- furi.strip!
146
- # Parse a feed from a URL, file, stream or string
147
- $compatible = options[:compatible].nil? ? $compatible : options[:compatible]# Use the default compatibility if compatible is nil
146
+ # Accepted in options: :agent, :modified, :etag, and :referrer
147
+ def open_resource(url_file_stream_or_string, options)
148
+ options[:handlers] ||= []
149
+
150
+ if url_file_stream_or_string.respond_to?(:read)
151
+ return url_file_stream_or_string
152
+
153
+ elsif url_file_stream_or_string == '-'
154
+ return $stdin
155
+ end
156
+
157
+ # open-uri freaks out if there's leading spaces.
158
+ url_file_stream_or_string.strip!
159
+
160
+
161
+ furi = ForgivingURI.parse(url_file_stream_or_string)
162
+ if furi && ['http','https','ftp'].include?(furi.scheme)
163
+ auth = nil
164
+
165
+ if furi.host && furi.password
166
+ auth = Base64::encode64("#{furi.user}:#{furi.password}").strip
167
+ furi.password = nil
168
+ url_file_stream_or_string = furi.to_s
169
+ end
170
+
171
+ req_headers = {}
172
+ req_headers["User-Agent"] = options[:agent] || USER_AGENT
173
+ req_headers["If-None-Match"] = options[:etag] if options[:etag]
174
+
175
+ if options[:modified]
176
+ if options[:modified].is_a?(String)
177
+ req_headers["If-Modified-Since"] = parse_date(options[:modified]).httpdate
178
+ elsif options[:modified].is_a?(Time)
179
+ req_headers["If-Modified-Since"] = options[:modified].httpdate
180
+ elsif options[:modified].is_a?(Array)
181
+ req_headers["If-Modified-Since"] = py2rtime(options[:modified]).httpdate
182
+ end
183
+ end
184
+
185
+ req_headers["Referer"] = options[:referrer] if options[:referrer]
186
+ req_headers["Accept-encoding"] = 'gzip, deflate' # FIXME make tests
187
+ req_headers["Authorization"] = "Basic #{auth}" if auth
188
+ req_headers['Accept'] = ACCEPT_HEADER if ACCEPT_HEADER
189
+ req_headers['A-IM'] = 'feed' # RFC 3229 support
190
+
191
+ begin
192
+ return open(url_file_stream_or_string, req_headers)
193
+ rescue OpenURI::HTTPError => e
194
+ return e.io
195
+ rescue
196
+ end
197
+ end
198
+
199
+ # try to open with native open function (if url_file_stream_or_string is a filename)
200
+ begin
201
+ return open(url_file_stream_or_string)
202
+ rescue
203
+ end
204
+ # treat url_file_stream_or_string as string
205
+ return StringIO.new(url_file_stream_or_string.to_s)
206
+ end
207
+ module_function(:open_resource)
208
+
209
+ # Parse a feed from a URL, file, stream or string
210
+ def parse(url_file_stream_or_string, options = {})
211
+
212
+
213
+ # Use the default compatibility if compatible is nil
214
+ $compatible = options[:compatible].nil? ? $compatible : options[:compatible]
215
+
148
216
  strictklass = options[:strict] || StrictFeedParser
149
217
  looseklass = options[:loose] || LooseFeedParser
218
+ options[:handlers] = options[:handlers] || []
219
+
150
220
  result = FeedParserDict.new
151
221
  result['feed'] = FeedParserDict.new
152
222
  result['entries'] = []
153
- if options[:modified]
154
- options[:modified] = Time.parse(options[:modified]).utc.rfc2822
155
- # FIXME this ignores all of our time parsing work. Does it matter?
156
- end
223
+
157
224
  result['bozo'] = false
158
- handlers = options[:handlers]
159
- if handlers.class != Array # FIXME why does this happen?
160
- handlers = [handlers]
161
- end
162
-
225
+
163
226
  begin
164
- parsed_furi = ForgivingURI.parse(furi)
165
- if [nil, "file"].include? parsed_furi.scheme
166
- $stderr << "Opening local file #{furi}\n" if $debug
167
- f = open(parsed_furi.path) # OpenURI doesn't behave well when passing HTTP options to a file.
168
- else
169
- # And when you do pass them, make sure they aren't just nil (this still true?)
170
- newd = {}
171
- newd["If-None-Match"] = options[:etag] unless options[:etag].nil?
172
- newd["If-Modified-Since"] = options[:modified] unless options[:modified].nil?
173
- newd["User-Agent"] = (options[:agent] || USER_AGENT).to_s
174
- newd["Referer"] = options[:referrer] unless options[:referrer].nil?
175
- newd["Content-Location"] = options[:content_location] unless options[:content_location].nil?
176
- newd["Content-Language"] = options[:content_language] unless options[:content_language].nil?
177
- newd["Content-type"] = options[:content_type] unless options[:content_type].nil?
178
-
179
- f = open(furi, newd)
180
- end
181
-
227
+ f = open_resource(url_file_stream_or_string, options)
182
228
  data = f.read
183
- f.close
184
229
  rescue => e
185
- $stderr << "Rescued in parse: "+e.to_s+"\n" if $debug # My addition
186
230
  result['bozo'] = true
187
231
  result['bozo_exception'] = e
188
232
  data = ''
189
233
  f = nil
190
234
  end
191
- if f.respond_to?(:meta)
192
- result['etag'] = f.meta['etag']
193
- result['modified'] = f.meta['modified']
194
- result['url'] = f.base_uri.to_s
195
- result['status'] = f.status[0] || 200
196
- result['headers'] = f.meta
235
+
236
+ if f and !data.blank? and f.respond_to?(:meta)
237
+ # if feed is gzip-compressed, decompress it
238
+ if f.meta['content-encoding'] == 'gzip'
239
+ begin
240
+ gz = Zlib::GzipReader.new(StringIO.new(data))
241
+ data = gz.read
242
+ gz.close
243
+ rescue => e
244
+ # Some feeds claim to be gzipped but they're not, so
245
+ # we get garbage. Ideally, we should re-request the
246
+ # feed without the 'Accept-encoding: gzip' header,
247
+ # but we don't.
248
+ result['bozo'] = true
249
+ result['bozo_exception'] = e
250
+ data = ''
251
+ end
252
+ elsif f.meta['content-encoding'] == 'deflate'
253
+ begin
254
+ data = Zlib::Deflate.inflate(data)
255
+ rescue => e
256
+ result['bozo'] = true
257
+ result['bozo_exception'] = e
258
+ data = ''
259
+ end
197
260
  end
261
+ end
262
+
263
+ if f.respond_to?(:meta)
264
+ result['etag'] = f.meta['etag']
265
+ result['modified_time'] = parse_date(f.meta['last-modified'])
266
+ result['modified'] = extract_tuple(result['modified_time'])
267
+ result['headers'] = f.meta
268
+ end
269
+
270
+ # FIXME open-uri does not return a non-nil base_uri in its HTTPErrors.
271
+ if f.respond_to?(:base_uri)
272
+ result['href'] = f.base_uri.to_s # URI => String
273
+ result['status'] = '200'
274
+ end
275
+
276
+ if f.respond_to?(:status)
277
+ result['status'] = f.status[0]
278
+ end
198
279
 
199
280
 
200
281
  # there are four encodings to keep track of:
@@ -204,7 +285,7 @@ module FeedParser
204
285
  # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
205
286
  http_headers = result['headers'] || {}
206
287
  result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type =
207
- self.getCharacterEncoding(f,data)
288
+ getCharacterEncoding(http_headers,data)
208
289
 
209
290
  if not http_headers.blank? and not acceptable_content_type
210
291
  unless http_headers['content-type'].nil?
@@ -215,7 +296,7 @@ module FeedParser
215
296
  result['bozo'] = true
216
297
  result['bozo_exception'] = NonXMLContentType.new(bozo_message) # I get to care about this, cuz Mark says I should.
217
298
  end
218
- result['version'], data = self.stripDoctype(data)
299
+ result['version'], data = stripDoctype(data)
219
300
 
220
301
  baseuri = http_headers['content-location'] || result['href']
221
302
  baselang = http_headers['content-language']
@@ -244,7 +325,7 @@ module FeedParser
244
325
  next if tried_encodings.include? proposed_encoding
245
326
  tried_encodings << proposed_encoding
246
327
  begin
247
- data = self.toUTF8(data, proposed_encoding)
328
+ data = toUTF8(data, proposed_encoding)
248
329
  known_encoding = use_strict_parser = true
249
330
  break
250
331
  rescue
@@ -256,7 +337,7 @@ module FeedParser
256
337
  proposed_encoding = CharDet.detect(data)['encoding']
257
338
  if proposed_encoding and not tried_encodings.include?proposed_encoding
258
339
  tried_encodings << proposed_encoding
259
- data = self.toUTF8(data, proposed_encoding)
340
+ data = toUTF8(data, proposed_encoding)
260
341
  known_encoding = use_strict_parser = true
261
342
  end
262
343
  rescue
@@ -270,7 +351,7 @@ module FeedParser
270
351
  begin
271
352
  proposed_encoding = 'utf-8'
272
353
  tried_encodings << proposed_encoding
273
- data = self.toUTF8(data, proposed_encoding)
354
+ data = toUTF8(data, proposed_encoding)
274
355
  known_encoding = use_strict_parser = true
275
356
  rescue
276
357
  end
@@ -280,7 +361,7 @@ module FeedParser
280
361
  begin
281
362
  proposed_encoding = 'windows-1252'
282
363
  tried_encodings << proposed_encoding
283
- data = self.toUTF8(data, proposed_encoding)
364
+ data = toUTF8(data, proposed_encoding)
284
365
  known_encoding = use_strict_parser = true
285
366
  rescue
286
367
  end
@@ -292,7 +373,7 @@ module FeedParser
292
373
  # begin
293
374
  # proposed_encoding = 'iso-8859-2'
294
375
  # tried_encodings << proposed_encoding
295
- # data = self.toUTF8(data, proposed_encoding)
376
+ # data = toUTF8(data, proposed_encoding)
296
377
  # known_encoding = use_strict_parser = true
297
378
  # rescue
298
379
  # end
@@ -334,9 +415,9 @@ module FeedParser
334
415
  end
335
416
  end
336
417
  if not use_strict_parser
418
+ $stderr << "Using LooseFeed\n\n" if $debug
337
419
  feedparser = looseklass.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
338
420
  feedparser.parse(data)
339
- $stderr << "Using LooseFeed\n\n" if $debug
340
421
  end
341
422
  result['feed'] = feedparser.feeddata
342
423
  result['entries'] = feedparser.entries
@@ -347,6 +428,10 @@ module FeedParser
347
428
  module_function(:parse)
348
429
  end # End FeedParser module
349
430
 
431
+ def rfp(url_file_stream_or_string, options={})
432
+ FeedParser.parse(url_file_stream_or_string, options)
433
+ end
434
+
350
435
  class Serializer
351
436
  def initialize(results)
352
437
  @results = results
@@ -1,4 +1,4 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
2
 
3
3
  module FeedParserUtilities
4
4
  # Adapted from python2.4's encodings/aliases.py
@@ -1,4 +1,4 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
2
 
3
3
  # Add some helper methods to make AttributeList (all of those damn attrs
4
4
  # and attrsD used by StrictFeedParser) act more like a Hash.
@@ -8,31 +8,31 @@ module XML
8
8
  module SAX
9
9
  module AttributeList # in xml/sax.rb
10
10
  def [](key)
11
- getValue(key)
11
+ getValue(key)
12
12
  end
13
13
 
14
14
  def each(&blk)
15
- (0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
15
+ (0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
16
16
  end
17
17
 
18
18
  def each_key(&blk)
19
- (0...getLength).each{|pos| yield getName(pos) }
19
+ (0...getLength).each{|pos| yield getName(pos) }
20
20
  end
21
21
 
22
22
  def each_value(&blk)
23
- (0...getLength).each{|pos| yield getValue(pos) }
23
+ (0...getLength).each{|pos| yield getValue(pos) }
24
24
  end
25
25
 
26
26
  def to_a # Rather use collect? grep for to_a.collect
27
- l = []
28
- each{|k,v| l << [k,v]}
29
- return l
27
+ l = []
28
+ each{|k,v| l << [k,v]}
29
+ return l
30
30
  end
31
31
 
32
32
  def to_s
33
- l = []
34
- each{|k,v| l << "#{k} => #{v}"}
35
- "{ "+l.join(", ")+" }"
33
+ l = []
34
+ each{|k,v| l << "#{k} => #{v}"}
35
+ "{ "+l.join(", ")+" }"
36
36
  end
37
37
  end
38
38
  end
@@ -1,4 +1,4 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
2
 
3
3
 
4
4
  class BetterSGMLParserError < Exception; end;
@@ -1,4 +1,4 @@
1
- #!/usr/bin/ruby
1
+ #!/usr/bin/env ruby
2
2
 
3
3
  module FeedParserUtilities
4
4
 
@@ -26,73 +26,68 @@ module FeedParserUtilities
26
26
  def _ebcdic_to_ascii(s)
27
27
  return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
28
28
  end
29
-
30
- def getCharacterEncoding(feed, xml_data)
29
+
30
+ def getCharacterEncoding(http_headers, xml_data)
31
31
  # Get the character encoding of the XML document
32
32
  $stderr << "In getCharacterEncoding\n" if $debug
33
33
  sniffed_xml_encoding = nil
34
34
  xml_encoding = nil
35
35
  true_encoding = nil
36
- begin
37
- http_headers = feed.meta
38
- http_content_type = feed.meta['content-type'].split(';')[0]
39
- encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
40
- http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
41
- http_encoding = nil if http_encoding.empty?
36
+
37
+ http_content_type, charset = http_headers['content-type'].to_s.split(';',2)
38
+ encoding_regexp = /\s*charset\s*=\s*(?:"|')?(.*?)(?:"|')?\s*$/
39
+ http_encoding = charset.to_s.scan(encoding_regexp).flatten[0]
40
+ http_encoding = nil if http_encoding.blank?
42
41
  # FIXME Open-Uri returns iso8859-1 if there is no charset header,
43
42
  # but that doesn't pass the tests. Open-Uri claims its following
44
43
  # the right RFC. Are they wrong or do we need to change the tests?
45
- rescue NoMethodError
46
- http_headers = {}
47
- http_content_type = nil
48
- http_encoding = nil
49
- end
44
+
50
45
  # Must sniff for non-ASCII-compatible character encodings before
51
46
  # searching for XML declaration. This heuristic is defined in
52
47
  # section F of the XML specification:
53
48
  # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
54
49
  begin
55
50
  if xml_data[0..3] == "\x4c\x6f\xa7\x94"
56
- # EBCDIC
57
- xml_data = _ebcdic_to_ascii(xml_data)
51
+ # EBCDIC
52
+ xml_data = __ebcdic_to_ascii(xml_data)
58
53
  elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
59
- # UTF-16BE
60
- sniffed_xml_encoding = 'utf-16be'
61
- xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
54
+ # UTF-16BE
55
+ sniffed_xml_encoding = 'utf-16be'
56
+ xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
62
57
  elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
63
- # UTF-16BE with BOM
64
- sniffed_xml_encoding = 'utf-16be'
65
- xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
58
+ # UTF-16BE with BOM
59
+ sniffed_xml_encoding = 'utf-16be'
60
+ xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
66
61
  elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
67
- # UTF-16LE
68
- sniffed_xml_encoding = 'utf-16le'
69
- xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
62
+ # UTF-16LE
63
+ sniffed_xml_encoding = 'utf-16le'
64
+ xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
70
65
  elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
71
- # UTF-16LE with BOM
72
- sniffed_xml_encoding = 'utf-16le'
73
- xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
66
+ # UTF-16LE with BOM
67
+ sniffed_xml_encoding = 'utf-16le'
68
+ xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
74
69
  elsif xml_data[0..3] == "\x00\x00\x00\x3c"
75
- # UTF-32BE
76
- sniffed_xml_encoding = 'utf-32be'
77
- xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
70
+ # UTF-32BE
71
+ sniffed_xml_encoding = 'utf-32be'
72
+ xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
78
73
  elsif xml_data[0..3] == "\x3c\x00\x00\x00"
79
- # UTF-32LE
80
- sniffed_xml_encoding = 'utf-32le'
81
- xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
74
+ # UTF-32LE
75
+ sniffed_xml_encoding = 'utf-32le'
76
+ xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
82
77
  elsif xml_data[0..3] == "\x00\x00\xfe\xff"
83
- # UTF-32BE with BOM
84
- sniffed_xml_encoding = 'utf-32be'
85
- xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
78
+ # UTF-32BE with BOM
79
+ sniffed_xml_encoding = 'utf-32be'
80
+ xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
86
81
  elsif xml_data[0..3] == "\xff\xfe\x00\x00"
87
- # UTF-32LE with BOM
88
- sniffed_xml_encoding = 'utf-32le'
89
- xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
82
+ # UTF-32LE with BOM
83
+ sniffed_xml_encoding = 'utf-32le'
84
+ xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
90
85
  elsif xml_data[0..2] == "\xef\xbb\xbf"
91
- # UTF-8 with BOM
92
- sniffed_xml_encoding = 'utf-8'
93
- xml_data = xml_data[3..-1]
86
+ # UTF-8 with BOM
87
+ sniffed_xml_encoding = 'utf-8'
88
+ xml_data = xml_data[3..-1]
94
89
  else
95
- # ASCII-compatible
90
+ # ASCII-compatible
96
91
  end
97
92
  xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
98
93
  rescue
@@ -102,7 +97,7 @@ module FeedParserUtilities
102
97
  xml_encoding = xml_encoding_match[1].downcase
103
98
  xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
104
99
  if sniffed_xml_encoding and xencodings.include?xml_encoding
105
- xml_encoding = sniffed_xml_encoding
100
+ xml_encoding = sniffed_xml_encoding
106
101
  end
107
102
  end
108
103
 
@@ -125,54 +120,48 @@ module FeedParserUtilities
125
120
  end
126
121
  return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
127
122
  end
128
-
123
+
129
124
  def toUTF8(data, encoding)
130
- =begin
131
- Changes an XML data stream on the fly to specify a new encoding
132
-
133
- data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
134
- encoding is a string recognized by encodings.aliases
135
- =end
136
125
  $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
137
126
  # NOTE we must use double quotes when dealing with \x encodings!
138
127
  if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
139
128
  if $debug
140
- $stderr << "stripping BOM\n"
141
- if encoding != 'utf-16be'
142
- $stderr << "string utf-16be instead\n"
143
- end
129
+ $stderr << "stripping BOM\n"
130
+ if encoding != 'utf-16be'
131
+ $stderr << "string utf-16be instead\n"
132
+ end
144
133
  end
145
134
  encoding = 'utf-16be'
146
135
  data = data[2..-1]
147
136
  elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
148
137
  if $debug
149
- $stderr << "stripping BOM\n"
150
- $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
138
+ $stderr << "stripping BOM\n"
139
+ $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
151
140
  end
152
141
  encoding = 'utf-16le'
153
142
  data = data[2..-1]
154
143
  elsif (data[0..2] == "\xef\xbb\xbf")
155
144
  if $debug
156
- $stderr << "stripping BOM\n"
157
- $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
145
+ $stderr << "stripping BOM\n"
146
+ $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
158
147
  end
159
148
  encoding = 'utf-8'
160
149
  data = data[3..-1]
161
150
  elsif (data[0..3] == "\x00\x00\xfe\xff")
162
151
  if $debug
163
- $stderr << "stripping BOM\n"
164
- if encoding != 'utf-32be'
165
- $stderr << "trying utf-32be instead\n"
166
- end
152
+ $stderr << "stripping BOM\n"
153
+ if encoding != 'utf-32be'
154
+ $stderr << "trying utf-32be instead\n"
155
+ end
167
156
  end
168
157
  encoding = 'utf-32be'
169
158
  data = data[4..-1]
170
159
  elsif (data[0..3] == "\xff\xfe\x00\x00")
171
160
  if $debug
172
- $stderr << "stripping BOM\n"
173
- if encoding != 'utf-32le'
174
- $stderr << "trying utf-32le instead\n"
175
- end
161
+ $stderr << "stripping BOM\n"
162
+ if encoding != 'utf-32le'
163
+ $stderr << "trying utf-32le instead\n"
164
+ end
176
165
  end
177
166
  encoding = 'utf-32le'
178
167
  data = data[4..-1]
@@ -184,75 +173,79 @@ module FeedParserUtilities
184
173
  end
185
174
  $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
186
175
  declmatch = /^<\?xml[^>]*?>/
187
- newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
188
- if declmatch =~ newdata
189
- newdata.sub!(declmatch, newdecl)
190
- else
191
- newdata = newdecl + "\n" + newdata
192
- end
176
+ newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
177
+ if declmatch =~ newdata
178
+ newdata.sub!(declmatch, newdecl)
179
+ else
180
+ newdata = newdecl + "\n" + newdata
181
+ end
193
182
  return newdata
194
183
  end
195
-
184
+
196
185
  end
197
186
 
198
- # http://intertwingly.net/stories/2005/09/28/xchar.rb
199
- module XChar
200
- # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
201
- CP1252 = {
202
- 128 => 8364, # euro sign
203
- 130 => 8218, # single low-9 quotation mark
204
- 131 => 402, # latin small letter f with hook
205
- 132 => 8222, # double low-9 quotation mark
206
- 133 => 8230, # horizontal ellipsis
207
- 134 => 8224, # dagger
208
- 135 => 8225, # double dagger
209
- 136 => 710, # modifier letter circumflex accent
210
- 137 => 8240, # per mille sign
211
- 138 => 352, # latin capital letter s with caron
212
- 139 => 8249, # single left-pointing angle quotation mark
213
- 140 => 338, # latin capital ligature oe
214
- 142 => 381, # latin capital letter z with caron
215
- 145 => 8216, # left single quotation mark
216
- 146 => 8217, # right single quotation mark
217
- 147 => 8220, # left double quotation mark
218
- 148 => 8221, # right double quotation mark
219
- 149 => 8226, # bullet
220
- 150 => 8211, # en dash
221
- 151 => 8212, # em dash
222
- 152 => 732, # small tilde
223
- 153 => 8482, # trade mark sign
224
- 154 => 353, # latin small letter s with caron
225
- 155 => 8250, # single right-pointing angle quotation mark
226
- 156 => 339, # latin small ligature oe
227
- 158 => 382, # latin small letter z with caron
228
- 159 => 376} # latin capital letter y with diaeresis
229
-
187
+ unless defined?(Builder::XChar)
188
+ # http://intertwingly.net/stories/2005/09/28/xchar.rb
189
+ module XChar
190
+ # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
191
+ CP1252 = {
192
+ 128 => 8364, # euro sign
193
+ 130 => 8218, # single low-9 quotation mark
194
+ 131 => 402, # latin small letter f with hook
195
+ 132 => 8222, # double low-9 quotation mark
196
+ 133 => 8230, # horizontal ellipsis
197
+ 134 => 8224, # dagger
198
+ 135 => 8225, # double dagger
199
+ 136 => 710, # modifier letter circumflex accent
200
+ 137 => 8240, # per mille sign
201
+ 138 => 352, # latin capital letter s with caron
202
+ 139 => 8249, # single left-pointing angle quotation mark
203
+ 140 => 338, # latin capital ligature oe
204
+ 142 => 381, # latin capital letter z with caron
205
+ 145 => 8216, # left single quotation mark
206
+ 146 => 8217, # right single quotation mark
207
+ 147 => 8220, # left double quotation mark
208
+ 148 => 8221, # right double quotation mark
209
+ 149 => 8226, # bullet
210
+ 150 => 8211, # en dash
211
+ 151 => 8212, # em dash
212
+ 152 => 732, # small tilde
213
+ 153 => 8482, # trade mark sign
214
+ 154 => 353, # latin small letter s with caron
215
+ 155 => 8250, # single right-pointing angle quotation mark
216
+ 156 => 339, # latin small ligature oe
217
+ 158 => 382, # latin small letter z with caron
218
+ 159 => 376 # latin capital letter y with diaeresis
219
+ }
230
220
  # http://www.w3.org/TR/REC-xml/#dt-chardata
231
221
  PREDEFINED = {
232
222
  38 => '&amp;', # ampersand
233
223
  60 => '&lt;', # left angle bracket
234
- 62 => '&gt;'} # right angle bracket
235
-
236
- # http://www.w3.org/TR/REC-xml/#charsets
237
- VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
238
- (0xE000..0xFFFD), (0x10000..0x10FFFF)]
239
- end
224
+ 62 => '&gt;' # right angle bracket
225
+ }
226
+ # http://www.w3.org/TR/REC-xml/#charsets
227
+ VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
228
+ (0xE000..0xFFFD), (0x10000..0x10FFFF)]
229
+ end
240
230
 
241
- class Fixnum
242
- # xml escaped version of chr
243
- def xchr
244
- n = XChar::CP1252[self] || self
245
- n = 42 unless XChar::VALID.find {|range| range.include? n}
246
- XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
231
+ class Fixnum
232
+ # xml escaped version of chr
233
+ def xchr
234
+ n = XChar::CP1252[self] || self
235
+ case n when *XChar::VALID
236
+ XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
237
+ else
238
+ '*'
239
+ end
240
+ end
247
241
  end
248
- end
249
242
 
250
- class String
251
- alias :old_index :index
252
- def to_xs
253
- unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
254
- rescue
255
- unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
243
+ class String
244
+ alias :old_index :index
245
+ def to_xs
246
+ unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
247
+ rescue
248
+ unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
249
+ end
256
250
  end
257
251
  end
258
-