RubyGems - rfeedparser - Versions diffs - 0.9.931 → 0.9.940 - Mend

rfeedparser 0.9.931 → 0.9.940

Files changed (40) hide show

data/lib/rfeedparser.rb +143 -58
data/lib/rfeedparser/aliases.rb +1 -1
data/lib/rfeedparser/better_attributelist.rb +11 -11
data/lib/rfeedparser/better_sgmlparser.rb +1 -1
data/lib/rfeedparser/encoding_helpers.rb +120 -127
data/lib/rfeedparser/feedparserdict.rb +30 -20
data/lib/rfeedparser/forgiving_uri.rb +9 -7
data/lib/rfeedparser/markup_helpers.rb +11 -14
data/lib/rfeedparser/parser_mixin.rb +16 -11
data/lib/rfeedparser/parsers.rb +1 -2
data/lib/rfeedparser/scrub.rb +95 -90
data/lib/rfeedparser/time_helpers.rb +379 -379
data/lib/rfeedparser/utilities.rb +23 -0
data/tests/rfeedparser_test_helper.rb +262 -0
data/tests/rfeedparserserver.rb +3 -109
data/tests/rfeedparsertest.rb +6 -165
data/tests/rfponly/http/200.xml +30 -0
data/tests/rfponly/http/220.xml +28 -0
data/tests/rfponly/http/300.xml +8 -0
data/tests/rfponly/http/300.xml_redirect +25 -0
data/tests/rfponly/http/301.xml +8 -0
data/tests/rfponly/http/301.xml_redirect +25 -0
data/tests/rfponly/http/302.xml +8 -0
data/tests/rfponly/http/302.xml_redirect +25 -0
data/tests/rfponly/http/307.xml +8 -0
data/tests/rfponly/http/307.xml_redirect +25 -0
data/tests/rfponly/http/320.xml +8 -0
data/tests/rfponly/http/320.xml_redirect +25 -0
data/tests/rfponly/http/400.xml +7 -0
data/tests/rfponly/http/404.xml +7 -0
data/tests/rfponly/http/410.xml +7 -0
data/tests/rfponly/http/420.xml +7 -0
data/tests/rfponly/http/500.xml +7 -0
data/tests/rfponly/http/520.xml +7 -0
data/tests/rfponly/http/etag.xml +28 -0
data/tests/rfponly/http/lastmodified.xml +29 -0
data/tests/rfponly/wellformed/date/feed_modified_with_negative_numeric_timezone.xml +9 -0
data/tests/rfponly/wellformed/date/feed_modified_with_positive_numeric_timezone.xml +9 -0
data/tests/rfponly/wellformed/scrub/hpricot_self_closing_tag_workaround.xml +11 -0
metadata +31 -3

@@ -19,11 +19,14 @@ require 'rubygems'
 require 'base64'
 require 'iconv'
+gem 'hpricot', "=0.6"
+require 'hpricot'
 gem 'character-encodings', ">=0.2.0"
 gem 'htmltools', ">=1.10"
 gem 'htmlentities', ">=4.0.0"
 gem 'activesupport', ">=1.4.1"
 gem 'rchardet', ">=1.0"
 require 'xml/saxdriver' # calling expat through the xmlparser gem
 require 'rchardet'
@@ -40,23 +43,21 @@ $debug = false
 $compatible = true
 $LOAD_PATH << File.expand_path(File.dirname(__FILE__))
+require 'rfeedparser/utilities'
 require 'rfeedparser/forgiving_uri'
-require 'rfeedparser/aliases'
-require 'rfeedparser/encoding_helpers'
 require 'rfeedparser/better_sgmlparser'
 require 'rfeedparser/better_attributelist'
-require 'rfeedparser/scrub'
-require 'rfeedparser/time_helpers'
 require 'rfeedparser/feedparserdict'
 require 'rfeedparser/parser_mixin'
 require 'rfeedparser/parsers'
-require 'rfeedparser/markup_helpers'
-include FeedParserUtilities
 module FeedParser
-  Version = "0.9.931"
+  extend FeedParserUtilities
+  Version = "0.9.940"
   License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
@@ -81,18 +82,19 @@ module FeedParser
   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   POSSIBILITY OF SUCH DAMAGE."""
-  Author = "Jeff Hodges <http://somethingsimilar.com>"
-  Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
+  Translator_From_Python_To_Ruby = "Jeff Hodges <http://somethingsimilar.com>"
+  Author = "Mark Pilgrim <http://diveintomark.org/>"
   Contributors = [  "Jason Diamond <http://injektilo.org/>",
     "John Beimler <http://john.beimler.org/>",
     "Fazal Majid <http://www.majid.info/mylos/weblog/>",
     "Aaron Swartz <http://aaronsw.com/>",
-    "Kevin Marks <http://epeus.blogspot.com/>"
+    "Kevin Marks <http://epeus.blogspot.com/>",
+    "Jesse Newland <http://jnewland.com/>"
   ]
   # HTTP "User-Agent" header to send to servers when downloading feeds.
   # If you are embedding feedparser in a larger application, you should
   # change this to your application name and URL.
-  USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % @version
+  USER_AGENT = "rFeedParser/#{Version} +http://rfeedparser.rubyforge.org/"
   # HTTP "Accept" header to send to servers when downloading feeds.  If you don't
   # want to send an Accept header, set this to None.
@@ -141,60 +143,139 @@ module FeedParser
     'hotrss' => 'Hot RSS'
   }
-  def parse(furi, options = {})
-    furi.strip!
-    # Parse a feed from a URL, file, stream or string
-    $compatible = options[:compatible].nil? ? $compatible : options[:compatible]# Use the default compatibility if compatible is nil
+  # Accepted in options: :agent, :modified, :etag, and :referrer
+  def open_resource(url_file_stream_or_string, options)
+    options[:handlers] ||= []
+    if url_file_stream_or_string.respond_to?(:read)
+      return url_file_stream_or_string
+    elsif url_file_stream_or_string == '-'
+      return $stdin
+    end
+    # open-uri freaks out if there's leading spaces.
+    url_file_stream_or_string.strip!
+    furi = ForgivingURI.parse(url_file_stream_or_string)
+    if furi && ['http','https','ftp'].include?(furi.scheme)
+      auth = nil
+      if furi.host && furi.password
+        auth = Base64::encode64("#{furi.user}:#{furi.password}").strip
+        furi.password = nil
+        url_file_stream_or_string = furi.to_s
+      end
+      req_headers = {}
+      req_headers["User-Agent"] = options[:agent] || USER_AGENT
+      req_headers["If-None-Match"] = options[:etag] if options[:etag]
+      if options[:modified]
+        if options[:modified].is_a?(String)
+          req_headers["If-Modified-Since"] = parse_date(options[:modified]).httpdate
+        elsif options[:modified].is_a?(Time)
+          req_headers["If-Modified-Since"] = options[:modified].httpdate
+        elsif options[:modified].is_a?(Array)
+          req_headers["If-Modified-Since"] = py2rtime(options[:modified]).httpdate
+        end
+      end
+      req_headers["Referer"] = options[:referrer] if options[:referrer]
+      req_headers["Accept-encoding"] = 'gzip, deflate' # FIXME make tests
+      req_headers["Authorization"] = "Basic #{auth}" if auth
+      req_headers['Accept'] = ACCEPT_HEADER if ACCEPT_HEADER
+      req_headers['A-IM'] = 'feed' # RFC 3229 support
+      begin
+        return open(url_file_stream_or_string, req_headers)
+      rescue OpenURI::HTTPError => e
+        return e.io
+      rescue
+      end
+    end
+    # try to open with native open function (if url_file_stream_or_string is a filename)
+    begin
+      return open(url_file_stream_or_string)
+    rescue
+    end
+    # treat url_file_stream_or_string as string
+    return StringIO.new(url_file_stream_or_string.to_s)
+  end
+  module_function(:open_resource)
+  # Parse a feed from a URL, file, stream or string
+  def parse(url_file_stream_or_string, options = {})
+    # Use the default compatibility if compatible is nil
+    $compatible = options[:compatible].nil? ? $compatible : options[:compatible]
     strictklass = options[:strict] || StrictFeedParser
     looseklass = options[:loose] || LooseFeedParser
+    options[:handlers] = options[:handlers] || []
     result = FeedParserDict.new
     result['feed'] = FeedParserDict.new
     result['entries'] = []
-    if options[:modified]
-      options[:modified] = Time.parse(options[:modified]).utc.rfc2822
-      # FIXME this ignores all of our time parsing work.  Does it matter?
-    end
     result['bozo'] = false
-    handlers = options[:handlers]
-    if handlers.class != Array # FIXME why does this happen?
-      handlers = [handlers]
-    end
     begin
-      parsed_furi = ForgivingURI.parse(furi)
-      if [nil, "file"].include? parsed_furi.scheme
-        $stderr << "Opening local file #{furi}\n" if $debug
-        f = open(parsed_furi.path) # OpenURI doesn't behave well when passing HTTP options to a file.
-      else
-        # And when you do pass them, make sure they aren't just nil (this still true?)
-        newd = {}
-        newd["If-None-Match"] = options[:etag] unless options[:etag].nil?
-        newd["If-Modified-Since"] = options[:modified] unless options[:modified].nil?
-        newd["User-Agent"] = (options[:agent] || USER_AGENT).to_s
-        newd["Referer"] = options[:referrer] unless options[:referrer].nil?
-        newd["Content-Location"] = options[:content_location] unless options[:content_location].nil?
-        newd["Content-Language"] = options[:content_language] unless options[:content_language].nil?
-        newd["Content-type"] = options[:content_type] unless options[:content_type].nil?
-        f = open(furi, newd)
-      end
+      f = open_resource(url_file_stream_or_string, options)
       data = f.read
-      f.close
     rescue => e
-      $stderr << "Rescued in parse: "+e.to_s+"\n" if $debug # My addition
       result['bozo'] = true
       result['bozo_exception'] = e
       data = ''
       f = nil
     end
-      if f.respond_to?(:meta)
-        result['etag'] = f.meta['etag']
-        result['modified'] = f.meta['modified']
-        result['url'] = f.base_uri.to_s
-        result['status'] = f.status[0] || 200
-        result['headers'] = f.meta
+    if f and !data.blank? and f.respond_to?(:meta)
+      # if feed is gzip-compressed, decompress it
+      if f.meta['content-encoding'] == 'gzip'
+        begin
+          gz =  Zlib::GzipReader.new(StringIO.new(data))
+          data = gz.read
+          gz.close
+        rescue => e
+          # Some feeds claim to be gzipped but they're not, so
+          # we get garbage.  Ideally, we should re-request the
+          # feed without the 'Accept-encoding: gzip' header,
+          # but we don't.
+          result['bozo'] = true
+          result['bozo_exception'] = e
+          data = ''
+        end
+      elsif f.meta['content-encoding'] == 'deflate'
+        begin
+          data = Zlib::Deflate.inflate(data)
+        rescue => e
+          result['bozo'] = true
+          result['bozo_exception'] = e
+          data = ''
+        end
       end
+    end
+    if f.respond_to?(:meta)
+      result['etag'] = f.meta['etag']
+      result['modified_time'] = parse_date(f.meta['last-modified'])
+      result['modified'] = extract_tuple(result['modified_time'])
+      result['headers'] = f.meta
+    end
+    # FIXME open-uri does not return a non-nil base_uri in its HTTPErrors.
+    if f.respond_to?(:base_uri)
+      result['href'] = f.base_uri.to_s # URI => String
+      result['status'] = '200'
+    end
+    if f.respond_to?(:status)
+      result['status'] = f.status[0]
+    end
     # there are four encodings to keep track of:
@@ -204,7 +285,7 @@ module FeedParser
     # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
     http_headers = result['headers'] || {}
     result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type =
-    self.getCharacterEncoding(f,data)
+    getCharacterEncoding(http_headers,data)
     if not http_headers.blank? and not acceptable_content_type
       unless http_headers['content-type'].nil?
@@ -215,7 +296,7 @@ module FeedParser
       result['bozo'] = true
       result['bozo_exception'] = NonXMLContentType.new(bozo_message) # I get to care about this, cuz Mark says I should.
     end
-    result['version'], data = self.stripDoctype(data)
+    result['version'], data = stripDoctype(data)
     baseuri = http_headers['content-location'] || result['href']
     baselang = http_headers['content-language']
@@ -244,7 +325,7 @@ module FeedParser
       next if tried_encodings.include? proposed_encoding
       tried_encodings << proposed_encoding
       begin
-        data = self.toUTF8(data, proposed_encoding)
+        data = toUTF8(data, proposed_encoding)
         known_encoding = use_strict_parser = true
         break
       rescue
@@ -256,7 +337,7 @@ module FeedParser
         proposed_encoding = CharDet.detect(data)['encoding']
         if proposed_encoding and not tried_encodings.include?proposed_encoding
           tried_encodings << proposed_encoding
-          data = self.toUTF8(data, proposed_encoding)
+          data = toUTF8(data, proposed_encoding)
           known_encoding = use_strict_parser = true
         end
       rescue
@@ -270,7 +351,7 @@ module FeedParser
       begin
         proposed_encoding = 'utf-8'
         tried_encodings << proposed_encoding
-        data = self.toUTF8(data, proposed_encoding)
+        data = toUTF8(data, proposed_encoding)
         known_encoding = use_strict_parser = true
       rescue
       end
@@ -280,7 +361,7 @@ module FeedParser
       begin
         proposed_encoding = 'windows-1252'
         tried_encodings << proposed_encoding
-        data = self.toUTF8(data, proposed_encoding)
+        data = toUTF8(data, proposed_encoding)
         known_encoding = use_strict_parser = true
       rescue
       end
@@ -292,7 +373,7 @@ module FeedParser
     #  begin
     #    proposed_encoding = 'iso-8859-2'
     #    tried_encodings << proposed_encoding
-    #    data = self.toUTF8(data, proposed_encoding)
+    #    data = toUTF8(data, proposed_encoding)
     #    known_encoding = use_strict_parser = true
     #  rescue
     #  end
@@ -334,9 +415,9 @@ module FeedParser
       end
     end
     if not use_strict_parser
+      $stderr << "Using LooseFeed\n\n" if $debug
       feedparser = looseklass.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
       feedparser.parse(data)
-      $stderr << "Using LooseFeed\n\n" if $debug
     end
     result['feed'] = feedparser.feeddata
     result['entries'] = feedparser.entries
@@ -347,6 +428,10 @@ module FeedParser
   module_function(:parse)
 end # End FeedParser module
+def rfp(url_file_stream_or_string, options={})
+  FeedParser.parse(url_file_stream_or_string, options)
+end
 class Serializer
   def initialize(results)
     @results = results

data/lib/rfeedparser/aliases.rb CHANGED

@@ -1,4 +1,4 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
 module FeedParserUtilities
   # Adapted from python2.4's encodings/aliases.py

data/lib/rfeedparser/better_attributelist.rb CHANGED

@@ -1,4 +1,4 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
 # Add some helper methods to make AttributeList (all of those damn attrs
 # and attrsD used by StrictFeedParser) act more like a Hash.
@@ -8,31 +8,31 @@ module XML
   module SAX
     module AttributeList # in xml/sax.rb
       def [](key)
-	getValue(key)
+        getValue(key)
       end
       def each(&blk)
-	(0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
+        (0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
       end
       def each_key(&blk)
-	(0...getLength).each{|pos| yield getName(pos) }
+        (0...getLength).each{|pos| yield getName(pos) }
       end
       def each_value(&blk)
-	(0...getLength).each{|pos| yield getValue(pos) }
+        (0...getLength).each{|pos| yield getValue(pos) }
       end
       def to_a # Rather use collect? grep for to_a.collect
-	l = []
-	each{|k,v| l << [k,v]}
-	return l
+        l = []
+        each{|k,v| l << [k,v]}
+        return l
       end
       def to_s
-	l = []
-	each{|k,v| l << "#{k} => #{v}"}
-	"{ "+l.join(", ")+" }"
+        l = []
+        each{|k,v| l << "#{k} => #{v}"}
+        "{ "+l.join(", ")+" }"
       end
     end
   end

data/lib/rfeedparser/better_sgmlparser.rb CHANGED

@@ -1,4 +1,4 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
 class BetterSGMLParserError < Exception; end;

data/lib/rfeedparser/encoding_helpers.rb CHANGED

@@ -1,4 +1,4 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
 module FeedParserUtilities
@@ -26,73 +26,68 @@ module FeedParserUtilities
   def _ebcdic_to_ascii(s)
     return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
   end
-  def getCharacterEncoding(feed, xml_data)
+  def getCharacterEncoding(http_headers, xml_data)
     # Get the character encoding of the XML document
     $stderr << "In getCharacterEncoding\n" if $debug
     sniffed_xml_encoding = nil
     xml_encoding = nil
     true_encoding = nil
-    begin
-      http_headers = feed.meta
-      http_content_type = feed.meta['content-type'].split(';')[0]
-      encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
-      http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
-      http_encoding = nil if http_encoding.empty?
+    http_content_type, charset = http_headers['content-type'].to_s.split(';',2)
+    encoding_regexp = /\s*charset\s*=\s*(?:"|')?(.*?)(?:"|')?\s*$/
+    http_encoding = charset.to_s.scan(encoding_regexp).flatten[0]
+    http_encoding = nil if http_encoding.blank?
       # FIXME Open-Uri returns iso8859-1 if there is no charset header,
       # but that doesn't pass the tests. Open-Uri claims its following
       # the right RFC. Are they wrong or do we need to change the tests?
-    rescue NoMethodError
-      http_headers = {}
-      http_content_type = nil
-      http_encoding = nil
-    end
     # Must sniff for non-ASCII-compatible character encodings before
     # searching for XML declaration.  This heuristic is defined in
     # section F of the XML specification:
     # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
     begin
       if xml_data[0..3] == "\x4c\x6f\xa7\x94"
-	# EBCDIC
-	xml_data = _ebcdic_to_ascii(xml_data)
+        # EBCDIC
+        xml_data = __ebcdic_to_ascii(xml_data)
       elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
-	# UTF-16BE
-	sniffed_xml_encoding = 'utf-16be'
-	xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
+        # UTF-16BE
+        sniffed_xml_encoding = 'utf-16be'
+        xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
       elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
-	# UTF-16BE with BOM
-	sniffed_xml_encoding = 'utf-16be'
-	xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
+        # UTF-16BE with BOM
+        sniffed_xml_encoding = 'utf-16be'
+        xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
       elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
-	# UTF-16LE
-	sniffed_xml_encoding = 'utf-16le'
-	xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
+        # UTF-16LE
+        sniffed_xml_encoding = 'utf-16le'
+        xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
       elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
-	# UTF-16LE with BOM
-	sniffed_xml_encoding = 'utf-16le'
-	xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
+        # UTF-16LE with BOM
+        sniffed_xml_encoding = 'utf-16le'
+        xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
       elsif xml_data[0..3] == "\x00\x00\x00\x3c"
-	# UTF-32BE
-	sniffed_xml_encoding = 'utf-32be'
-	xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
+        # UTF-32BE
+        sniffed_xml_encoding = 'utf-32be'
+        xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
       elsif xml_data[0..3] == "\x3c\x00\x00\x00"
-	# UTF-32LE
-	sniffed_xml_encoding = 'utf-32le'
-	xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
+        # UTF-32LE
+        sniffed_xml_encoding = 'utf-32le'
+        xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
       elsif xml_data[0..3] == "\x00\x00\xfe\xff"
-	# UTF-32BE with BOM
-	sniffed_xml_encoding = 'utf-32be'
-	xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
+        # UTF-32BE with BOM
+        sniffed_xml_encoding = 'utf-32be'
+        xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
       elsif xml_data[0..3] == "\xff\xfe\x00\x00"
-	# UTF-32LE with BOM
-	sniffed_xml_encoding = 'utf-32le'
-	xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
+        # UTF-32LE with BOM
+        sniffed_xml_encoding = 'utf-32le'
+        xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
       elsif xml_data[0..2] == "\xef\xbb\xbf"
-	# UTF-8 with BOM
-	sniffed_xml_encoding = 'utf-8'
-	xml_data = xml_data[3..-1]
+        # UTF-8 with BOM
+        sniffed_xml_encoding = 'utf-8'
+        xml_data = xml_data[3..-1]
       else
-	# ASCII-compatible
+        # ASCII-compatible
       end
       xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
     rescue
@@ -102,7 +97,7 @@ module FeedParserUtilities
       xml_encoding = xml_encoding_match[1].downcase
       xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
       if sniffed_xml_encoding and xencodings.include?xml_encoding
-	xml_encoding = sniffed_xml_encoding
+        xml_encoding = sniffed_xml_encoding
       end
     end
@@ -125,54 +120,48 @@ module FeedParserUtilities
     end
     return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
   end
   def toUTF8(data, encoding)
-=begin
-    Changes an XML data stream on the fly to specify a new encoding
-    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
-    encoding is a string recognized by encodings.aliases
-=end
     $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
     # NOTE we must use double quotes when dealing with \x encodings!
     if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
       if $debug
-	$stderr << "stripping BOM\n"
-	if encoding != 'utf-16be'
-	  $stderr << "string utf-16be instead\n"
-	end
+        $stderr << "stripping BOM\n"
+        if encoding != 'utf-16be'
+          $stderr << "string utf-16be instead\n"
+        end
       end
       encoding = 'utf-16be'
       data = data[2..-1]
     elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
       if $debug
-	$stderr << "stripping BOM\n"
-	$stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
+        $stderr << "stripping BOM\n"
+        $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
       end
       encoding = 'utf-16le'
       data = data[2..-1]
     elsif (data[0..2] == "\xef\xbb\xbf")
       if $debug
-	$stderr << "stripping BOM\n"
-	$stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
+        $stderr << "stripping BOM\n"
+        $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
       end
       encoding = 'utf-8'
       data = data[3..-1]
     elsif (data[0..3] == "\x00\x00\xfe\xff")
       if $debug
-	$stderr << "stripping BOM\n"
-	if encoding != 'utf-32be'
-	  $stderr << "trying utf-32be instead\n"
-	end
+        $stderr << "stripping BOM\n"
+        if encoding != 'utf-32be'
+          $stderr << "trying utf-32be instead\n"
+        end
       end
       encoding = 'utf-32be'
       data = data[4..-1]
     elsif (data[0..3] == "\xff\xfe\x00\x00")
       if $debug
-	$stderr << "stripping BOM\n"
-	if encoding != 'utf-32le'
-	  $stderr << "trying utf-32le instead\n"
-	end
+        $stderr << "stripping BOM\n"
+        if encoding != 'utf-32le'
+          $stderr << "trying utf-32le instead\n"
+        end
       end
       encoding = 'utf-32le'
       data = data[4..-1]
@@ -184,75 +173,79 @@ module FeedParserUtilities
     end
     $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
     declmatch = /^<\?xml[^>]*?>/
-      newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
-      if declmatch =~ newdata
-        newdata.sub!(declmatch, newdecl)
-      else
-        newdata = newdecl + "\n" + newdata
-      end
+    newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
+    if declmatch =~ newdata
+      newdata.sub!(declmatch, newdecl)
+    else
+      newdata = newdecl + "\n" + newdata
+    end
     return newdata
   end
 end
-# http://intertwingly.net/stories/2005/09/28/xchar.rb
-module XChar
-  # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
-  CP1252 = {
-    128 => 8364, # euro sign
-    130 => 8218, # single low-9 quotation mark
-    131 =>  402, # latin small letter f with hook
-    132 => 8222, # double low-9 quotation mark
-    133 => 8230, # horizontal ellipsis
-    134 => 8224, # dagger
-    135 => 8225, # double dagger
-    136 =>  710, # modifier letter circumflex accent
-    137 => 8240, # per mille sign
-    138 =>  352, # latin capital letter s with caron
-    139 => 8249, # single left-pointing angle quotation mark
-    140 =>  338, # latin capital ligature oe
-    142 =>  381, # latin capital letter z with caron
-    145 => 8216, # left single quotation mark
-    146 => 8217, # right single quotation mark
-    147 => 8220, # left double quotation mark
-    148 => 8221, # right double quotation mark
-    149 => 8226, # bullet
-    150 => 8211, # en dash
-    151 => 8212, # em dash
-    152 =>  732, # small tilde
-    153 => 8482, # trade mark sign
-    154 =>  353, # latin small letter s with caron
-    155 => 8250, # single right-pointing angle quotation mark
-    156 =>  339, # latin small ligature oe
-    158 =>  382, # latin small letter z with caron
-    159 =>  376} # latin capital letter y with diaeresis
+unless defined?(Builder::XChar)
+  # http://intertwingly.net/stories/2005/09/28/xchar.rb
+  module XChar
+    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
+    CP1252 = {
+      128 => 8364, # euro sign
+      130 => 8218, # single low-9 quotation mark
+      131 =>  402, # latin small letter f with hook
+      132 => 8222, # double low-9 quotation mark
+      133 => 8230, # horizontal ellipsis
+      134 => 8224, # dagger
+      135 => 8225, # double dagger
+      136 =>  710, # modifier letter circumflex accent
+      137 => 8240, # per mille sign
+      138 =>  352, # latin capital letter s with caron
+      139 => 8249, # single left-pointing angle quotation mark
+      140 =>  338, # latin capital ligature oe
+      142 =>  381, # latin capital letter z with caron
+      145 => 8216, # left single quotation mark
+      146 => 8217, # right single quotation mark
+      147 => 8220, # left double quotation mark
+      148 => 8221, # right double quotation mark
+      149 => 8226, # bullet
+      150 => 8211, # en dash
+      151 => 8212, # em dash
+      152 =>  732, # small tilde
+      153 => 8482, # trade mark sign
+      154 =>  353, # latin small letter s with caron
+      155 => 8250, # single right-pointing angle quotation mark
+      156 =>  339, # latin small ligature oe
+      158 =>  382, # latin small letter z with caron
+      159 =>  376 # latin capital letter y with diaeresis
+    }
     # http://www.w3.org/TR/REC-xml/#dt-chardata
     PREDEFINED = {
       38 => '&amp;', # ampersand
       60 => '&lt;',  # left angle bracket
-      62 => '&gt;'}  # right angle bracket
-      # http://www.w3.org/TR/REC-xml/#charsets
-      VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
-	(0xE000..0xFFFD), (0x10000..0x10FFFF)]
-end
+      62 => '&gt;'  # right angle bracket
+    }
+    # http://www.w3.org/TR/REC-xml/#charsets
+    VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
+    (0xE000..0xFFFD), (0x10000..0x10FFFF)]
+  end
-class Fixnum
-  # xml escaped version of chr
-  def xchr
-    n = XChar::CP1252[self] || self
-    n = 42 unless XChar::VALID.find {|range| range.include? n}
-    XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
+  class Fixnum
+    # xml escaped version of chr
+    def xchr
+      n = XChar::CP1252[self] || self
+      case n when *XChar::VALID
+        XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
+      else
+        '*'
+      end
+    end
   end
-end
-class String
-  alias :old_index :index
-  def to_xs
-    unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
-  rescue
-    unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
+  class String
+    alias :old_index :index
+    def to_xs
+      unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
+    rescue
+      unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
+    end
   end
 end