RubyGems - rfeedparser - Versions diffs - 0.9.931 → 0.9.940 - Mend

rfeedparser 0.9.931 → 0.9.940

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.

Files changed (40)

data/lib/rfeedparser.rb +143 -58
data/lib/rfeedparser/aliases.rb +1 -1
data/lib/rfeedparser/better_attributelist.rb +11 -11
data/lib/rfeedparser/better_sgmlparser.rb +1 -1
data/lib/rfeedparser/encoding_helpers.rb +120 -127
data/lib/rfeedparser/feedparserdict.rb +30 -20
data/lib/rfeedparser/forgiving_uri.rb +9 -7
data/lib/rfeedparser/markup_helpers.rb +11 -14
data/lib/rfeedparser/parser_mixin.rb +16 -11
data/lib/rfeedparser/parsers.rb +1 -2
data/lib/rfeedparser/scrub.rb +95 -90
data/lib/rfeedparser/time_helpers.rb +379 -379
data/lib/rfeedparser/utilities.rb +23 -0
data/tests/rfeedparser_test_helper.rb +262 -0
data/tests/rfeedparserserver.rb +3 -109
data/tests/rfeedparsertest.rb +6 -165
data/tests/rfponly/http/200.xml +30 -0
data/tests/rfponly/http/220.xml +28 -0
data/tests/rfponly/http/300.xml +8 -0
data/tests/rfponly/http/300.xml_redirect +25 -0
data/tests/rfponly/http/301.xml +8 -0
data/tests/rfponly/http/301.xml_redirect +25 -0
data/tests/rfponly/http/302.xml +8 -0
data/tests/rfponly/http/302.xml_redirect +25 -0
data/tests/rfponly/http/307.xml +8 -0
data/tests/rfponly/http/307.xml_redirect +25 -0
data/tests/rfponly/http/320.xml +8 -0
data/tests/rfponly/http/320.xml_redirect +25 -0
data/tests/rfponly/http/400.xml +7 -0
data/tests/rfponly/http/404.xml +7 -0
data/tests/rfponly/http/410.xml +7 -0
data/tests/rfponly/http/420.xml +7 -0
data/tests/rfponly/http/500.xml +7 -0
data/tests/rfponly/http/520.xml +7 -0
data/tests/rfponly/http/etag.xml +28 -0
data/tests/rfponly/http/lastmodified.xml +29 -0
data/tests/rfponly/wellformed/date/feed_modified_with_negative_numeric_timezone.xml +9 -0
data/tests/rfponly/wellformed/date/feed_modified_with_positive_numeric_timezone.xml +9 -0
data/tests/rfponly/wellformed/scrub/hpricot_self_closing_tag_workaround.xml +11 -0
metadata +31 -3

data/lib/rfeedparser.rb CHANGED

@@ -19,11 +19,14 @@ require 'rubygems'
 require 'base64'
 require 'iconv'
+gem 'hpricot', "=0.6"
+require 'hpricot'
 gem 'character-encodings', ">=0.2.0"
 gem 'htmltools', ">=1.10"
 gem 'htmlentities', ">=4.0.0"
 gem 'activesupport', ">=1.4.1"
 gem 'rchardet', ">=1.0"
 require 'xml/saxdriver' # calling expat through the xmlparser gem
 require 'rchardet'
@@ -40,23 +43,21 @@ $debug = false
 $compatible = true
 $LOAD_PATH << File.expand_path(File.dirname(__FILE__))
+require 'rfeedparser/utilities'
 require 'rfeedparser/forgiving_uri'
-require 'rfeedparser/aliases'
-require 'rfeedparser/encoding_helpers'
 require 'rfeedparser/better_sgmlparser'
 require 'rfeedparser/better_attributelist'
-require 'rfeedparser/scrub'
-require 'rfeedparser/time_helpers'
 require 'rfeedparser/feedparserdict'
 require 'rfeedparser/parser_mixin'
 require 'rfeedparser/parsers'
-require 'rfeedparser/markup_helpers'
-include FeedParserUtilities
 module FeedParser
-  Version = "0.9.931"
+  extend FeedParserUtilities
+  Version = "0.9.940"
   License = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
@@ -81,18 +82,19 @@ module FeedParser
   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   POSSIBILITY OF SUCH DAMAGE."""
-  Author = "Jeff Hodges <http://somethingsimilar.com>"
-  Copyright_Holder = "Mark Pilgrim <http://diveintomark.org/>"
+  Translator_From_Python_To_Ruby = "Jeff Hodges <http://somethingsimilar.com>"
+  Author = "Mark Pilgrim <http://diveintomark.org/>"
   Contributors = [  "Jason Diamond <http://injektilo.org/>",
     "John Beimler <http://john.beimler.org/>",
     "Fazal Majid <http://www.majid.info/mylos/weblog/>",
     "Aaron Swartz <http://aaronsw.com/>",
-    "Kevin Marks <http://epeus.blogspot.com/>"
+    "Kevin Marks <http://epeus.blogspot.com/>",
+    "Jesse Newland <http://jnewland.com/>"
   ]
   # HTTP "User-Agent" header to send to servers when downloading feeds.
   # If you are embedding feedparser in a larger application, you should
   # change this to your application name and URL.
-  USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % @version
+  USER_AGENT = "rFeedParser/#{Version} +http://rfeedparser.rubyforge.org/"
   # HTTP "Accept" header to send to servers when downloading feeds.  If you don't
   # want to send an Accept header, set this to None.
@@ -141,60 +143,139 @@ module FeedParser
     'hotrss' => 'Hot RSS'
   }
-  def parse(furi, options = {})
-    furi.strip!
-    # Parse a feed from a URL, file, stream or string
-    $compatible = options[:compatible].nil? ? $compatible : options[:compatible]# Use the default compatibility if compatible is nil
+  # Accepted in options: :agent, :modified, :etag, and :referrer
+  def open_resource(url_file_stream_or_string, options)
+    options[:handlers] ||= []
+    if url_file_stream_or_string.respond_to?(:read)
+      return url_file_stream_or_string
+    elsif url_file_stream_or_string == '-'
+      return $stdin
+    end
+    # open-uri freaks out if there's leading spaces.
+    url_file_stream_or_string.strip!
+    furi = ForgivingURI.parse(url_file_stream_or_string)
+    if furi && ['http','https','ftp'].include?(furi.scheme)
+      auth = nil
+      if furi.host && furi.password
+        auth = Base64::encode64("#{furi.user}:#{furi.password}").strip
+        furi.password = nil
+        url_file_stream_or_string = furi.to_s
+      end
+      req_headers = {}
+      req_headers["User-Agent"] = options[:agent] || USER_AGENT
+      req_headers["If-None-Match"] = options[:etag] if options[:etag]
+      if options[:modified]
+        if options[:modified].is_a?(String)
+          req_headers["If-Modified-Since"] = parse_date(options[:modified]).httpdate
+        elsif options[:modified].is_a?(Time)
+          req_headers["If-Modified-Since"] = options[:modified].httpdate
+        elsif options[:modified].is_a?(Array)
+          req_headers["If-Modified-Since"] = py2rtime(options[:modified]).httpdate
+        end
+      end
+      req_headers["Referer"] = options[:referrer] if options[:referrer]
+      req_headers["Accept-encoding"] = 'gzip, deflate' # FIXME make tests
+      req_headers["Authorization"] = "Basic #{auth}" if auth
+      req_headers['Accept'] = ACCEPT_HEADER if ACCEPT_HEADER
+      req_headers['A-IM'] = 'feed' # RFC 3229 support
+      begin
+        return open(url_file_stream_or_string, req_headers)
+      rescue OpenURI::HTTPError => e
+        return e.io
+      rescue
+      end
+    end
+    # try to open with native open function (if url_file_stream_or_string is a filename)
+    begin
+      return open(url_file_stream_or_string)
+    rescue
+    end
+    # treat url_file_stream_or_string as string
+    return StringIO.new(url_file_stream_or_string.to_s)
+  end
+  module_function(:open_resource)
+  # Parse a feed from a URL, file, stream or string
+  def parse(url_file_stream_or_string, options = {})
+    # Use the default compatibility if compatible is nil
+    $compatible = options[:compatible].nil? ? $compatible : options[:compatible]
     strictklass = options[:strict] || StrictFeedParser
     looseklass = options[:loose] || LooseFeedParser
+    options[:handlers] = options[:handlers] || []
     result = FeedParserDict.new
     result['feed'] = FeedParserDict.new
     result['entries'] = []
-    if options[:modified]
-      options[:modified] = Time.parse(options[:modified]).utc.rfc2822
-      # FIXME this ignores all of our time parsing work.  Does it matter?
-    end
     result['bozo'] = false
-    handlers = options[:handlers]
-    if handlers.class != Array # FIXME why does this happen?
-      handlers = [handlers]
-    end
     begin
-      parsed_furi = ForgivingURI.parse(furi)
-      if [nil, "file"].include? parsed_furi.scheme
-        $stderr << "Opening local file #{furi}\n" if $debug
-        f = open(parsed_furi.path) # OpenURI doesn't behave well when passing HTTP options to a file.
-      else
-        # And when you do pass them, make sure they aren't just nil (this still true?)
-        newd = {}
-        newd["If-None-Match"] = options[:etag] unless options[:etag].nil?
-        newd["If-Modified-Since"] = options[:modified] unless options[:modified].nil?
-        newd["User-Agent"] = (options[:agent] || USER_AGENT).to_s
-        newd["Referer"] = options[:referrer] unless options[:referrer].nil?
-        newd["Content-Location"] = options[:content_location] unless options[:content_location].nil?
-        newd["Content-Language"] = options[:content_language] unless options[:content_language].nil?
-        newd["Content-type"] = options[:content_type] unless options[:content_type].nil?
-        f = open(furi, newd)
-      end
+      f = open_resource(url_file_stream_or_string, options)
       data = f.read
-      f.close
     rescue => e
-      $stderr << "Rescued in parse: "+e.to_s+"\n" if $debug # My addition
       result['bozo'] = true
       result['bozo_exception'] = e
       data = ''
       f = nil
     end
-      if f.respond_to?(:meta)
-        result['etag'] = f.meta['etag']
-        result['modified'] = f.meta['modified']
-        result['url'] = f.base_uri.to_s
-        result['status'] = f.status[0] || 200
-        result['headers'] = f.meta
+    if f and !data.blank? and f.respond_to?(:meta)
+      # if feed is gzip-compressed, decompress it
+      if f.meta['content-encoding'] == 'gzip'
+        begin
+          gz =  Zlib::GzipReader.new(StringIO.new(data))
+          data = gz.read
+          gz.close
+        rescue => e
+          # Some feeds claim to be gzipped but they're not, so
+          # we get garbage.  Ideally, we should re-request the
+          # feed without the 'Accept-encoding: gzip' header,
+          # but we don't.
+          result['bozo'] = true
+          result['bozo_exception'] = e
+          data = ''
+        end
+      elsif f.meta['content-encoding'] == 'deflate'
+        begin
+          data = Zlib::Deflate.inflate(data)
+        rescue => e
+          result['bozo'] = true
+          result['bozo_exception'] = e
+          data = ''
+        end
       end
+    end
+    if f.respond_to?(:meta)
+      result['etag'] = f.meta['etag']
+      result['modified_time'] = parse_date(f.meta['last-modified'])
+      result['modified'] = extract_tuple(result['modified_time'])
+      result['headers'] = f.meta
+    end
+    # FIXME open-uri does not return a non-nil base_uri in its HTTPErrors.
+    if f.respond_to?(:base_uri)
+      result['href'] = f.base_uri.to_s # URI => String
+      result['status'] = '200'
+    end
+    if f.respond_to?(:status)
+      result['status'] = f.status[0]
+    end
     # there are four encodings to keep track of:
@@ -204,7 +285,7 @@ module FeedParser
     # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
     http_headers = result['headers'] || {}
     result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type =
-    self.getCharacterEncoding(f,data)
+    getCharacterEncoding(http_headers,data)
     if not http_headers.blank? and not acceptable_content_type
       unless http_headers['content-type'].nil?
@@ -215,7 +296,7 @@ module FeedParser
       result['bozo'] = true
       result['bozo_exception'] = NonXMLContentType.new(bozo_message) # I get to care about this, cuz Mark says I should.
     end
-    result['version'], data = self.stripDoctype(data)
+    result['version'], data = stripDoctype(data)
     baseuri = http_headers['content-location'] || result['href']
     baselang = http_headers['content-language']
@@ -244,7 +325,7 @@ module FeedParser
       next if tried_encodings.include? proposed_encoding
       tried_encodings << proposed_encoding
       begin
-        data = self.toUTF8(data, proposed_encoding)
+        data = toUTF8(data, proposed_encoding)
         known_encoding = use_strict_parser = true
         break
       rescue
@@ -256,7 +337,7 @@ module FeedParser
         proposed_encoding = CharDet.detect(data)['encoding']
         if proposed_encoding and not tried_encodings.include?proposed_encoding
           tried_encodings << proposed_encoding
-          data = self.toUTF8(data, proposed_encoding)
+          data = toUTF8(data, proposed_encoding)
           known_encoding = use_strict_parser = true
         end
       rescue
@@ -270,7 +351,7 @@ module FeedParser
       begin
         proposed_encoding = 'utf-8'
         tried_encodings << proposed_encoding
-        data = self.toUTF8(data, proposed_encoding)
+        data = toUTF8(data, proposed_encoding)
         known_encoding = use_strict_parser = true
       rescue
       end
@@ -280,7 +361,7 @@ module FeedParser
       begin
         proposed_encoding = 'windows-1252'
         tried_encodings << proposed_encoding
-        data = self.toUTF8(data, proposed_encoding)
+        data = toUTF8(data, proposed_encoding)
         known_encoding = use_strict_parser = true
       rescue
       end
@@ -292,7 +373,7 @@ module FeedParser
     #  begin
     #    proposed_encoding = 'iso-8859-2'
     #    tried_encodings << proposed_encoding
-    #    data = self.toUTF8(data, proposed_encoding)
+    #    data = toUTF8(data, proposed_encoding)
     #    known_encoding = use_strict_parser = true
     #  rescue
     #  end
@@ -334,9 +415,9 @@ module FeedParser
       end
     end
     if not use_strict_parser
+      $stderr << "Using LooseFeed\n\n" if $debug
       feedparser = looseklass.new(baseuri, baselang, (known_encoding and 'utf-8' or ''))
       feedparser.parse(data)
-      $stderr << "Using LooseFeed\n\n" if $debug
     end
     result['feed'] = feedparser.feeddata
     result['entries'] = feedparser.entries
@@ -347,6 +428,10 @@ module FeedParser
   module_function(:parse)
 end # End FeedParser module
+def rfp(url_file_stream_or_string, options={})
+  FeedParser.parse(url_file_stream_or_string, options)
+end
 class Serializer
   def initialize(results)
     @results = results

data/lib/rfeedparser/aliases.rb CHANGED

@@ -1,4 +1,4 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
 module FeedParserUtilities
   # Adapted from python2.4's encodings/aliases.py

data/lib/rfeedparser/better_attributelist.rb CHANGED

@@ -1,4 +1,4 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
 # Add some helper methods to make AttributeList (all of those damn attrs
 # and attrsD used by StrictFeedParser) act more like a Hash.
@@ -8,31 +8,31 @@ module XML
   module SAX
     module AttributeList # in xml/sax.rb
       def [](key)
-	getValue(key)
+        getValue(key)
       end
       def each(&blk)
-	(0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
+        (0...getLength).each{|pos| yield [getName(pos), getValue(pos)]}
       end
       def each_key(&blk)
-	(0...getLength).each{|pos| yield getName(pos) }
+        (0...getLength).each{|pos| yield getName(pos) }
       end
       def each_value(&blk)
-	(0...getLength).each{|pos| yield getValue(pos) }
+        (0...getLength).each{|pos| yield getValue(pos) }
       end
       def to_a # Rather use collect? grep for to_a.collect
-	l = []
-	each{|k,v| l << [k,v]}
-	return l
+        l = []
+        each{|k,v| l << [k,v]}
+        return l
       end
       def to_s
-	l = []
-	each{|k,v| l << "#{k} => #{v}"}
-	"{ "+l.join(", ")+" }"
+        l = []
+        each{|k,v| l << "#{k} => #{v}"}
+        "{ "+l.join(", ")+" }"
       end
     end
   end

data/lib/rfeedparser/better_sgmlparser.rb CHANGED

@@ -1,4 +1,4 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
 class BetterSGMLParserError < Exception; end;

data/lib/rfeedparser/encoding_helpers.rb CHANGED

@@ -1,4 +1,4 @@
-#!/usr/bin/ruby
+#!/usr/bin/env ruby
 module FeedParserUtilities
@@ -26,73 +26,68 @@ module FeedParserUtilities
   def _ebcdic_to_ascii(s)
     return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
   end
-  def getCharacterEncoding(feed, xml_data)
+  def getCharacterEncoding(http_headers, xml_data)
     # Get the character encoding of the XML document
     $stderr << "In getCharacterEncoding\n" if $debug
     sniffed_xml_encoding = nil
     xml_encoding = nil
     true_encoding = nil
-    begin
-      http_headers = feed.meta
-      http_content_type = feed.meta['content-type'].split(';')[0]
-      encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
-      http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
-      http_encoding = nil if http_encoding.empty?
+    http_content_type, charset = http_headers['content-type'].to_s.split(';',2)
+    encoding_regexp = /\s*charset\s*=\s*(?:"|')?(.*?)(?:"|')?\s*$/
+    http_encoding = charset.to_s.scan(encoding_regexp).flatten[0]
+    http_encoding = nil if http_encoding.blank?
       # FIXME Open-Uri returns iso8859-1 if there is no charset header,
       # but that doesn't pass the tests. Open-Uri claims its following
       # the right RFC. Are they wrong or do we need to change the tests?
-    rescue NoMethodError
-      http_headers = {}
-      http_content_type = nil
-      http_encoding = nil
-    end
     # Must sniff for non-ASCII-compatible character encodings before
     # searching for XML declaration.  This heuristic is defined in
     # section F of the XML specification:
     # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
     begin
       if xml_data[0..3] == "\x4c\x6f\xa7\x94"
-	# EBCDIC
-	xml_data = _ebcdic_to_ascii(xml_data)
+        # EBCDIC
+        xml_data = __ebcdic_to_ascii(xml_data)
       elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
-	# UTF-16BE
-	sniffed_xml_encoding = 'utf-16be'
-	xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
+        # UTF-16BE
+        sniffed_xml_encoding = 'utf-16be'
+        xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
       elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
-	# UTF-16BE with BOM
-	sniffed_xml_encoding = 'utf-16be'
-	xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
+        # UTF-16BE with BOM
+        sniffed_xml_encoding = 'utf-16be'
+        xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
       elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
-	# UTF-16LE
-	sniffed_xml_encoding = 'utf-16le'
-	xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
+        # UTF-16LE
+        sniffed_xml_encoding = 'utf-16le'
+        xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
       elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
-	# UTF-16LE with BOM
-	sniffed_xml_encoding = 'utf-16le'
-	xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
+        # UTF-16LE with BOM
+        sniffed_xml_encoding = 'utf-16le'
+        xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
       elsif xml_data[0..3] == "\x00\x00\x00\x3c"
-	# UTF-32BE
-	sniffed_xml_encoding = 'utf-32be'
-	xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
+        # UTF-32BE
+        sniffed_xml_encoding = 'utf-32be'
+        xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
       elsif xml_data[0..3] == "\x3c\x00\x00\x00"
-	# UTF-32LE
-	sniffed_xml_encoding = 'utf-32le'
-	xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
+        # UTF-32LE
+        sniffed_xml_encoding = 'utf-32le'
+        xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
       elsif xml_data[0..3] == "\x00\x00\xfe\xff"
-	# UTF-32BE with BOM
-	sniffed_xml_encoding = 'utf-32be'
-	xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
+        # UTF-32BE with BOM
+        sniffed_xml_encoding = 'utf-32be'
+        xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
       elsif xml_data[0..3] == "\xff\xfe\x00\x00"
-	# UTF-32LE with BOM
-	sniffed_xml_encoding = 'utf-32le'
-	xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
+        # UTF-32LE with BOM
+        sniffed_xml_encoding = 'utf-32le'
+        xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
       elsif xml_data[0..2] == "\xef\xbb\xbf"
-	# UTF-8 with BOM
-	sniffed_xml_encoding = 'utf-8'
-	xml_data = xml_data[3..-1]
+        # UTF-8 with BOM
+        sniffed_xml_encoding = 'utf-8'
+        xml_data = xml_data[3..-1]
       else
-	# ASCII-compatible
+        # ASCII-compatible
       end
       xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
     rescue
@@ -102,7 +97,7 @@ module FeedParserUtilities
       xml_encoding = xml_encoding_match[1].downcase
       xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
       if sniffed_xml_encoding and xencodings.include?xml_encoding
-	xml_encoding = sniffed_xml_encoding
+        xml_encoding = sniffed_xml_encoding
       end
     end
@@ -125,54 +120,48 @@ module FeedParserUtilities
     end
     return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
   end
   def toUTF8(data, encoding)
-=begin
-    Changes an XML data stream on the fly to specify a new encoding
-    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
-    encoding is a string recognized by encodings.aliases
-=end
     $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
     # NOTE we must use double quotes when dealing with \x encodings!
     if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
       if $debug
-	$stderr << "stripping BOM\n"
-	if encoding != 'utf-16be'
-	  $stderr << "string utf-16be instead\n"
-	end
+        $stderr << "stripping BOM\n"
+        if encoding != 'utf-16be'
+          $stderr << "string utf-16be instead\n"
+        end
       end
       encoding = 'utf-16be'
       data = data[2..-1]
     elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
       if $debug
-	$stderr << "stripping BOM\n"
-	$stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
+        $stderr << "stripping BOM\n"
+        $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
       end
       encoding = 'utf-16le'
       data = data[2..-1]
     elsif (data[0..2] == "\xef\xbb\xbf")
       if $debug
-	$stderr << "stripping BOM\n"
-	$stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
+        $stderr << "stripping BOM\n"
+        $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
       end
       encoding = 'utf-8'
       data = data[3..-1]
     elsif (data[0..3] == "\x00\x00\xfe\xff")
       if $debug
-	$stderr << "stripping BOM\n"
-	if encoding != 'utf-32be'
-	  $stderr << "trying utf-32be instead\n"
-	end
+        $stderr << "stripping BOM\n"
+        if encoding != 'utf-32be'
+          $stderr << "trying utf-32be instead\n"
+        end
       end
       encoding = 'utf-32be'
       data = data[4..-1]
     elsif (data[0..3] == "\xff\xfe\x00\x00")
       if $debug
-	$stderr << "stripping BOM\n"
-	if encoding != 'utf-32le'
-	  $stderr << "trying utf-32le instead\n"
-	end
+        $stderr << "stripping BOM\n"
+        if encoding != 'utf-32le'
+          $stderr << "trying utf-32le instead\n"
+        end
       end
       encoding = 'utf-32le'
       data = data[4..-1]
@@ -184,75 +173,79 @@ module FeedParserUtilities
     end
     $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
     declmatch = /^<\?xml[^>]*?>/
-      newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
-      if declmatch =~ newdata
-        newdata.sub!(declmatch, newdecl)
-      else
-        newdata = newdecl + "\n" + newdata
-      end
+    newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
+    if declmatch =~ newdata
+      newdata.sub!(declmatch, newdecl)
+    else
+      newdata = newdecl + "\n" + newdata
+    end
     return newdata
   end
 end
-# http://intertwingly.net/stories/2005/09/28/xchar.rb
-module XChar
-  # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
-  CP1252 = {
-    128 => 8364, # euro sign
-    130 => 8218, # single low-9 quotation mark
-    131 =>  402, # latin small letter f with hook
-    132 => 8222, # double low-9 quotation mark
-    133 => 8230, # horizontal ellipsis
-    134 => 8224, # dagger
-    135 => 8225, # double dagger
-    136 =>  710, # modifier letter circumflex accent
-    137 => 8240, # per mille sign
-    138 =>  352, # latin capital letter s with caron
-    139 => 8249, # single left-pointing angle quotation mark
-    140 =>  338, # latin capital ligature oe
-    142 =>  381, # latin capital letter z with caron
-    145 => 8216, # left single quotation mark
-    146 => 8217, # right single quotation mark
-    147 => 8220, # left double quotation mark
-    148 => 8221, # right double quotation mark
-    149 => 8226, # bullet
-    150 => 8211, # en dash
-    151 => 8212, # em dash
-    152 =>  732, # small tilde
-    153 => 8482, # trade mark sign
-    154 =>  353, # latin small letter s with caron
-    155 => 8250, # single right-pointing angle quotation mark
-    156 =>  339, # latin small ligature oe
-    158 =>  382, # latin small letter z with caron
-    159 =>  376} # latin capital letter y with diaeresis
+unless defined?(Builder::XChar)
+  # http://intertwingly.net/stories/2005/09/28/xchar.rb
+  module XChar
+    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
+    CP1252 = {
+      128 => 8364, # euro sign
+      130 => 8218, # single low-9 quotation mark
+      131 =>  402, # latin small letter f with hook
+      132 => 8222, # double low-9 quotation mark
+      133 => 8230, # horizontal ellipsis
+      134 => 8224, # dagger
+      135 => 8225, # double dagger
+      136 =>  710, # modifier letter circumflex accent
+      137 => 8240, # per mille sign
+      138 =>  352, # latin capital letter s with caron
+      139 => 8249, # single left-pointing angle quotation mark
+      140 =>  338, # latin capital ligature oe
+      142 =>  381, # latin capital letter z with caron
+      145 => 8216, # left single quotation mark
+      146 => 8217, # right single quotation mark
+      147 => 8220, # left double quotation mark
+      148 => 8221, # right double quotation mark
+      149 => 8226, # bullet
+      150 => 8211, # en dash
+      151 => 8212, # em dash
+      152 =>  732, # small tilde
+      153 => 8482, # trade mark sign
+      154 =>  353, # latin small letter s with caron
+      155 => 8250, # single right-pointing angle quotation mark
+      156 =>  339, # latin small ligature oe
+      158 =>  382, # latin small letter z with caron
+      159 =>  376 # latin capital letter y with diaeresis
+    }
     # http://www.w3.org/TR/REC-xml/#dt-chardata
     PREDEFINED = {
       38 => '&amp;', # ampersand
       60 => '&lt;',  # left angle bracket
-      62 => '&gt;'}  # right angle bracket
-      # http://www.w3.org/TR/REC-xml/#charsets
-      VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
-	(0xE000..0xFFFD), (0x10000..0x10FFFF)]
-end
+      62 => '&gt;'  # right angle bracket
+    }
+    # http://www.w3.org/TR/REC-xml/#charsets
+    VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
+    (0xE000..0xFFFD), (0x10000..0x10FFFF)]
+  end
-class Fixnum
-  # xml escaped version of chr
-  def xchr
-    n = XChar::CP1252[self] || self
-    n = 42 unless XChar::VALID.find {|range| range.include? n}
-    XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
+  class Fixnum
+    # xml escaped version of chr
+    def xchr
+      n = XChar::CP1252[self] || self
+      case n when *XChar::VALID
+        XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
+      else
+        '*'
+      end
+    end
   end
-end
-class String
-  alias :old_index :index
-  def to_xs
-    unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
-  rescue
-    unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
+  class String
+    alias :old_index :index
+    def to_xs
+      unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
+    rescue
+      unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
+    end
   end
 end