RubyGems - rfeedparser - Versions diffs - 0.9.8 → 0.9.9 - Mend

rfeedparser 0.9.8 → 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

data/lib/rfeedparser.rb +170 -3345
data/lib/rfeedparser/aliases.rb +432 -0
data/lib/rfeedparser/better_attributelist.rb +41 -0
data/lib/rfeedparser/better_sgmlparser.rb +264 -0
data/lib/rfeedparser/encoding_helpers.rb +257 -0
data/lib/rfeedparser/feedparserdict.rb +93 -0
data/lib/rfeedparser/forgiving_uri.rb +93 -0
data/lib/rfeedparser/markup_helpers.rb +73 -0
data/lib/rfeedparser/parser_mixin.rb +1235 -0
data/lib/rfeedparser/parsers.rb +177 -0
data/lib/rfeedparser/scrub.rb +207 -0
data/lib/rfeedparser/time_helpers.rb +408 -0
data/tests/rfeedparsertest.rb +3 -1
metadata +3271 -3250

data/lib/rfeedparser/better_sgmlparser.rb ADDED Viewed

@@ -0,0 +1,264 @@
+#!/usr/bin/ruby
+class BetterSGMLParserError < Exception; end;
+class BetterSGMLParser < HTML::SGMLParser
+  # Replaced Tagfind and Charref Regexps with the ones in feedparser.py
+  # This makes things work.
+  Interesting = /[&<]/u
+  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|![^<>]*)?', 64)
+  # 64 is the unicode flag
+  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
+  Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u
+  Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
+  Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
+  Endtagopen = /<\//u # Matching the Python SGMLParser
+  Endbracket = /[<>]/u
+  Declopen = /<!/u
+  Piopenbegin = /^<\?/u
+  Piclose = />/u
+  Commentopen = /<!--/u
+  Commentclose = /--\s*>/u
+  Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
+  Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
+			    '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
+			    64)
+  Endtagfind = /\s*\/\s*>/u
+  def initialize(verbose=false)
+    super(verbose)
+  end
+  def feed(*args)
+    super(*args)
+  end
+  # Main scanning loop. Walks @rawdata from offset 0, passing plain text to
+  # handle_data and dispatching anything that starts with '<' or '&' to the
+  # parse_* / handle_charref / handle_entityref methods. The loop stops early
+  # (break) when a construct cannot be completed from the data seen so far;
+  # whatever was not consumed is stored back into @rawdata.
+  #
+  # _end:: when truthy, any unconsumed tail is flushed through handle_data
+  #        before returning, so nothing is left in the buffer.
+  def goahead(_end)
+    rawdata = @rawdata # woo, utf-8 magic
+    i = 0
+    n = rawdata.length
+    while i < n
+      if @nomoretags
+	# handle_data_range does nothing more than set a "Range" that is never used. wtf?
+	handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
+	i = n
+	break
+      end
+      # Everything up to the next '&' or '<' is plain character data.
+      j = rawdata.index(Interesting, i)
+      j = n unless j
+      handle_data(rawdata[i...j]) if i < j
+      i = j
+      break if (i == n)
+      if rawdata[i..i] == '<' # equivalent to rawdata[i..i] == '<' # Yeah, ugly.
+	if rawdata.index(Starttagopen,i) == i
+	  # Inside literal content a start tag is treated as text.
+	  if @literal
+	    handle_data(rawdata[i..i])
+	    i = i+1
+	    next
+	  end
+	  k = parse_starttag(i)
+	  break unless k
+	  i = k
+	  next
+	end
+	if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
+	  k = parse_endtag(i)
+	  break unless k
+	  i = k
+	  @literal = false
+	  next
+	end
+	if @literal
+	  if n > (i+1)
+	    handle_data("<")
+	    i = i+1
+	  else
+	    #incomplete
+	    break
+	  end
+	  next
+	end
+	if rawdata.index(Commentopen,i) == i
+	  k = parse_comment(i)
+	  break unless k
+	  i = k
+	  next
+	end
+	if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
+	  k = parse_pi(i)
+	  break unless k
+	  # parse_pi returns a length relative to i, unlike the other parse_*
+	  # methods used here, hence += instead of =.
+	  i += k
+	  next
+	end
+	if rawdata.index(Declopen,i) == i
+	  # This is some sort of declaration; in "HTML as
+	  # deployed," this should only be the document type
+	  # declaration ("<!DOCTYPE html...>").
+	  k = parse_declaration(i)
+	  break unless k
+	  i = k
+	  next
+	end
+      elsif rawdata[i..i] == '&'
+	if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
+	  handle_data(rawdata[i..i])
+	  i += 1
+	  next
+	end
+      # The two reference checks below still belong to the '&' branch, despite
+      # their indentation.
+      # the Char must come first as its #=~ method is the only one that is UTF-8 safe
+      ni,match = index_match(rawdata, Charref, i)
+      if ni and ni == i # See? Ugly
+	handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
+	i += match[0].length  # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
+	# The pattern consumes one character after the reference; give it back
+	# unless it was the terminating ';'.
+	i -= 1 unless rawdata[i-1..i-1] == ";"
+	next
+      end
+      ni,match = index_match(rawdata, Entityref, i)
+      if ni and ni == i
+	handle_entityref(match[1])
+	i += match[0].length
+	i -= 1 unless rawdata[i-1..i-1] == ";"
+	next
+      end
+      else
+	error('neither < nor & ??')
+      end
+      # We get here only if incomplete matches but
+      # nothing else
+      ni,match = index_match(rawdata,Incomplete,i)
+      # NOTE(review): assuming index_match searches forward from i, "ni == 0"
+      # can only hold when i == 0, so for any later offset the single
+      # character is always emitted as data. Presumably this was meant to be
+      # "ni == i" -- confirm against callers before changing.
+      unless ni and ni == 0
+	handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
+	i += 1
+	next
+      end
+      j = ni + match[0].length
+      break if j == n # Really incomplete
+      handle_data(rawdata[i...j])
+      i = j
+    end # end while
+    if _end and i < n
+      handle_data(rawdata[i...n])
+      i = n
+    end
+    # Keep only what has not been consumed yet.
+    @rawdata = rawdata[i..-1]
+    # @offset += i # FIXME BUGME another unused variable in SGMLParser?
+  end
+  # Internal -- parse processing instr, return length or -1 if not terminated
+  def parse_pi(i)
+    rawdata = @rawdata
+    if rawdata[i...i+2] != '<?'
+      error("unexpected call to parse_pi()")
+    end
+    ni,match = index_match(rawdata,Piclose,i+2)
+    return nil unless match
+    j = ni
+    handle_pi(rawdata[i+2...j])
+    j = (j + match[0].length)
+    return j-i
+  end
+  def parse_comment(i)
+    rawdata = @rawdata
+    if rawdata[i...i+4] != "<!--"
+      error("unexpected call to parse_comment()")
+    end
+    ni,match = index_match(rawdata, Commentclose,i)
+    return nil unless match
+    handle_comment(rawdata[i+4..(ni-1)])
+    return ni+match[0].length # Length from i to just past the closing comment tag
+  end
+  # Internal -- parse the start tag that begins at offset i of @rawdata,
+  # collect its attributes as [name, value] pairs and report it through
+  # finish_starttag (or finish_shorttag for the SGML <tag/data/ shorthand).
+  # @_starttag_text is set to the raw text of the tag.
+  #
+  # Returns the offset at which scanning should resume, or nil when the tag
+  # is not complete in the buffer yet.
+  def parse_starttag(i)
+    @_starttag_text = nil
+    start_pos = i
+    rawdata = @rawdata
+    ni,match = index_match(rawdata,Shorttagopen,i)
+    if ni == i
+      # SGML shorthand: <tag/data/ == <tag>data</tag>
+      # XXX Can data contain &... (entity or char refs)?
+      # XXX Can data contain < or > (tag characters)?
+      # XXX Can there be whitespace before the first /?
+      k,match = index_match(rawdata,Shorttag,i)
+      return nil unless match
+      tag, data = match[1], match[2]
+      @_starttag_text = "<#{tag}/"
+      tag.downcase!
+      second_end = rawdata.index(Shorttagopen,k)
+      finish_shorttag(tag, data)
+      @_starttag_text = rawdata[start_pos...second_end+1]
+      return k
+    end
+    j = rawdata.index(Endbracket, i+1)
+    return nil unless j
+    attrsd = []
+    if rawdata[i...i+2] == '<>'
+      # SGML shorthand: <> == <last open tag seen>
+      k = j
+      tag = @lasttag
+    else
+      ni,match = index_match(rawdata,Tagfind,i+1)
+      unless match
+        error('unexpected call to parse_starttag')
+      end
+      k = ni+match[0].length+1
+      tag = match[0].downcase
+      @lasttag = tag
+    end
+    while k < j
+      break if rawdata.index(Endtagfind, k) == k
+      ni,match = index_match(rawdata,Attrfind,k)
+      break unless ni
+      matched_length = match[0].length
+      attrname, rest, attrvalue = match[1],match[2],match[3]
+      if rest.nil? or rest.empty?
+        attrvalue = '' # was: = attrname # Why the change?
+      else
+        # Strip one pair of matching quotes. One-character substrings are
+        # compared with string literals because ?' and String#[Integer] are
+        # integers on Ruby 1.8 but strings on later versions; mixing the two
+        # forms meant single-quoted values were never unquoted on 1.8.
+        first_char, last_char = attrvalue[0..0], attrvalue[-1..-1]
+        if (first_char == "'" and last_char == "'") or (first_char == '"' and last_char == '"')
+          attrvalue = attrvalue[1...-1]
+        end
+      end
+      attrsd << [attrname.downcase, attrvalue]
+      k += matched_length
+    end
+    if rawdata[j..j] == ">"
+      j += 1
+    end
+    @_starttag_text = rawdata[start_pos...j]
+    finish_starttag(tag, attrsd)
+    return j
+  end
+  def parse_endtag(i)
+    rawdata = @rawdata
+    j, match = index_match(rawdata, /[<>]/,i+1)
+    return nil unless j
+    tag = rawdata[i+2...j].strip.downcase
+    if rawdata[j..j] == ">"
+      j += 1
+    end
+    finish_endtag(tag)
+    return j
+  end
+  def output
+    # Return processed HTML as a single string
+    return @pieces.map{|p| p.to_s}.join
+  end
+  # Aborts parsing by raising BetterSGMLParserError carrying +message+.
+  def error(message)
+    raise BetterSGMLParserError, message
+  end
+  # Hook that parse_pi calls with the text of a processing instruction.
+  # Does nothing here.
+  def handle_pi(text)
+  end
+  # Hook that takes the text of a declaration. Does nothing here.
+  def handle_decl(text)
+  end
+end

data/lib/rfeedparser/encoding_helpers.rb ADDED Viewed

@@ -0,0 +1,257 @@
+#!/usr/bin/ruby
+module FeedParserUtilities
+  def unicode(data, from_encoding)
+    # Takes a single string and converts it from the encoding in
+    # from_encoding to unicode.
+    uconvert(data, from_encoding, 'unicode')
+  end
+  def uconvert(data, from_encoding, to_encoding = 'utf-8')
+    from_encoding = Encoding_Aliases[from_encoding] || from_encoding
+    to_encoding = Encoding_Aliases[to_encoding] || to_encoding
+    Iconv.iconv(to_encoding, from_encoding, data)[0]
+  end
+  def index_match(stri,regexp, offset)
+    i = stri.index(regexp, offset)
+    return nil, nil unless i
+    full = stri[i..-1].match(regexp)
+    return i, full
+  end
+  def _ebcdic_to_ascii(s)
+    return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
+  end
+  # Works out the character encoding of an XML document from three sources:
+  # the HTTP headers, the first bytes of the document, and the encoding named
+  # in its XML declaration.
+  #
+  # feed::     object whose #meta returns the HTTP headers; if reading the
+  #            content-type raises NoMethodError, the document is treated as
+  #            having no headers
+  # xml_data:: the document itself; it is converted locally for sniffing but
+  #            the converted copy is not returned
+  #
+  # Returns, in order: true_encoding, http_encoding, xml_encoding,
+  # sniffed_xml_encoding, acceptable_content_type.
+  def getCharacterEncoding(feed, xml_data)
+    # Get the character encoding of the XML document
+    $stderr << "In getCharacterEncoding\n" if $debug
+    sniffed_xml_encoding = nil
+    xml_encoding = nil
+    true_encoding = nil
+    begin
+      http_headers = feed.meta
+      # Media type is the part of the content-type header before the first ';'.
+      http_content_type = feed.meta['content-type'].split(';')[0]
+      encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
+      http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
+      http_encoding = nil if http_encoding.empty?
+      # FIXME Open-Uri returns iso8859-1 if there is no charset header,
+      # but that doesn't pass the tests. Open-Uri claims its following
+      # the right RFC. Are they wrong or do we need to change the tests?
+    rescue NoMethodError
+      http_headers = {}
+      http_content_type = nil
+      http_encoding = nil
+    end
+    # Must sniff for non-ASCII-compatible character encodings before
+    # searching for XML declaration.  This heuristic is defined in
+    # section F of the XML specification:
+    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+    begin
+      if xml_data[0..3] == "\x4c\x6f\xa7\x94"
+	# EBCDIC
+	xml_data = _ebcdic_to_ascii(xml_data)
+      elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
+	# UTF-16BE
+	sniffed_xml_encoding = 'utf-16be'
+	xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
+      elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
+	# UTF-16BE with BOM
+	sniffed_xml_encoding = 'utf-16be'
+	xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
+      elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
+	# UTF-16LE
+	sniffed_xml_encoding = 'utf-16le'
+	xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
+      elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
+	# UTF-16LE with BOM
+	sniffed_xml_encoding = 'utf-16le'
+	xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
+      elsif xml_data[0..3] == "\x00\x00\x00\x3c"
+	# UTF-32BE
+	sniffed_xml_encoding = 'utf-32be'
+	xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
+      elsif xml_data[0..3] == "\x3c\x00\x00\x00"
+	# UTF-32LE
+	sniffed_xml_encoding = 'utf-32le'
+	xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
+      elsif xml_data[0..3] == "\x00\x00\xfe\xff"
+	# UTF-32BE with BOM
+	sniffed_xml_encoding = 'utf-32be'
+	xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
+      elsif xml_data[0..3] == "\xff\xfe\x00\x00"
+	# UTF-32LE with BOM
+	sniffed_xml_encoding = 'utf-32le'
+	xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
+      elsif xml_data[0..2] == "\xef\xbb\xbf"
+	# UTF-8 with BOM
+	sniffed_xml_encoding = 'utf-8'
+	xml_data = xml_data[3..-1]
+      else
+	# ASCII-compatible
+      end
+      # NOTE(review): '^' matches at the start of any line in Ruby, so a
+      # matching declaration-like construct on a later line is accepted too --
+      # confirm whether only the very start of the document should count.
+      xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
+    rescue
+      # Any failure while sniffing or converting means "no declaration found".
+      xml_encoding_match = nil
+    end
+    if xml_encoding_match
+      xml_encoding = xml_encoding_match[1].downcase
+      # Multi-byte encoding names that do not state a byte order: when the
+      # document declares one of these, the sniffed encoding is used instead.
+      xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
+      if sniffed_xml_encoding and xencodings.include?xml_encoding
+	xml_encoding = sniffed_xml_encoding
+      end
+    end
+    acceptable_content_type = false
+    application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
+    text_content_types = ['text/xml', 'text/xml-external-parsed-entity']
+    # Precedence depends on the media type: application/*xml prefers the HTTP
+    # charset, then the XML declaration, then utf-8; text/* uses the HTTP
+    # charset or us-ascii and ignores the XML declaration.
+    if application_content_types.include?(http_content_type) or (/^application\// =~ http_content_type and /\+xml$/ =~ http_content_type)
+      acceptable_content_type = true
+      true_encoding = http_encoding || xml_encoding || 'utf-8'
+    elsif text_content_types.include?(http_content_type) or (/^text\// =~ http_content_type and /\+xml$/ =~ http_content_type)
+      acceptable_content_type = true
+      true_encoding = http_encoding || 'us-ascii'
+    elsif /^text\// =~ http_content_type
+      true_encoding = http_encoding || 'us-ascii'
+    elsif http_headers and not http_headers.empty? and not http_headers.has_key?'content-type'
+      # Headers are present but carry no content-type.
+      true_encoding = xml_encoding || 'iso-8859-1'
+    else
+      true_encoding = xml_encoding || 'utf-8'
+    end
+    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
+  end
+  def toUTF8(data, encoding)
+=begin
+    Changes an XML data stream on the fly to specify a new encoding
+    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
+    encoding is a string recognized by encodings.aliases
+=end
+    $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
+    # NOTE we must use double quotes when dealing with \x encodings!
+    if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
+      if $debug
+	$stderr << "stripping BOM\n"
+	if encoding != 'utf-16be'
+	  $stderr << "string utf-16be instead\n"
+	end
+      end
+      encoding = 'utf-16be'
+      data = data[2..-1]
+    elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
+      if $debug
+	$stderr << "stripping BOM\n"
+	$stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
+      end
+      encoding = 'utf-16le'
+      data = data[2..-1]
+    elsif (data[0..2] == "\xef\xbb\xbf")
+      if $debug
+	$stderr << "stripping BOM\n"
+	$stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
+      end
+      encoding = 'utf-8'
+      data = data[3..-1]
+    elsif (data[0..3] == "\x00\x00\xfe\xff")
+      if $debug
+	$stderr << "stripping BOM\n"
+	if encoding != 'utf-32be'
+	  $stderr << "trying utf-32be instead\n"
+	end
+      end
+      encoding = 'utf-32be'
+      data = data[4..-1]
+    elsif (data[0..3] == "\xff\xfe\x00\x00")
+      if $debug
+	$stderr << "stripping BOM\n"
+	if encoding != 'utf-32le'
+	  $stderr << "trying utf-32le instead\n"
+	end
+      end
+      encoding = 'utf-32le'
+      data = data[4..-1]
+    end
+    begin
+      newdata = uconvert(data, encoding, 'utf-8')
+    rescue => details
+    end
+    $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
+    declmatch = /^<\?xml[^>]*?>/
+      newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
+      if declmatch =~ newdata
+	newdata.sub!(declmatch, newdecl)
+      else
+	newdata = newdecl + "\n" + newdata
+      end
+    return newdata
+  end
+end
+# http://intertwingly.net/stories/2005/09/28/xchar.rb
+# Lookup tables used by the xchr / to_xs helpers defined after this module.
+module XChar
+  # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
+  # Maps Windows-1252 values in the 128..159 range to the code points of the
+  # characters named in the trailing comments.
+  CP1252 = {
+    128 => 8364, # euro sign
+    130 => 8218, # single low-9 quotation mark
+    131 =>  402, # latin small letter f with hook
+    132 => 8222, # double low-9 quotation mark
+    133 => 8230, # horizontal ellipsis
+    134 => 8224, # dagger
+    135 => 8225, # double dagger
+    136 =>  710, # modifier letter circumflex accent
+    137 => 8240, # per mille sign
+    138 =>  352, # latin capital letter s with caron
+    139 => 8249, # single left-pointing angle quotation mark
+    140 =>  338, # latin capital ligature oe
+    142 =>  381, # latin capital letter z with caron
+    145 => 8216, # left single quotation mark
+    146 => 8217, # right single quotation mark
+    147 => 8220, # left double quotation mark
+    148 => 8221, # right double quotation mark
+    149 => 8226, # bullet
+    150 => 8211, # en dash
+    151 => 8212, # em dash
+    152 =>  732, # small tilde
+    153 => 8482, # trade mark sign
+    154 =>  353, # latin small letter s with caron
+    155 => 8250, # single right-pointing angle quotation mark
+    156 =>  339, # latin small ligature oe
+    158 =>  382, # latin small letter z with caron
+    159 =>  376} # latin capital letter y with diaeresis
+    # http://www.w3.org/TR/REC-xml/#dt-chardata
+    # Code points that are written as named entities.
+    PREDEFINED = {
+      38 => '&amp;', # ampersand
+      60 => '&lt;',  # left angle bracket
+      62 => '&gt;'}  # right angle bracket
+      # http://www.w3.org/TR/REC-xml/#charsets
+      # Code points accepted as valid: one explicit list (tab, LF, CR)
+      # followed by three ranges.
+      VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
+	(0xE000..0xFFFD), (0x10000..0x10FFFF)]
+end
+class Fixnum
+  # xml escaped version of chr
+  def xchr
+    n = XChar::CP1252[self] || self
+    n = 42 unless XChar::VALID.find {|range| range.include? n}
+    XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
+  end
+end
+class String
+  alias :old_index :index
+  def to_xs
+    unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
+  rescue
+    unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
+  end
+end