RubyGems - rfeedparser - Versions diffs - 0.9.9 → 0.9.85 - Mend

rfeedparser 0.9.9 → 0.9.85

Files changed (14) hide show

data/lib/rfeedparser.rb +3354 -170
data/tests/rfeedparsertest.rb +1 -3
metadata +3280 -3301
data/lib/rfeedparser/aliases.rb +0 -432
data/lib/rfeedparser/better_attributelist.rb +0 -41
data/lib/rfeedparser/better_sgmlparser.rb +0 -264
data/lib/rfeedparser/encoding_helpers.rb +0 -257
data/lib/rfeedparser/feedparserdict.rb +0 -93
data/lib/rfeedparser/forgiving_uri.rb +0 -93
data/lib/rfeedparser/markup_helpers.rb +0 -73
data/lib/rfeedparser/parser_mixin.rb +0 -1235
data/lib/rfeedparser/parsers.rb +0 -177
data/lib/rfeedparser/scrub.rb +0 -207
data/lib/rfeedparser/time_helpers.rb +0 -408

data/lib/rfeedparser/better_sgmlparser.rb DELETED

@@ -1,264 +0,0 @@
-#!/usr/bin/ruby
-class BetterSGMLParserError < Exception; end;
-class BetterSGMLParser < HTML::SGMLParser
-  # Replaced Tagfind and Charref Regexps with the ones in feedparser.py
-  # This makes things work.
-  Interesting = /[&<]/u
-  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|![^<>]*)?', 64)
-  # 64 is the unicode flag
-  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
-  Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u
-  Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
-  Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
-  Endtagopen = /<\//u # Matching the Python SGMLParser
-  Endbracket = /[<>]/u
-  Declopen = /<!/u
-  Piopenbegin = /^<\?/u
-  Piclose = />/u
-  Commentopen = /<!--/u
-  Commentclose = /--\s*>/u
-  Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
-  Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
-			    '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?',
-			    64)
-  Endtagfind = /\s*\/\s*>/u
-  # Builds the parser, forwarding +verbose+ unchanged to the HTML::SGMLParser
-  # constructor. Adds no state of its own.
-  def initialize(verbose=false)
-    super(verbose)
-  end
-  # Forwards every argument unchanged to the superclass implementation of feed.
-  def feed(*args)
-    super(*args)
-  end
-  # Main tokenizer loop. Walks the text buffered in @rawdata and dispatches
-  # each piece to handle_data, handle_charref, handle_entityref or one of the
-  # parse_* methods, then stores the unconsumed tail back into @rawdata.
-  # _end:: when truthy, text still unconsumed after the loop is flushed
-  #        through handle_data instead of being kept for a later call.
-  def goahead(_end)
-    rawdata = @rawdata # woo, utf-8 magic
-    i = 0
-    n = rawdata.length
-    while i < n
-      if @nomoretags
-	# handle_data_range does nothing more than set a "Range" that is never used. wtf?
-	handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
-	i = n
-	break
-      end
-      # Emit everything up to the next "&" or "<" as plain data.
-      j = rawdata.index(Interesting, i)
-      j = n unless j
-      handle_data(rawdata[i...j]) if i < j
-      i = j
-      break if (i == n)
-      if rawdata[i..i] == '<' # equivalent to rawdata[i..i] == '<' # Yeah, ugly.
-	# Starttagopen is not defined in this class; presumably it is inherited
-	# from HTML::SGMLParser.
-	if rawdata.index(Starttagopen,i) == i
-	  if @literal
-	    handle_data(rawdata[i..i])
-	    i = i+1
-	    next
-	  end
-	  k = parse_starttag(i)
-	  break unless k
-	  i = k
-	  next
-	end
-	if rawdata.index(Endtagopen,i) == i #Don't use Endtagopen
-	  k = parse_endtag(i)
-	  break unless k
-	  i = k
-	  @literal = false
-	  next
-	end
-	if @literal
-	  if n > (i+1)
-	    handle_data("<")
-	    i = i+1
-	  else
-	    #incomplete
-	    break
-	  end
-	  next
-	end
-	if rawdata.index(Commentopen,i) == i
-	  k = parse_comment(i)
-	  break unless k
-	  i = k
-	  next
-	end
-	if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
-	  k = parse_pi(i)
-	  break unless k
-	  # parse_pi returns a length, not an absolute position, hence "+=".
-	  i += k
-	  next
-	end
-	if rawdata.index(Declopen,i) == i
-	  # This is some sort of declaration; in "HTML as
-	  # deployed," this should only be the document type
-	  # declaration ("<!DOCTYPE html...>").
-	  k = parse_declaration(i)
-	  break unless k
-	  i = k
-	  next
-	end
-      elsif rawdata[i..i] == '&'
-	if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
-	  handle_data(rawdata[i..i])
-	  i += 1
-	  next
-	end
-      # Despite the shallower indentation, the two reference checks below are
-      # still part of the '&' branch (the matching "else" comes after them).
-      # the Char must come first as its #=~ method is the only one that is UTF-8 safe
-      ni,match = index_match(rawdata, Charref, i)
-      if ni and ni == i # See? Ugly
-	handle_charref(match[1]) # $1 is just the first group we captured (with parentheses)
-	i += match[0].length  # $& is the "all" of the match.. it includes the full match we looked for not just the stuff we put parentheses around to capture.
-	# The regexp also consumed one terminator character; give it back
-	# unless it was the ";" that closes the reference.
-	i -= 1 unless rawdata[i-1..i-1] == ";"
-	next
-      end
-      ni,match = index_match(rawdata, Entityref, i)
-      if ni and ni == i
-	handle_entityref(match[1])
-	i += match[0].length
-	i -= 1 unless rawdata[i-1..i-1] == ";"
-	next
-      end
-      else
-	error('neither < nor & ??')
-      end
-      # We get here only if incomplete matches but
-      # nothing else
-      ni,match = index_match(rawdata,Incomplete,i)
-      # NOTE(review): this compares ni with 0, while every other match test in
-      # this method compares with i. Since the search starts at i, ni == 0 can
-      # only hold when i == 0, so past the first character the text is always
-      # emitted one character at a time. Probably meant "ni == i" -- confirm.
-      unless ni and ni == 0
-	handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
-	i += 1
-	next
-      end
-      j = ni + match[0].length
-      break if j == n # Really incomplete
-      handle_data(rawdata[i...j])
-      i = j
-    end # end while
-    if _end and i < n
-      handle_data(rawdata[i...n])
-      i = n
-    end
-    # Keep whatever was not consumed for the next call.
-    @rawdata = rawdata[i..-1]
-    # @offset += i # FIXME BUGME another unused variable in SGMLParser?
-  end
-  # Internal -- parse a processing instruction starting at index i.
-  # Passes the text between "<?" and the first ">" to handle_pi.
-  # Returns the number of characters consumed (a length, not a position),
-  # or nil if no closing ">" is found.
-  # Raises BetterSGMLParserError (via error) if rawdata does not have "<?" at i.
-  def parse_pi(i)
-    rawdata = @rawdata
-    if rawdata[i...i+2] != '<?'
-      error("unexpected call to parse_pi()")
-    end
-    ni,match = index_match(rawdata,Piclose,i+2)
-    return nil unless match
-    j = ni
-    handle_pi(rawdata[i+2...j])
-    j = (j + match[0].length)
-    return j-i
-  end
-  # Parses a comment starting at index i and passes the text between "<!--"
-  # and the closing marker to handle_comment.
-  # Returns the index just past the closing marker, or nil if none is found.
-  # Raises BetterSGMLParserError (via error) if rawdata does not have "<!--" at i.
-  def parse_comment(i)
-    rawdata = @rawdata
-    if rawdata[i...i+4] != "<!--"
-      error("unexpected call to parse_comment()")
-    end
-    # The search for the closing marker starts at i itself, not after "<!--".
-    ni,match = index_match(rawdata, Commentclose,i)
-    return nil unless match
-    handle_comment(rawdata[i+4..(ni-1)])
-    return ni+match[0].length # Length from i to just past the closing comment tag
-  end
-  # Parses a start tag beginning at index i, collects its attributes as
-  # [name, value] pairs and hands them to finish_starttag.
-  # Also records the raw tag text in @_starttag_text and the tag name in
-  # @lasttag.
-  # Returns the index at which scanning should resume, or nil when the tag is
-  # not yet complete (no closing bracket found).
-  def parse_starttag(i)
-    @_starttag_text = nil
-    start_pos = i
-    rawdata = @rawdata
-    ni,match = index_match(rawdata,Shorttagopen,i)
-    # NOTE(review): goahead only calls this method when rawdata[i] is "<",
-    # and Shorttagopen begins with "'", so on that path ni == i cannot hold
-    # and this shorthand branch looks unreachable -- confirm.
-    if ni == i
-      # SGML shorthand: <tag/data/ == <tag>data</tag>
-      # XXX Can data contain &... (entity or char refs)?
-      # XXX Can data contain < or > (tag characters)?
-      # XXX Can there be whitespace before the first /?
-      k,match = index_match(rawdata,Shorttag,i)
-      return nil unless match
-      tag, data = match[1], match[2]
-      @_starttag_text = "<#{tag}/"
-      tag.downcase!
-      # NOTE(review): String#index returns nil when nothing matches, in which
-      # case "second_end+1" below would raise NoMethodError.
-      second_end = rawdata.index(Shorttagopen,k)
-      finish_shorttag(tag, data)
-      @_starttag_text = rawdata[start_pos...second_end+1]
-      return k
-    end
-    j = rawdata.index(Endbracket, i+1)
-    return nil unless j
-    attrsd = []
-    if rawdata[i...i+2] == '<>'
-      # SGML shorthand: <> == <last open tag seen>
-      k = j
-      tag = @lasttag
-    else
-      ni,match = index_match(rawdata,Tagfind,i+1)
-      unless match
-	error('unexpected call to parse_starttag')
-      end
-      k = ni+match[0].length+1
-      tag = match[0].downcase
-      @lasttag = tag
-    end
-    # Collect attributes between the tag name and the closing bracket.
-    while k < j
-      break if rawdata.index(Endtagfind, k) == k
-      ni,match = index_match(rawdata,Attrfind,k)
-      break unless ni
-      matched_length = match[0].length
-      attrname, rest, attrvalue = match[1],match[2],match[3]
-      if rest.nil? or rest.empty?
-	attrvalue = '' # was: = attrname # Why the change?
-      # NOTE(review): the two quote tests use different forms (attrvalue[0..0]
-      # vs attrvalue[0]) against ?-character literals, whose type depends on
-      # the Ruby version -- verify both quote styles are actually stripped.
-      elsif [?',?'] == [attrvalue[0..0], attrvalue[-1..-1]] or [?",?"] == [attrvalue[0],attrvalue[-1]]
-	attrvalue = attrvalue[1...-1]
-      end
-      attrsd << [attrname.downcase, attrvalue]
-      k += matched_length
-    end
-    # Endbracket matches "<" as well as ">"; only a ">" is consumed here.
-    if rawdata[j..j] == ">"
-      j += 1
-    end
-    @_starttag_text = rawdata[start_pos...j]
-    finish_starttag(tag, attrsd)
-    return j
-  end
-  # Parses an end tag beginning at index i ("</" is assumed, not checked).
-  # The tag name is the stripped, lowercased text between "</" and the next
-  # "<" or ">", and is passed to finish_endtag.
-  # Returns the index at which scanning should resume, or nil if no bracket
-  # follows.
-  def parse_endtag(i)
-    rawdata = @rawdata
-    # Uses an inline /[<>]/ literal rather than the Endbracket constant.
-    j, match = index_match(rawdata, /[<>]/,i+1)
-    return nil unless j
-    tag = rawdata[i+2...j].strip.downcase
-    # Only a closing ">" is consumed; a "<" is left for the next pass.
-    if rawdata[j..j] == ">"
-      j += 1
-    end
-    finish_endtag(tag)
-    return j
-  end
-  # Joins the string form (to_s) of every element of @pieces.
-  # @pieces is not assigned anywhere in this class; presumably a subclass or
-  # the superclass populates it -- TODO confirm.
-  def output
-    # Return processed HTML as a single string
-    return @pieces.map{|p| p.to_s}.join
-  end
-  # Raises BetterSGMLParserError carrying +message+. Never returns.
-  def error(message)
-    raise BetterSGMLParserError.new(message)
-  end
-  # No-op hook: parse_pi passes the body of each processing instruction
-  # here, and this class discards it.
-  def handle_pi(text)
-  end
-  # No-op hook for declarations. Nothing in this class calls it directly;
-  # presumably the inherited parse_declaration does -- TODO confirm.
-  def handle_decl(text)
-  end
-end

data/lib/rfeedparser/encoding_helpers.rb DELETED

@@ -1,257 +0,0 @@
-#!/usr/bin/ruby
-module FeedParserUtilities
-  # Thin wrapper around uconvert that passes 'unicode' as the target
-  # encoding name.
-  def unicode(data, from_encoding)
-    # Takes a single string and converts it from the encoding in
-    # from_encoding to unicode.
-    uconvert(data, from_encoding, 'unicode')
-  end
-  # Converts +data+ from one encoding to another using Iconv.
-  # Both encoding names are first looked up in Encoding_Aliases (defined
-  # outside this file section); a name with no alias entry is used as given.
-  # to_encoding defaults to 'utf-8'.
-  # Returns the first element of the array produced by Iconv.iconv.
-  # Errors raised by Iconv are not rescued here.
-  def uconvert(data, from_encoding, to_encoding = 'utf-8')
-    from_encoding = Encoding_Aliases[from_encoding] || from_encoding
-    to_encoding = Encoding_Aliases[to_encoding] || to_encoding
-    Iconv.iconv(to_encoding, from_encoding, data)[0]
-  end
-  # Finds the first match of +regexp+ in +stri+ at or after +offset+.
-  # Returns two values: the index of the match and a MatchData for it, or
-  # nil, nil when nothing matches.
-  # The MatchData comes from re-running the regexp on the substring that
-  # starts at the found index, so its offsets are relative to that index.
-  def index_match(stri,regexp, offset)
-    i = stri.index(regexp, offset)
-    return nil, nil unless i
-    full = stri[i..-1].match(regexp)
-    return i, full
-  end
-  # Converts +s+ from "ebcdic-cp-be" with Iconv and returns the result.
-  # Despite the method name, the target encoding passed to Iconv is
-  # "iso88591", not ASCII.
-  def _ebcdic_to_ascii(s)
-    return Iconv.iconv("iso88591", "ebcdic-cp-be", s)[0]
-  end
-  # Decides which character encoding to assume for a fetched feed, combining
-  # the HTTP Content-Type header, a byte-level sniff of the document start
-  # and the encoding named in the XML declaration.
-  # feed::     object whose +meta+ gives the HTTP headers (read through
-  #            meta['content-type']); if those calls raise NoMethodError the
-  #            headers are treated as absent.
-  # xml_data:: the raw document bytes.
-  # Returns five values: true_encoding, http_encoding, xml_encoding,
-  # sniffed_xml_encoding and acceptable_content_type.
-  # The local re-assignments of xml_data (BOM stripping, conversion to UTF-8)
-  # are only used to find the XML declaration; they are not returned.
-  def getCharacterEncoding(feed, xml_data)
-    # Get the character encoding of the XML document
-    $stderr << "In getCharacterEncoding\n" if $debug
-    sniffed_xml_encoding = nil
-    xml_encoding = nil
-    true_encoding = nil
-    begin
-      http_headers = feed.meta
-      # Media type is the part of Content-Type before the first ";".
-      http_content_type = feed.meta['content-type'].split(';')[0]
-      encoding_scan = feed.meta['content-type'].to_s.scan(/charset\s*=\s*(.*?)(?:"|')*$/)
-      http_encoding = encoding_scan.flatten[0].to_s.gsub(/("|')/,'')
-      http_encoding = nil if http_encoding.empty?
-      # FIXME Open-Uri returns iso8859-1 if there is no charset header,
-      # but that doesn't pass the tests. Open-Uri claims its following
-      # the right RFC. Are they wrong or do we need to change the tests?
-    rescue NoMethodError
-      http_headers = {}
-      http_content_type = nil
-      http_encoding = nil
-    end
-    # Must sniff for non-ASCII-compatible character encodings before
-    # searching for XML declaration.  This heuristic is defined in
-    # section F of the XML specification:
-    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
-    begin
-      if xml_data[0..3] == "\x4c\x6f\xa7\x94"
-	# EBCDIC
-	# (sniffed_xml_encoding stays nil on this path)
-	xml_data = _ebcdic_to_ascii(xml_data)
-      elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
-	# UTF-16BE
-	sniffed_xml_encoding = 'utf-16be'
-	xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
-      elsif xml_data.size >= 4 and xml_data[0..1] == "\xfe\xff" and xml_data[2..3] != "\x00\x00"
-	# UTF-16BE with BOM
-	sniffed_xml_encoding = 'utf-16be'
-	xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
-      elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
-	# UTF-16LE
-	sniffed_xml_encoding = 'utf-16le'
-	xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
-      elsif xml_data.size >=4 and xml_data[0..1] == "\xff\xfe" and xml_data[2..3] != "\x00\x00"
-	# UTF-16LE with BOM
-	# (the "\x00\x00" exclusion leaves "\xff\xfe\x00\x00" to the UTF-32LE
-	# BOM branch further down)
-	sniffed_xml_encoding = 'utf-16le'
-	xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
-      elsif xml_data[0..3] == "\x00\x00\x00\x3c"
-	# UTF-32BE
-	sniffed_xml_encoding = 'utf-32be'
-	xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
-      elsif xml_data[0..3] == "\x3c\x00\x00\x00"
-	# UTF-32LE
-	sniffed_xml_encoding = 'utf-32le'
-	xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
-      elsif xml_data[0..3] == "\x00\x00\xfe\xff"
-	# UTF-32BE with BOM
-	sniffed_xml_encoding = 'utf-32be'
-	xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
-      elsif xml_data[0..3] == "\xff\xfe\x00\x00"
-	# UTF-32LE with BOM
-	sniffed_xml_encoding = 'utf-32le'
-	xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
-      elsif xml_data[0..2] == "\xef\xbb\xbf"
-	# UTF-8 with BOM
-	sniffed_xml_encoding = 'utf-8'
-	xml_data = xml_data[3..-1]
-      else
-	# ASCII-compatible
-      end
-      xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
-    rescue
-      # Any error while sniffing or converting is treated the same as
-      # "no XML declaration found".
-      xml_encoding_match = nil
-    end
-    if xml_encoding_match
-      xml_encoding = xml_encoding_match[1].downcase
-      xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
-      # For these declared names the sniffed encoding, when there is one,
-      # replaces the declared one.
-      if sniffed_xml_encoding and xencodings.include?xml_encoding
-	xml_encoding = sniffed_xml_encoding
-      end
-    end
-    acceptable_content_type = false
-    application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
-    text_content_types = ['text/xml', 'text/xml-external-parsed-entity']
-    # Precedence by media type:
-    #   application XML types: HTTP charset, then XML declaration, then utf-8
-    #   text XML types:        HTTP charset, then us-ascii
-    #   any other text/*:      same, but not flagged as acceptable
-    #   headers present without content-type: XML declaration, then iso-8859-1
-    #   anything else:         XML declaration, then utf-8
-    if application_content_types.include?(http_content_type) or (/^application\// =~ http_content_type and /\+xml$/ =~ http_content_type)
-      acceptable_content_type = true
-      true_encoding = http_encoding || xml_encoding || 'utf-8'
-    elsif text_content_types.include?(http_content_type) or (/^text\// =~ http_content_type and /\+xml$/ =~ http_content_type)
-      acceptable_content_type = true
-      true_encoding = http_encoding || 'us-ascii'
-    elsif /^text\// =~ http_content_type
-      true_encoding = http_encoding || 'us-ascii'
-    elsif http_headers and not http_headers.empty? and not http_headers.has_key?'content-type'
-      true_encoding = xml_encoding || 'iso-8859-1'
-    else
-      true_encoding = xml_encoding || 'utf-8'
-    end
-    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
-  end
-  # Strips a leading byte-order mark from +data+ (overriding +encoding+ with
-  # the encoding that BOM implies), converts the rest to UTF-8 with uconvert,
-  # and rewrites the XML declaration to name utf-8, prepending one when none
-  # is found. Returns the converted string.
-  def toUTF8(data, encoding)
-=begin
-    Changes an XML data stream on the fly to specify a new encoding
-    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
-    encoding is a string recognized by encodings.aliases
-=end
-    $stderr << "entering self.toUTF8, trying encoding %s\n" % encoding if $debug
-    # NOTE we must use double quotes when dealing with \x encodings!
-    if (data.size >= 4 and data[0..1] == "\xfe\xff" and data[2..3] != "\x00\x00")
-      if $debug
-	$stderr << "stripping BOM\n"
-	if encoding != 'utf-16be'
-	  # NOTE(review): "string" here is probably a typo for "trying",
-	  # which is what the sibling branches print.
-	  $stderr << "string utf-16be instead\n"
-	end
-      end
-      encoding = 'utf-16be'
-      data = data[2..-1]
-    elsif (data.size >= 4 and data[0..1] == "\xff\xfe" and data[2..3] != "\x00\x00")
-      if $debug
-	$stderr << "stripping BOM\n"
-	$stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
-      end
-      encoding = 'utf-16le'
-      data = data[2..-1]
-    elsif (data[0..2] == "\xef\xbb\xbf")
-      if $debug
-	$stderr << "stripping BOM\n"
-	$stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
-      end
-      encoding = 'utf-8'
-      data = data[3..-1]
-    elsif (data[0..3] == "\x00\x00\xfe\xff")
-      if $debug
-	$stderr << "stripping BOM\n"
-	if encoding != 'utf-32be'
-	  $stderr << "trying utf-32be instead\n"
-	end
-      end
-      encoding = 'utf-32be'
-      data = data[4..-1]
-    elsif (data[0..3] == "\xff\xfe\x00\x00")
-      if $debug
-	$stderr << "stripping BOM\n"
-	if encoding != 'utf-32le'
-	  $stderr << "trying utf-32le instead\n"
-	end
-      end
-      encoding = 'utf-32le'
-      data = data[4..-1]
-    end
-    # NOTE(review): a failed conversion is swallowed here, leaving newdata
-    # nil. The "successfully converted" message below is still printed, and
-    # the string concatenation in the else branch further down would then
-    # raise on the nil -- confirm whether callers rely on that.
-    begin
-      newdata = uconvert(data, encoding, 'utf-8')
-    rescue => details
-    end
-    $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
-    declmatch = /^<\?xml[^>]*?>/
-      newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
-      if declmatch =~ newdata
-	newdata.sub!(declmatch, newdecl)
-      else
-	newdata = newdecl + "\n" + newdata
-      end
-    return newdata
-  end
-end
-# http://intertwingly.net/stories/2005/09/28/xchar.rb
-# Lookup tables used by Fixnum#xchr to turn a character code into
-# XML-safe text.
-module XChar
-  # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
-  # Maps byte values 128-159 to the code points listed beside them.
-  # 129, 141, 143, 144 and 157 have no entry.
-  CP1252 = {
-    128 => 8364, # euro sign
-    130 => 8218, # single low-9 quotation mark
-    131 =>  402, # latin small letter f with hook
-    132 => 8222, # double low-9 quotation mark
-    133 => 8230, # horizontal ellipsis
-    134 => 8224, # dagger
-    135 => 8225, # double dagger
-    136 =>  710, # modifier letter circumflex accent
-    137 => 8240, # per mille sign
-    138 =>  352, # latin capital letter s with caron
-    139 => 8249, # single left-pointing angle quotation mark
-    140 =>  338, # latin capital ligature oe
-    142 =>  381, # latin capital letter z with caron
-    145 => 8216, # left single quotation mark
-    146 => 8217, # right single quotation mark
-    147 => 8220, # left double quotation mark
-    148 => 8221, # right double quotation mark
-    149 => 8226, # bullet
-    150 => 8211, # en dash
-    151 => 8212, # em dash
-    152 =>  732, # small tilde
-    153 => 8482, # trade mark sign
-    154 =>  353, # latin small letter s with caron
-    155 => 8250, # single right-pointing angle quotation mark
-    156 =>  339, # latin small ligature oe
-    158 =>  382, # latin small letter z with caron
-    159 =>  376} # latin capital letter y with diaeresis
-    # http://www.w3.org/TR/REC-xml/#dt-chardata
-    # Code points that are written as a named entity.
-    PREDEFINED = {
-      38 => '&amp;', # ampersand
-      60 => '&lt;',  # left angle bracket
-      62 => '&gt;'}  # right angle bracket
-      # http://www.w3.org/TR/REC-xml/#charsets
-      # Code points accepted as-is: one explicit list followed by three
-      # ranges. Each element responds to include?.
-      VALID = [[0x9, 0xA, 0xD], (0x20..0xD7FF),
-	(0xE000..0xFFFD), (0x10000..0x10FFFF)]
-end
-# Core-class extension adding an XML-safe counterpart to chr.
-# NOTE(review): Fixnum was deprecated in Ruby 2.4 and removed in 3.2; on
-# those versions this would need to reopen Integer instead -- confirm the
-# Ruby versions this code is expected to run on.
-class Fixnum
-  # xml escaped version of chr
-  # Steps: remap through XChar::CP1252 when there is an entry; replace any
-  # code point outside XChar::VALID with 42 ("*"); then return the
-  # predefined entity if there is one, the plain character for values below
-  # 128, or a decimal character reference ("&#NNN;") otherwise.
-  def xchr
-    n = XChar::CP1252[self] || self
-    n = 42 unless XChar::VALID.find {|range| range.include? n}
-    XChar::PREDEFINED[n] or (n<128 ? n.chr : "&##{n};")
-  end
-end
-# Core-class extension: keeps the original String#index reachable as
-# old_index and adds to_xs.
-class String
-  # old_index is not used in this file section; presumably index is
-  # redefined elsewhere -- TODO confirm.
-  alias :old_index :index
-  # Returns the string with every character passed through xchr and the
-  # results joined. The string is first unpacked as UTF-8 code points; if
-  # that raises, it is unpacked byte by byte instead.
-  def to_xs
-    unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
-  rescue
-    unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
-  end
-end