RubyGems - UnderpantsGnome-rfeedparser - Versions diffs - 0.9.960 - Mend

UnderpantsGnome-rfeedparser 0.9.960

Files changed (18) hide show

data/LICENSE +68 -0
data/README +50 -0
data/RUBY-TESTING +66 -0
data/lib/rfeedparser.rb +551 -0
data/lib/rfeedparser/aliases.rb +432 -0
data/lib/rfeedparser/better_attributelist.rb +41 -0
data/lib/rfeedparser/better_sgmlparser.rb +264 -0
data/lib/rfeedparser/encoding_helpers.rb +260 -0
data/lib/rfeedparser/feedparserdict.rb +106 -0
data/lib/rfeedparser/loose_feed_parser.rb +75 -0
data/lib/rfeedparser/markup_helpers.rb +71 -0
data/lib/rfeedparser/monkey_patches.rb +10 -0
data/lib/rfeedparser/nokogiri_parser.rb +80 -0
data/lib/rfeedparser/parser_mixin.rb +1275 -0
data/lib/rfeedparser/scrub.rb +212 -0
data/lib/rfeedparser/time_helpers.rb +408 -0
data/lib/rfeedparser/utilities.rb +23 -0
metadata +151 -0

data/lib/rfeedparser/better_sgmlparser.rb ADDED Viewed

@@ -0,0 +1,264 @@
+#!/usr/bin/env ruby
+class BetterSGMLParserError < StandardError; end;
+class BetterSGMLParser < HTML::SGMLParser
+  # Replaced Tagfind and Charref Regexps with the ones in feedparser.py
+  # This makes things work.
+  Interesting = /[&<]/u
+  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|![^<>]*)?', 64)
+  # 64 is the unicode flag
+  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
+  Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u
+  Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
+  Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
+  Endtagopen = /<\//u # Changed the RegExps to match the Python SGMLParser
+  Endbracket = /[<>]/u
+  Declopen = /<!/u
+  Piopenbegin = /^<\?/u
+  Piclose = />/u
+  Commentopen = /<!--/u
+  Commentclose = /--\s*>/u
+  Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
+  Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
+  '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?', # '
+  64)
+  Endtagfind = /\s*\/\s*>/u
+  def initialize(verbose=false)
+    super(verbose)
+  end
+  def feed(*args)
+    super(*args)
+  end
+  # Internal -- main parsing loop. Walks @rawdata from the start, sending
+  # plain text to handle_data and markup/references to the matching
+  # parse_*/handle_* method, then stores whatever could not be parsed yet
+  # back into @rawdata.
+  #
+  # _end:: when truthy, the unparsed remainder is flushed through
+  #        handle_data instead of being kept for a later call.
+  #
+  # Raises BetterSGMLParserError (via #error) if the scanner stops on a
+  # character that is neither "<" nor "&".
+  def goahead(_end)
+    rawdata = @rawdata # woo, utf-8 magic
+    i = 0
+    n = rawdata.length
+    while i < n
+      if @nomoretags
+        # Everything that is left is plain character data.
+        handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
+        i = n
+        break
+      end
+      # Emit the text that precedes the next "<" or "&".
+      j = rawdata.index(Interesting, i)
+      j = n unless j
+      handle_data(rawdata[i...j]) if i < j
+      i = j
+      break if (i == n)
+      if rawdata[i..i] == '<' # Yeah, ugly, but I prefer it to rawdata[i] == ?<
+        if rawdata.index(Starttagopen,i) == i
+          if @literal
+            # In literal mode a start tag is just text.
+            handle_data(rawdata[i..i])
+            i = i+1
+            next
+          end
+          k = parse_starttag(i)
+          break unless k # tag not complete yet
+          i = k
+          next
+        end
+        if rawdata.index(Endtagopen,i) == i
+          k = parse_endtag(i)
+          break unless k # tag not complete yet
+          i = k
+          @literal = false
+          next
+        end
+        if @literal
+          if n > (i+1)
+            handle_data("<")
+            i = i+1
+          else
+            #incomplete
+            break
+          end
+          next
+        end
+        if rawdata.index(Commentopen,i) == i
+          k = parse_comment(i)
+          break unless k
+          i = k
+          next
+        end
+        if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
+          # parse_pi returns a length, not an index, hence "+=".
+          k = parse_pi(i)
+          break unless k
+          i += k
+          next
+        end
+        if rawdata.index(Declopen,i) == i
+          # This is some sort of declaration; in "HTML as
+          # deployed," this should only be the document type
+          # declaration ("<!DOCTYPE html...>").
+          k = parse_declaration(i)
+          break unless k
+          i = k
+          next
+        end
+      elsif rawdata[i..i] == '&'
+        if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
+          handle_data(rawdata[i..i])
+          i += 1
+          next
+        end
+        # the Char must come first as its #=~ method is the only one that is UTF-8 safe
+        ni,match = index_match(rawdata, Charref, i)
+        if ni && ni == i # See? Ugly
+          handle_charref(match[1]) # match[1] is the first captured group
+          i += match[0].length  # match[0] is the whole match, including the trailing character
+          # The pattern consumes one character after the reference; give it
+          # back unless it was the terminating ";".
+          i -= 1 unless rawdata[i-1..i-1] == ";"
+          next
+        end
+        ni,match = index_match(rawdata, Entityref, i)
+        if ni && ni == i
+          handle_entityref(match[1])
+          i += match[0].length
+          i -= 1 unless rawdata[i-1..i-1] == ";"
+          next
+        end
+      else
+        error('neither < nor & ??')
+      end
+      # We get here only if incomplete matches but
+      # nothing else
+      ni,match = index_match(rawdata,Incomplete,i)
+      # The match has to start at the current position i (not at offset 0 of
+      # the buffer); otherwise a dangling "&" or "<" anywhere but the very
+      # start of rawdata is emitted as data instead of being held back until
+      # more input arrives.
+      unless ni && ni == i
+        handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
+        i += 1
+        next
+      end
+      j = ni + match[0].length
+      break if j == n # Really incomplete
+      handle_data(rawdata[i...j])
+      i = j
+    end # end while
+    if _end && i < n
+      handle_data(rawdata[i...n])
+      i = n
+    end
+    # Keep only the part that has not been consumed.
+    @rawdata = rawdata[i..-1]
+    # @offset += i # FIXME BUGME another unused variable in SGMLParser?
+  end
+  # Internal -- parse processing instr, return length or -1 if not terminated
+  def parse_pi(i)
+    rawdata = @rawdata
+    if rawdata[i...i+2] != '<?'
+      error("unexpected call to parse_pi()")
+    end
+    ni,match = index_match(rawdata,Piclose,i+2)
+    return nil unless match
+    j = ni
+    handle_pi(rawdata[i+2...j])
+    j = (j + match[0].length)
+    return j-i
+  end
+  def parse_comment(i)
+    rawdata = @rawdata
+    if rawdata[i...i+4] != "<!--"
+      error("unexpected call to parse_comment()")
+    end
+    ni,match = index_match(rawdata, Commentclose,i)
+    return nil unless match
+    handle_comment(rawdata[i+4..(ni-1)])
+    return ni+match[0].length # Length from i to just past the closing comment tag
+  end
+  # Internal -- parse the start tag that begins at index i of @rawdata.
+  # Collects the attributes as [name, value] pairs (names lower-cased,
+  # surrounding quotes removed from values), stores the raw tag text in
+  # @_starttag_text and hands tag and attributes to finish_starttag.
+  #
+  # Returns the index just past the tag, or nil when the tag is not complete
+  # yet. Raises BetterSGMLParserError (via #error) when no tag name is found.
+  def parse_starttag(i)
+    @_starttag_text = nil
+    start_pos = i
+    rawdata = @rawdata
+    ni,match = index_match(rawdata,Shorttagopen,i)
+    if ni == i
+      # NOTE(review): with Shorttagopen as currently defined (it starts with
+      # a literal apostrophe) this branch looks unreachable from goahead,
+      # which only calls us at a "<". If it is ever re-enabled, second_end
+      # may be nil and k is the start of the match, not its end -- verify.
+      # SGML shorthand: <tag/data/ == <tag>data</tag>
+      # XXX Can data contain &... (entity or char refs)?
+      # XXX Can data contain < or > (tag characters)?
+      # XXX Can there be whitespace before the first /?
+      k,match = index_match(rawdata,Shorttag,i)
+      return nil unless match
+      tag, data = match[1], match[2]
+      @_starttag_text = "<#{tag}/"
+      tag.downcase!
+      second_end = rawdata.index(Shorttagopen,k)
+      finish_shorttag(tag, data)
+      @_starttag_text = rawdata[start_pos...second_end+1]
+      return k
+    end
+    # j is the position of the bracket that ends this tag.
+    j = rawdata.index(Endbracket, i+1)
+    return nil unless j
+    attrsd = []
+    if rawdata[i...i+2] == '<>'
+      # SGML shorthand: <> == <last open tag seen>
+      k = j
+      tag = @lasttag
+    else
+      ni,match = index_match(rawdata,Tagfind,i+1)
+      unless match
+        error('unexpected call to parse_starttag')
+      end
+      k = ni+match[0].length+1
+      tag = match[0].downcase
+      @lasttag = tag
+    end
+    while k < j
+      break if rawdata.index(Endtagfind, k) == k
+      ni,match = index_match(rawdata,Attrfind,k)
+      # index_match searches forward through the rest of the buffer, so only
+      # accept an attribute that starts exactly at k. Otherwise text after
+      # the tag (e.g. the "text" in '<a href="x" >text') is picked up as an
+      # attribute of this tag.
+      break unless ni == k
+      matched_length = match[0].length
+      attrname, rest, attrvalue = match[1],match[2],match[3]
+      if rest.nil? || rest.empty?
+        attrvalue = '' # was: = attrname # Why the change?
+      elsif (attrvalue[0..0] == "'" && attrvalue[-1..-1] == "'") ||
+            (attrvalue[0..0] == '"' && attrvalue[-1..-1] == '"')
+        # Compare one-character slices with string literals so that both
+        # quote styles are stripped regardless of what ?c and str[0] return
+        # on the running Ruby version.
+        attrvalue = attrvalue[1...-1]
+      end
+      attrsd << [attrname.downcase, attrvalue]
+      k += matched_length
+    end
+    # Include the closing ">" in the consumed range (the tag may also have
+    # been cut short by a "<").
+    if rawdata[j..j] == ">"
+      j += 1
+    end
+    @_starttag_text = rawdata[start_pos...j]
+    finish_starttag(tag, attrsd)
+    return j
+  end
+  def parse_endtag(i)
+    rawdata = @rawdata
+    j, match = index_match(rawdata, /[<>]/,i+1)
+    return nil unless j
+    tag = rawdata[i+2...j].strip.downcase
+    if rawdata[j..j] == ">"
+      j += 1
+    end
+    finish_endtag(tag)
+    return j
+  end
+  def output
+    # Return processed HTML as a single string
+    return @pieces.map{|p| p.to_s}.join
+  end
+  # Aborts parsing by raising BetterSGMLParserError with the given message.
+  def error(message)
+    raise BetterSGMLParserError, message
+  end
+  # Hook called by parse_pi with the text of a processing instruction.
+  # Intentionally empty: the text is discarded.
+  def handle_pi(text)
+  end
+  # Empty hook: whatever text it is given is discarded.
+  # NOTE(review): presumably invoked by the superclass for declarations --
+  # no caller is visible in this file, confirm against HTML::SGMLParser.
+  def handle_decl(text)
+  end
+end

data/lib/rfeedparser/encoding_helpers.rb ADDED Viewed

@@ -0,0 +1,260 @@
+#!/usr/bin/env ruby
+module FeedParserUtilities
+  def unicode(data, from_encoding)
+    # Takes a single string and converts it from the encoding in
+    # from_encoding to unicode.
+    uconvert(data, from_encoding, 'unicode')
+  end
+  def uconvert(data, from_encoding, to_encoding = 'utf-8')
+    from_encoding = Encoding_Aliases[from_encoding] || from_encoding
+    to_encoding = Encoding_Aliases[to_encoding] || to_encoding
+    Iconv.iconv(to_encoding, from_encoding, data)[0]
+  end
+  def index_match(stri ,regexp, offset)
+    i = stri.index(regexp, offset)
+    return nil, nil unless i
+    full = stri[i..-1].match(regexp)
+    return i, full
+  end
+  def _ebcdic_to_ascii(s)
+    Iconv.iconv("iso-8859-1", "cp500", s)[0]
+  end
+  # Works out which character encoding to use for an XML document from the
+  # HTTP Content-Type header, the leading bytes of the document and its XML
+  # declaration.
+  #
+  # http_headers:: indexed with 'content-type'.
+  #                NOTE(review): the first lookup does not guard against nil,
+  #                although a later branch does -- confirm callers never
+  #                pass nil.
+  # xml_data::     the raw document; only a local copy is converted/trimmed.
+  #
+  # Returns five values: true_encoding, http_encoding, xml_encoding,
+  # sniffed_xml_encoding, acceptable_content_type.
+  def getCharacterEncoding(http_headers, xml_data)
+    # Get the character encoding of the XML document
+    $stderr << "In getCharacterEncoding\n" if $debug
+    sniffed_xml_encoding = nil
+    xml_encoding = nil
+    true_encoding = nil
+    # Split "type/subtype; charset=..." into the media type and the rest.
+    http_content_type, charset = http_headers['content-type'].to_s.split(';',2)
+    encoding_regexp = /\s*charset\s*=\s*(?:"|')?(.*?)(?:"|')?\s*$/
+    http_encoding = charset.to_s.scan(encoding_regexp).flatten[0]
+    # Treat an empty charset value as "not given".
+    http_encoding = nil if http_encoding && http_encoding.empty?
+    # FIXME Open-Uri returns iso8859-1 if there is no charset header,
+    # but that doesn't pass the tests. Open-Uri claims its following
+    # the right RFC. Are they wrong or do we need to change the tests?
+    # Must sniff for non-ASCII-compatible character encodings before
+    # searching for XML declaration.  This heuristic is defined in
+    # section F of the XML specification:
+    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+    begin
+      # Each branch compares the first bytes against a known signature and,
+      # where needed, converts the local xml_data so the declaration regexp
+      # below can read it.
+      if xml_data[0..3] == "\x4c\x6f\xa7\x94"
+        # EBCDIC
+        xml_data = _ebcdic_to_ascii(xml_data)
+      elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
+        # UTF-16BE
+        sniffed_xml_encoding = 'utf-16be'
+        xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
+      elsif xml_data.size >= 4 && xml_data[0..1] == "\xfe\xff" && xml_data[2..3] != "\x00\x00"
+        # UTF-16BE with BOM
+        sniffed_xml_encoding = 'utf-16be'
+        xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
+      elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
+        # UTF-16LE
+        sniffed_xml_encoding = 'utf-16le'
+        xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
+      elsif xml_data.size >=4 && xml_data[0..1] == "\xff\xfe" && xml_data[2..3] != "\x00\x00"
+        # UTF-16LE with BOM
+        sniffed_xml_encoding = 'utf-16le'
+        xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
+      elsif xml_data[0..3] == "\x00\x00\x00\x3c"
+        # UTF-32BE
+        sniffed_xml_encoding = 'utf-32be'
+        xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
+      elsif xml_data[0..3] == "\x3c\x00\x00\x00"
+        # UTF-32LE
+        sniffed_xml_encoding = 'utf-32le'
+        xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
+      elsif xml_data[0..3] == "\x00\x00\xfe\xff"
+        # UTF-32BE with BOM
+        sniffed_xml_encoding = 'utf-32be'
+        xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
+      elsif xml_data[0..3] == "\xff\xfe\x00\x00"
+        # UTF-32LE with BOM
+        sniffed_xml_encoding = 'utf-32le'
+        xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
+      elsif xml_data[0..2] == "\xef\xbb\xbf"
+        # UTF-8 with BOM
+        sniffed_xml_encoding = 'utf-8'
+        xml_data = xml_data[3..-1]
+      else
+        # ASCII-compatible
+      end
+      # NOTE(review): "^" matches at the start of any line, not only at the
+      # start of the document -- confirm that is acceptable here.
+      xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
+    rescue
+      # Any error while sniffing or converting means "no declaration found".
+      xml_encoding_match = nil
+    end
+    if xml_encoding_match
+      xml_encoding = xml_encoding_match[1].downcase
+      # Names listed here are replaced by the sniffed encoding when one was
+      # detected from the leading bytes.
+      xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
+      if sniffed_xml_encoding && xencodings.include?(xml_encoding)
+        xml_encoding = sniffed_xml_encoding
+      end
+    end
+    # Pick the final encoding according to the media type.
+    acceptable_content_type = false
+    application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
+    text_content_types = ['text/xml', 'text/xml-external-parsed-entity']
+    if application_content_types.include?(http_content_type) || (/^application\// =~ http_content_type && /\+xml$/ =~ http_content_type)
+      # application/*xml: HTTP charset, then XML declaration, then utf-8.
+      acceptable_content_type = true
+      true_encoding = http_encoding || xml_encoding || 'utf-8'
+    elsif text_content_types.include?(http_content_type) || (/^text\// =~ http_content_type && /\+xml$/ =~ http_content_type)
+      # text/*xml: the XML declaration is ignored; default is us-ascii.
+      acceptable_content_type = true
+      true_encoding = http_encoding || 'us-ascii'
+    elsif /^text\// =~ http_content_type
+      # Other text/* types: same choice, but not flagged as acceptable.
+      true_encoding = http_encoding || 'us-ascii'
+    elsif http_headers && !http_headers.empty? && !http_headers['content-type']
+      # Headers present but no Content-Type among them.
+      true_encoding = xml_encoding || 'iso-8859-1'
+    else
+      true_encoding = xml_encoding || 'utf-8'
+    end
+    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
+  end
+  def toUTF8(data, encoding)
+    $stderr << "entering self.toUTF8, trying encoding #{encoding}\n" if $debug
+    # NOTE we must use double quotes when dealing with \x encodings!
+    if (data.size >= 4 && data[0..1] == "\xfe\xff" && data[2..3] != "\x00\x00")
+      if $debug
+        $stderr << "stripping BOM\n"
+        if encoding != 'utf-16be'
+          $stderr << "string utf-16be instead\n"
+        end
+      end
+      encoding = 'utf-16be'
+      data = data[2..-1]
+    elsif (data.size >= 4 && data[0..1] == "\xff\xfe" && data[2..3] != "\x00\x00")
+      if $debug
+        $stderr << "stripping BOM\n"
+        $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
+      end
+      encoding = 'utf-16le'
+      data = data[2..-1]
+    elsif (data[0..2] == "\xef\xbb\xbf")
+      if $debug
+        $stderr << "stripping BOM\n"
+        $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
+      end
+      encoding = 'utf-8'
+      data = data[3..-1]
+    elsif (data[0..3] == "\x00\x00\xfe\xff")
+      if $debug
+        $stderr << "stripping BOM\n"
+        if encoding != 'utf-32be'
+          $stderr << "trying utf-32be instead\n"
+        end
+      end
+      encoding = 'utf-32be'
+      data = data[4..-1]
+    elsif (data[0..3] == "\xff\xfe\x00\x00")
+      if $debug
+        $stderr << "stripping BOM\n"
+        if encoding != 'utf-32le'
+          $stderr << "trying utf-32le instead\n"
+        end
+      end
+      encoding = 'utf-32le'
+      data = data[4..-1]
+    end
+    begin
+      newdata = uconvert(data, encoding, 'utf-8')
+    rescue => details
+      raise details
+    end
+    $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
+    declmatch = /^<\?xml[^>]*?>/
+    newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
+    if declmatch =~ newdata
+      newdata.sub!(declmatch, newdecl)
+    else
+      newdata = newdecl + "\n" + newdata
+    end
+    newdata
+  end
+end
+unless defined?(Builder::XChar)
+  # http://intertwingly.net/stories/2005/09/28/xchar.rb
+  module XChar
+    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
+    CP1252 = {
+      128 => 8364, # euro sign
+      130 => 8218, # single low-9 quotation mark
+      131 =>  402, # latin small letter f with hook
+      132 => 8222, # double low-9 quotation mark
+      133 => 8230, # horizontal ellipsis
+      134 => 8224, # dagger
+      135 => 8225, # double dagger
+      136 =>  710, # modifier letter circumflex accent
+      137 => 8240, # per mille sign
+      138 =>  352, # latin capital letter s with caron
+      139 => 8249, # single left-pointing angle quotation mark
+      140 =>  338, # latin capital ligature oe
+      142 =>  381, # latin capital letter z with caron
+      145 => 8216, # left single quotation mark
+      146 => 8217, # right single quotation mark
+      147 => 8220, # left double quotation mark
+      148 => 8221, # right double quotation mark
+      149 => 8226, # bullet
+      150 => 8211, # en dash
+      151 => 8212, # em dash
+      152 =>  732, # small tilde
+      153 => 8482, # trade mark sign
+      154 =>  353, # latin small letter s with caron
+      155 => 8250, # single right-pointing angle quotation mark
+      156 =>  339, # latin small ligature oe
+      158 =>  382, # latin small letter z with caron
+      159 =>  376 # latin capital letter y with diaeresis
+    }
+    # http://www.w3.org/TR/REC-xml/#dt-chardata
+    PREDEFINED = {
+      38 => '&amp;', # ampersand
+      60 => '&lt;',  # left angle bracket
+      62 => '&gt;'  # right angle bracket
+    }
+    # http://www.w3.org/TR/REC-xml/#charsets
+    VALID = [
+      0x9, 0xA, 0xD,
+      (0x20..0xD7FF),
+      (0xE000..0xFFFD),
+      (0x10000..0x10FFFF)
+    ]
+  end
+  class Fixnum
+    # xml escaped version of chr
+    def xchr
+      n = XChar::CP1252[self] || self
+      case n when *XChar::VALID
+        XChar::PREDEFINED[n] || (n<128 ? n.chr : "&##{n};")
+      else
+        '*'
+      end
+    end
+  end
+  class String
+    def to_xs
+      unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
+    rescue
+      unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
+    end
+  end
+end