RubyGems - jgre-rfeedparser - Versions diffs - 0.9.961 - Mend

jgre-rfeedparser 0.9.961

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

data/LICENSE +68 -0
data/README +50 -0
data/RUBY-TESTING +66 -0
data/lib/rfeedparser.rb +551 -0
data/lib/rfeedparser/aliases.rb +432 -0
data/lib/rfeedparser/better_attributelist.rb +41 -0
data/lib/rfeedparser/better_sgmlparser.rb +264 -0
data/lib/rfeedparser/encoding_helpers.rb +260 -0
data/lib/rfeedparser/feedparserdict.rb +106 -0
data/lib/rfeedparser/loose_feed_parser.rb +75 -0
data/lib/rfeedparser/markup_helpers.rb +71 -0
data/lib/rfeedparser/monkey_patches.rb +10 -0
data/lib/rfeedparser/nokogiri_parser.rb +80 -0
data/lib/rfeedparser/parser_mixin.rb +1275 -0
data/lib/rfeedparser/scrub.rb +212 -0
data/lib/rfeedparser/time_helpers.rb +408 -0
data/lib/rfeedparser/utilities.rb +23 -0
metadata +187 -0

data/lib/rfeedparser/better_sgmlparser.rb ADDED

@@ -0,0 +1,264 @@
+#!/usr/bin/env ruby
+class BetterSGMLParserError < StandardError; end;
+class BetterSGMLParser < HTML::SGMLParser
+  # Replaced Tagfind and Charref Regexps with the ones in feedparser.py
+  # This makes things work.
+  Interesting = /[&<]/u
+  Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|![^<>]*)?', 64)
+  # 64 is the unicode flag
+  Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*)[^-.a-zA-Z0-9]/u
+  Charref = /&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]/u
+  Shorttagopen = /'<[a-zA-Z][-.a-zA-Z0-9]*/u
+  Shorttag = /'<([a-zA-Z][-.a-zA-Z0-9]*)\/([^\/]*)\//u
+  Endtagopen = /<\//u # Changed the RegExps to match the Python SGMLParser
+  Endbracket = /[<>]/u
+  Declopen = /<!/u
+  Piopenbegin = /^<\?/u
+  Piclose = />/u
+  Commentopen = /<!--/u
+  Commentclose = /--\s*>/u
+  Tagfind = /[a-zA-Z][-_.:a-zA-Z0-9]*/u
+  Attrfind = Regexp.compile('\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'+
+  '(\'[^\']*\'|"[^"]*"|[\]\[\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?', # '
+  64)
+  Endtagfind = /\s*\/\s*>/u
+  def initialize(verbose=false)
+    super(verbose)
+  end
+  def feed(*args)
+    super(*args)
+  end
+  # Main scanning loop (modelled on the Python SGMLParser, per the comments
+  # in this class). Walks @rawdata from the start and dispatches plain text,
+  # tags, comments, processing instructions, declarations and entity/char
+  # references to the matching handle_*/parse_* method. Whatever could not be
+  # consumed yet is left in @rawdata for the next call.
+  #
+  # _end:: when true, an unconsumed tail is flushed through handle_data
+  #        instead of being kept back as possibly incomplete markup.
+  def goahead(_end)
+    rawdata = @rawdata # woo, utf-8 magic
+    i = 0
+    n = rawdata.length
+    while i < n
+      if @nomoretags
+        # Everything that is left is plain data.
+        handle_data(rawdata[i...n]) # i...n means "range from i to n not including n"
+        i = n
+        break
+      end
+      # Emit the text run in front of the next "<" or "&".
+      j = rawdata.index(Interesting, i)
+      j = n unless j
+      handle_data(rawdata[i...j]) if i < j
+      i = j
+      break if (i == n)
+      if rawdata[i..i] == '<' # Yeah, ugly, but I prefer it to rawdata[i] == ?<
+        if rawdata.index(Starttagopen,i) == i
+          if @literal
+            # In literal mode a start tag is just text.
+            handle_data(rawdata[i..i])
+            i = i+1
+            next
+          end
+          k = parse_starttag(i)
+          break unless k # tag not complete yet
+          i = k
+          next
+        end
+        if rawdata.index(Endtagopen,i) == i
+          k = parse_endtag(i)
+          break unless k # tag not complete yet
+          i = k
+          @literal = false
+          next
+        end
+        if @literal
+          if n > (i+1)
+            handle_data("<")
+            i = i+1
+          else
+            #incomplete
+            break
+          end
+          next
+        end
+        if rawdata.index(Commentopen,i) == i
+          k = parse_comment(i)
+          break unless k
+          i = k
+          next
+        end
+        if rawdata.index(Piopenbegin,i) == i # Like Piopen but must be at beginning of rawdata
+          k = parse_pi(i)
+          break unless k
+          i += k # parse_pi returns a length, not an index
+          next
+        end
+        if rawdata.index(Declopen,i) == i
+          # This is some sort of declaration; in "HTML as
+          # deployed," this should only be the document type
+          # declaration ("<!DOCTYPE html...>").
+          k = parse_declaration(i)
+          break unless k
+          i = k
+          next
+        end
+      elsif rawdata[i..i] == '&'
+        if @literal # FIXME BUGME SGMLParser totally does not check this. Bug it.
+          handle_data(rawdata[i..i])
+          i += 1
+          next
+        end
+        # the Char must come first as its #=~ method is the only one that is UTF-8 safe
+        ni,match = index_match(rawdata, Charref, i)
+        if ni && ni == i # only a match that starts exactly at i counts
+          handle_charref(match[1])
+          i += match[0].length
+          # The pattern swallows one terminating character; give it back
+          # unless it was the ";" that belongs to the reference.
+          i -= 1 unless rawdata[i-1..i-1] == ";"
+          next
+        end
+        ni,match = index_match(rawdata, Entityref, i)
+        if ni && ni == i
+          handle_entityref(match[1])
+          i += match[0].length
+          i -= 1 unless rawdata[i-1..i-1] == ";"
+          next
+        end
+      else
+        error('neither < nor & ??')
+      end
+      # We get here only if incomplete matches but
+      # nothing else
+      ni,match = index_match(rawdata,Incomplete,i)
+      # index_match returns an absolute index, so the match has to be compared
+      # with i (not 0); otherwise unfinished markup is only held back when it
+      # happens to sit at the very start of the buffer.
+      unless ni && ni == i
+        handle_data(rawdata[i...i+1]) # str[i...i+1] == str[i..i]
+        i += 1
+        next
+      end
+      j = ni + match[0].length
+      break if j == n # Really incomplete
+      handle_data(rawdata[i...j])
+      i = j
+    end # end while
+    if _end && i < n
+      # No more input is coming: flush the leftover as plain data.
+      handle_data(rawdata[i...n])
+      i = n
+    end
+    @rawdata = rawdata[i..-1]
+    # @offset += i # FIXME BUGME another unused variable in SGMLParser?
+  end
+  # Internal -- parse processing instr, return length or -1 if not terminated
+  def parse_pi(i)
+    rawdata = @rawdata
+    if rawdata[i...i+2] != '<?'
+      error("unexpected call to parse_pi()")
+    end
+    ni,match = index_match(rawdata,Piclose,i+2)
+    return nil unless match
+    j = ni
+    handle_pi(rawdata[i+2...j])
+    j = (j + match[0].length)
+    return j-i
+  end
+  def parse_comment(i)
+    rawdata = @rawdata
+    if rawdata[i...i+4] != "<!--"
+      error("unexpected call to parse_comment()")
+    end
+    ni,match = index_match(rawdata, Commentclose,i)
+    return nil unless match
+    handle_comment(rawdata[i+4..(ni-1)])
+    return ni+match[0].length # Length from i to just past the closing comment tag
+  end
+  # Internal -- parse the start tag that begins at index i. Collects the tag
+  # name and its attributes as [name, value] pairs (names lower-cased), stores
+  # the raw tag text in @_starttag_text, calls finish_starttag and returns the
+  # index just past the tag. Returns nil when the tag is not complete yet.
+  def parse_starttag(i)
+    @_starttag_text = nil
+    start_pos = i
+    rawdata = @rawdata
+    # NOTE(review): Shorttagopen starts with a literal "'" while rawdata[i] is
+    # "<" whenever goahead calls this, so this branch looks unreachable from
+    # there -- confirm before relying on (or repairing) the short-tag support.
+    ni,match = index_match(rawdata,Shorttagopen,i)
+    if ni == i
+      # SGML shorthand: <tag/data/ == <tag>data</tag>
+      # XXX Can data contain &... (entity or char refs)?
+      # XXX Can data contain < or > (tag characters)?
+      # XXX Can there be whitespace before the first /?
+      k,match = index_match(rawdata,Shorttag,i)
+      return nil unless match
+      tag, data = match[1], match[2]
+      @_starttag_text = "<#{tag}/"
+      tag.downcase!
+      second_end = rawdata.index(Shorttagopen,k)
+      finish_shorttag(tag, data)
+      @_starttag_text = rawdata[start_pos...second_end+1]
+      return k
+    end
+    j = rawdata.index(Endbracket, i+1)
+    return nil unless j
+    attrsd = []
+    if rawdata[i...i+2] == '<>'
+      # SGML shorthand: <> == <last open tag seen>
+      k = j
+      tag = @lasttag
+    else
+      ni,match = index_match(rawdata,Tagfind,i+1)
+      unless match
+        error('unexpected call to parse_starttag')
+      end
+      k = ni+match[0].length+1
+      tag = match[0].downcase
+      @lasttag = tag
+    end
+    while k < j
+      break if rawdata.index(Endtagfind, k) == k
+      ni,match = index_match(rawdata,Attrfind,k)
+      break unless ni
+      matched_length = match[0].length
+      attrname, rest, attrvalue = match[1],match[2],match[3]
+      if rest.nil? || rest.empty?
+        attrvalue = '' # attribute given without a value
+      else
+        # Compare one-character substrings with string literals so the check
+        # behaves the same on every Ruby version (character literals such as
+        # ?' are Integers on 1.8 and never equal a substring there).
+        first, last = attrvalue[0..0], attrvalue[-1..-1]
+        if first == last && (first == "'" || first == '"')
+          attrvalue = attrvalue[1...-1]
+        end
+      end
+      attrsd << [attrname.downcase, attrvalue]
+      k += matched_length
+    end
+    if rawdata[j..j] == ">"
+      j += 1
+    end
+    @_starttag_text = rawdata[start_pos...j]
+    finish_starttag(tag, attrsd)
+    return j
+  end
+  def parse_endtag(i)
+    rawdata = @rawdata
+    j, match = index_match(rawdata, /[<>]/,i+1)
+    return nil unless j
+    tag = rawdata[i+2...j].strip.downcase
+    if rawdata[j..j] == ">"
+      j += 1
+    end
+    finish_endtag(tag)
+    return j
+  end
+  def output
+    # Return processed HTML as a single string
+    return @pieces.map{|p| p.to_s}.join
+  end
+  # Raises BetterSGMLParserError carrying the given message.
+  def error(message)
+    raise BetterSGMLParserError, message
+  end
+  def handle_pi(text)
+  end
+  def handle_decl(text)
+  end
+end

data/lib/rfeedparser/encoding_helpers.rb ADDED

@@ -0,0 +1,260 @@
+#!/usr/bin/env ruby
+module FeedParserUtilities
+  def unicode(data, from_encoding)
+    # Takes a single string and converts it from the encoding in
+    # from_encoding to unicode.
+    uconvert(data, from_encoding, 'unicode')
+  end
+  def uconvert(data, from_encoding, to_encoding = 'utf-8')
+    from_encoding = Encoding_Aliases[from_encoding] || from_encoding
+    to_encoding = Encoding_Aliases[to_encoding] || to_encoding
+    Iconv.iconv(to_encoding, from_encoding, data)[0]
+  end
+  def index_match(stri ,regexp, offset)
+    i = stri.index(regexp, offset)
+    return nil, nil unless i
+    full = stri[i..-1].match(regexp)
+    return i, full
+  end
+  def _ebcdic_to_ascii(s)
+    Iconv.iconv("iso-8859-1", "cp500", s)[0]
+  end
+  def getCharacterEncoding(http_headers, xml_data)
+    # Get the character encoding of the XML document
+    $stderr << "In getCharacterEncoding\n" if $debug
+    sniffed_xml_encoding = nil
+    xml_encoding = nil
+    true_encoding = nil
+    http_content_type, charset = http_headers['content-type'].to_s.split(';',2)
+    encoding_regexp = /\s*charset\s*=\s*(?:"|')?(.*?)(?:"|')?\s*$/
+    http_encoding = charset.to_s.scan(encoding_regexp).flatten[0]
+    http_encoding = nil if http_encoding && http_encoding.empty?
+    # FIXME Open-Uri returns iso8859-1 if there is no charset header,
+    # but that doesn't pass the tests. Open-Uri claims its following
+    # the right RFC. Are they wrong or do we need to change the tests?
+    # Must sniff for non-ASCII-compatible character encodings before
+    # searching for XML declaration.  This heuristic is defined in
+    # section F of the XML specification:
+    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
+    begin
+      if xml_data[0..3] == "\x4c\x6f\xa7\x94"
+        # EBCDIC
+        xml_data = _ebcdic_to_ascii(xml_data)
+      elsif xml_data[0..3] == "\x00\x3c\x00\x3f"
+        # UTF-16BE
+        sniffed_xml_encoding = 'utf-16be'
+        xml_data = uconvert(xml_data, 'utf-16be', 'utf-8')
+      elsif xml_data.size >= 4 && xml_data[0..1] == "\xfe\xff" && xml_data[2..3] != "\x00\x00"
+        # UTF-16BE with BOM
+        sniffed_xml_encoding = 'utf-16be'
+        xml_data = uconvert(xml_data[2..-1], 'utf-16be', 'utf-8')
+      elsif xml_data[0..3] == "\x3c\x00\x3f\x00"
+        # UTF-16LE
+        sniffed_xml_encoding = 'utf-16le'
+        xml_data = uconvert(xml_data, 'utf-16le', 'utf-8')
+      elsif xml_data.size >=4 && xml_data[0..1] == "\xff\xfe" && xml_data[2..3] != "\x00\x00"
+        # UTF-16LE with BOM
+        sniffed_xml_encoding = 'utf-16le'
+        xml_data = uconvert(xml_data[2..-1], 'utf-16le', 'utf-8')
+      elsif xml_data[0..3] == "\x00\x00\x00\x3c"
+        # UTF-32BE
+        sniffed_xml_encoding = 'utf-32be'
+        xml_data = uconvert(xml_data, 'utf-32be', 'utf-8')
+      elsif xml_data[0..3] == "\x3c\x00\x00\x00"
+        # UTF-32LE
+        sniffed_xml_encoding = 'utf-32le'
+        xml_data = uconvert(xml_data, 'utf-32le', 'utf-8')
+      elsif xml_data[0..3] == "\x00\x00\xfe\xff"
+        # UTF-32BE with BOM
+        sniffed_xml_encoding = 'utf-32be'
+        xml_data = uconvert(xml_data[4..-1], 'utf-32BE', 'utf-8')
+      elsif xml_data[0..3] == "\xff\xfe\x00\x00"
+        # UTF-32LE with BOM
+        sniffed_xml_encoding = 'utf-32le'
+        xml_data = uconvert(xml_data[4..-1], 'utf-32le', 'utf-8')
+      elsif xml_data[0..2] == "\xef\xbb\xbf"
+        # UTF-8 with BOM
+        sniffed_xml_encoding = 'utf-8'
+        xml_data = xml_data[3..-1]
+      else
+        # ASCII-compatible
+      end
+      xml_encoding_match = /^<\?.*encoding=[\'"](.*?)[\'"].*\?>/.match(xml_data)
+    rescue
+      xml_encoding_match = nil
+    end
+    if xml_encoding_match
+      xml_encoding = xml_encoding_match[1].downcase
+      xencodings = ['iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16']
+      if sniffed_xml_encoding && xencodings.include?(xml_encoding)
+        xml_encoding = sniffed_xml_encoding
+      end
+    end
+    acceptable_content_type = false
+    application_content_types = ['application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity']
+    text_content_types = ['text/xml', 'text/xml-external-parsed-entity']
+    if application_content_types.include?(http_content_type) || (/^application\// =~ http_content_type && /\+xml$/ =~ http_content_type)
+      acceptable_content_type = true
+      true_encoding = http_encoding || xml_encoding || 'utf-8'
+    elsif text_content_types.include?(http_content_type) || (/^text\// =~ http_content_type && /\+xml$/ =~ http_content_type)
+      acceptable_content_type = true
+      true_encoding = http_encoding || 'us-ascii'
+    elsif /^text\// =~ http_content_type
+      true_encoding = http_encoding || 'us-ascii'
+    elsif http_headers && !http_headers.empty? && !http_headers['content-type']
+      true_encoding = xml_encoding || 'iso-8859-1'
+    else
+      true_encoding = xml_encoding || 'utf-8'
+    end
+    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
+  end
+  def toUTF8(data, encoding)
+    $stderr << "entering self.toUTF8, trying encoding #{encoding}\n" if $debug
+    # NOTE we must use double quotes when dealing with \x encodings!
+    if (data.size >= 4 && data[0..1] == "\xfe\xff" && data[2..3] != "\x00\x00")
+      if $debug
+        $stderr << "stripping BOM\n"
+        if encoding != 'utf-16be'
+          $stderr << "string utf-16be instead\n"
+        end
+      end
+      encoding = 'utf-16be'
+      data = data[2..-1]
+    elsif (data.size >= 4 && data[0..1] == "\xff\xfe" && data[2..3] != "\x00\x00")
+      if $debug
+        $stderr << "stripping BOM\n"
+        $stderr << "trying utf-16le instead\n" if encoding != 'utf-16le'
+      end
+      encoding = 'utf-16le'
+      data = data[2..-1]
+    elsif (data[0..2] == "\xef\xbb\xbf")
+      if $debug
+        $stderr << "stripping BOM\n"
+        $stderr << "trying utf-8 instead\n" if encoding != 'utf-8'
+      end
+      encoding = 'utf-8'
+      data = data[3..-1]
+    elsif (data[0..3] == "\x00\x00\xfe\xff")
+      if $debug
+        $stderr << "stripping BOM\n"
+        if encoding != 'utf-32be'
+          $stderr << "trying utf-32be instead\n"
+        end
+      end
+      encoding = 'utf-32be'
+      data = data[4..-1]
+    elsif (data[0..3] == "\xff\xfe\x00\x00")
+      if $debug
+        $stderr << "stripping BOM\n"
+        if encoding != 'utf-32le'
+          $stderr << "trying utf-32le instead\n"
+        end
+      end
+      encoding = 'utf-32le'
+      data = data[4..-1]
+    end
+    begin
+      newdata = uconvert(data, encoding, 'utf-8')
+    rescue => details
+      raise details
+    end
+    $stderr << "successfully converted #{encoding} data to utf-8\n" if $debug
+    declmatch = /^<\?xml[^>]*?>/
+    newdecl = "<?xml version=\'1.0\' encoding=\'utf-8\'?>"
+    if declmatch =~ newdata
+      newdata.sub!(declmatch, newdecl)
+    else
+      newdata = newdecl + "\n" + newdata
+    end
+    newdata
+  end
+end
+unless defined?(Builder::XChar)
+  # http://intertwingly.net/stories/2005/09/28/xchar.rb
+  # Lookup tables used by the xchr method defined below to turn a character
+  # code into XML-safe text.
+  module XChar
+    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
+    # Maps codes 128-159 (as used by Windows-1252) to the Unicode code point
+    # named in the comment on each line.
+    CP1252 = {
+      128 => 8364, # euro sign
+      130 => 8218, # single low-9 quotation mark
+      131 =>  402, # latin small letter f with hook
+      132 => 8222, # double low-9 quotation mark
+      133 => 8230, # horizontal ellipsis
+      134 => 8224, # dagger
+      135 => 8225, # double dagger
+      136 =>  710, # modifier letter circumflex accent
+      137 => 8240, # per mille sign
+      138 =>  352, # latin capital letter s with caron
+      139 => 8249, # single left-pointing angle quotation mark
+      140 =>  338, # latin capital ligature oe
+      142 =>  381, # latin capital letter z with caron
+      145 => 8216, # left single quotation mark
+      146 => 8217, # right single quotation mark
+      147 => 8220, # left double quotation mark
+      148 => 8221, # right double quotation mark
+      149 => 8226, # bullet
+      150 => 8211, # en dash
+      151 => 8212, # em dash
+      152 =>  732, # small tilde
+      153 => 8482, # trade mark sign
+      154 =>  353, # latin small letter s with caron
+      155 => 8250, # single right-pointing angle quotation mark
+      156 =>  339, # latin small ligature oe
+      158 =>  382, # latin small letter z with caron
+      159 =>  376 # latin capital letter y with diaeresis
+    }
+    # http://www.w3.org/TR/REC-xml/#dt-chardata
+    # Character codes that are written as a named entity.
+    PREDEFINED = {
+      38 => '&amp;', # ampersand
+      60 => '&lt;',  # left angle bracket
+      62 => '&gt;'  # right angle bracket
+    }
+    # http://www.w3.org/TR/REC-xml/#charsets
+    # Codes and code ranges accepted by xchr; anything else becomes '*'.
+    VALID = [
+      0x9, 0xA, 0xD,
+      (0x20..0xD7FF),
+      (0xE000..0xFFFD),
+      (0x10000..0x10FFFF)
+    ]
+  end
+  class Fixnum
+    # xml escaped version of chr
+    def xchr
+      n = XChar::CP1252[self] || self
+      case n when *XChar::VALID
+        XChar::PREDEFINED[n] || (n<128 ? n.chr : "&##{n};")
+      else
+        '*'
+      end
+    end
+  end
+  class String
+    def to_xs
+      unpack('U*').map {|n| n.xchr}.join # ASCII, UTF-8
+    rescue
+      unpack('C*').map {|n| n.xchr}.join # ISO-8859-1, WIN-1252
+    end
+  end
+end