RubyGems - mailparser - Versions diffs - 0.4.22a → 0.5.0.beta1 - Mend

mailparser 0.4.22a → 0.5.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

data/README.txt +2 -6
data/lib/mailparser/conv_charset.rb +40 -19
data/lib/mailparser/error.rb +1 -0
data/lib/mailparser/loose.rb +22 -20
data/lib/mailparser/rfc2045/parser.rb +140 -140
data/lib/mailparser/rfc2045/scanner.rb +15 -14
data/lib/mailparser/rfc2045.rb +1 -0
data/lib/mailparser/rfc2047.rb +26 -37
data/lib/mailparser/rfc2183/parser.rb +2 -1
data/lib/mailparser/rfc2183/scanner.rb +1 -0
data/lib/mailparser/rfc2183.rb +1 -0
data/lib/mailparser/rfc2231.rb +6 -5
data/lib/mailparser/rfc2822/parser.rb +584 -544
data/lib/mailparser/rfc2822/scanner.rb +21 -21
data/lib/mailparser/rfc2822.rb +1 -0
data/lib/mailparser.rb +83 -209
data/test/test_loose.rb +17 -8
data/test/test_mailparser.rb +88 -183
data/test/test_rfc2045.rb +1 -1
data/test/test_rfc2047.rb +35 -13
data/test/test_rfc2183.rb +1 -1
data/test/test_rfc2822.rb +6 -2
metadata +22 -9
data/HISTORY +0 -141
data/lib/mailparser/obsolete.rb +0 -403
data/test/test_obsolete.rb +0 -615

data/lib/mailparser/rfc2822/scanner.rb CHANGED

@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# coding: ascii-8bit
 # Copyright (C) 2006-2010 TOMITA Masahiro
 # mailto:tommy@tmtm.org
@@ -25,40 +25,40 @@ class MailParser::RFC2822::Scanner
   def scan()
     until @ss.eos?
       case
-      when s = @ss.scan(/\s*\(/nmo)
+      when s = @ss.scan(/\s*\(/)
         @token << cfws(@ss)
         @token_idx[@token.last.object_id] = @token.size-1
-      when s = @ss.scan(/\s+/nmo)
+      when s = @ss.scan(/\s+/)
         @token << s
         @token_idx[s.object_id] = @token.size-1
-      when s = @ss.scan(/\"(\\[#{TEXT_RE}]|[#{QTEXT_RE}])*\"/no)
+      when s = @ss.scan(/\"(\\[#{TEXT_RE}]|[#{QTEXT_RE}])*\"/o)
         @token << s
         @token_idx[s.object_id] = @token.size-1
-        yield :NO_FOLD_QUOTE, s
-      when s = @ss.scan(/\"(\s*(\\[#{TEXT_RE}]|[#{QTEXT_RE}]))*\s*\"/nmo)
+        yield [:NO_FOLD_QUOTE, s]
+      when s = @ss.scan(/\"(\s*(\\[#{TEXT_RE}]|[#{QTEXT_RE}]))*\s*\"/o)
         @token << s
         @token_idx[s.object_id] = @token.size-1
-        yield :QUOTED_STRING, s
-      when s = @ss.scan(/\[(\\[#{TEXT_RE}]|[#{DTEXT_RE}])*\]/no)
+        yield [:QUOTED_STRING, s]
+      when s = @ss.scan(/\[(\\[#{TEXT_RE}]|[#{DTEXT_RE}])*\]/o)
         @token << s
         @token_idx[s.object_id] = @token.size-1
-        yield :NO_FOLD_LITERAL, s
-      when s = @ss.scan(/\[(\s*(\\[#{TEXT_RE}]|[#{DTEXT_RE}]))*\s*\]/nmo)
+        yield [:NO_FOLD_LITERAL, s]
+      when s = @ss.scan(/\[(\s*(\\[#{TEXT_RE}]|[#{DTEXT_RE}]))*\s*\]/o)
         @token << s
         @token_idx[s.object_id] = @token.size-1
-        yield :DOMAIN_LITERAL, s
-      when s = @ss.scan(/[#{ATEXT_RE}]+/no)
+        yield [:DOMAIN_LITERAL, s]
+      when s = @ss.scan(/[#{ATEXT_RE}]+/o)
         @token << s
         @token_idx[s.object_id] = @token.size-1
         if s =~ /\A\d+\z/ then
-          yield :DIGIT, s
+          yield [:DIGIT, s]
         else
-          yield :ATOM, s
+          yield [:ATOM, s]
         end
-      when s = @ss.scan(/./no)
+      when s = @ss.scan(/./)
         @token << s
         @token_idx[s.object_id] = @token.size-1
-        yield s, s
+        yield [s, s]
       end
     end
     yield nil
@@ -73,9 +73,9 @@ class MailParser::RFC2822::Scanner
     comments = []
     while true
       c = cfws_sub(ss)
-      ss.skip(/\s+/nmo)
+      ss.skip(/\s+/)
       comments << "(#{c})"
-      break unless @ss.scan(/\(/no)
+      break unless @ss.scan(/\(/)
     end
     @comments.concat comments
     return comments.join
@@ -86,12 +86,12 @@ class MailParser::RFC2822::Scanner
   def cfws_sub(ss)
     ret = ""
     until ss.eos? do
-      if ss.scan(/(\s*(\\[#{TEXT_RE}]|[#{CTEXT_RE}]))*\s*/nmo) then
+      if ss.scan(/(\s*(\\[#{TEXT_RE}]|[#{CTEXT_RE}]))*\s*/o) then
         ret << ss.matched
       end
-      if ss.scan(/\)/no) then      # 「)」が来たら復帰
+      if ss.scan(/\)/) then      # 「)」が来たら復帰
         return ret
-      elsif ss.scan(/\(/no) then      # 「(」が来たら再帰
+      elsif ss.scan(/\(/) then      # 「(」が来たら再帰
         c = cfws_sub(ss)
         break if c.nil?
         ret << "(" << c << ")"

data/lib/mailparser/rfc2822.rb CHANGED

@@ -1,3 +1,4 @@
+# coding: ascii-8bit
 # Copyright (C) 2006-2010 TOMITA Masahiro
 # mailto:tommy@tmtm.org

data/lib/mailparser.rb CHANGED

@@ -1,5 +1,5 @@
-# -*- coding: utf-8 -*-
-# Copyright (C) 2006-2010 TOMITA Masahiro
+# coding: ascii-8bit
+# Copyright (C) 2006-2011 TOMITA Masahiro
 # mailto:tommy@tmtm.org
 require "mailparser/error"
@@ -10,6 +10,7 @@ require "mailparser/rfc2231"
 require "mailparser/rfc2822"
 require "mailparser/loose"
 require "mailparser/conv_charset"
+require "mmapscanner"
 require "stringio"
 require "tempfile"
@@ -164,45 +165,87 @@ module MailParser
   # メール全体またはひとつのパートを表すクラス
   class Message
     # src からヘッダ部を読み込み Header オブジェクトに保持する
-    # src:: gets メソッドを持つオブジェクト(ex. IO, StringIO)
+    # src:: String / File / MmapScanner / read メソッドを持つオブジェクト
     # opt:: オプション(Hash)
-    #  :skip_body::            本文をスキップする
-    #  :text_body_only::       text/* type 以外の本文をスキップする
     #  :extract_message_type:: message/* type を展開する
     #  :decode_mime_header::   MIMEヘッダをデコードする
     #  :decode_mime_filename:: ファイル名を MIME デコードする
     #  :output_charset::       デコード出力文字コード(デフォルト: 変換しない)
     #  :strict::               RFC違反時に ParseError 例外を発生する
-    #  :keep_raw::             生メッセージを保持する
     #  :charset_converter::    文字コード変換用 Proc または Method
-    #  :use_file::             body, raw がこのサイズを超えたらメモリではなくファイルを使用する
-    # boundary:: このパートの終わりを表す文字列の配列
-    def initialize(src, opt={}, boundary=[])
-      src = src.is_a?(String) ? StringIO.new(src) : src
-      @dio = DelimIO.new(src, boundary, opt[:keep_raw], opt[:use_file])
+    def initialize(src, opt={})
+      if src.is_a? String
+        @src = MmapScanner.new src
+      elsif src.is_a? File and src.stat.ftype == 'file'
+        @src = MmapScanner.new src
+      elsif src.is_a? StringIO
+        @src = MmapScanner.new src.string
+      elsif src.is_a? MmapScanner
+        @src = src
+      else
+        tmpf = Tempfile.new 'mailparser'
+        buf = ''
+        while src.read(4096, buf)
+          tmpf.write buf
+        end
+        tmpf.close
+        @src = File.open(tmpf.path){|f| MmapScanner.new f}
+        File.unlink tmpf.path
+      end
       @opt = opt
-      @boundary = boundary
       @from = @to = @cc = @subject = nil
       @type = @subtype = @charset = @content_transfer_encoding = @filename = nil
-      @rawheader = ''
-      @message = nil
-      @body = @body_preconv = DataBuffer.new(opt[:use_file])
+      @rawheader = nil
+      @rawbody = nil
       @part = []
       opt[:charset_converter] ||= ConvCharset.method(:conv_charset)
       read_header
-      read_body
       read_part
     end
-    attr_reader :header, :part, :message
+    attr_reader :header, :part
+    # charset 変換後の本文を返す
     def body
-      @body.str
+      body = body_preconv
+      if type == 'text' and charset and @opt[:output_charset]
+        begin
+          body = @opt[:charset_converter].call(charset, @opt[:output_charset], body)
+        rescue
+          # ignore
+        end
+      end
+      body
     end
+    # charset 変換前の本文を返す
     def body_preconv
-      @body_preconv.str
+      return '' if type == 'multipart' or type == 'message'
+      body = @rawbody.to_s
+      ret = case content_transfer_encoding
+            when "quoted-printable" then RFC2045.qp_decode(body)
+            when "base64" then RFC2045.b64_decode(body)
+            when "uuencode", "x-uuencode", "x-uue" then decode_uuencode(body)
+            else body
+            end
+      if type == 'text' and charset
+        ret.force_encoding(charset) rescue nil
+      end
+      ret
+    end
+    # Content-Type が message の時 Message を返す。そうでなければ nil を返す。
+    def message
+      unless @opt[:extract_message_type] and type == "message"
+        return nil
+      end
+      if ['7bit', '8bit'].include? content_transfer_encoding
+        @rawbody.pos = 0
+        return Message.new(@rawbody, @opt)
+      end
+      return Message.new(body_preconv, @opt)
     end
     # From ヘッダがあれば Mailbox を返す。
@@ -324,95 +367,47 @@ module MailParser
     # 生メッセージを返す
     def raw
-      @dio.keep_buffer.str
+      return @src.to_s
     end
     # 生ヘッダを返す
     def rawheader
-      @rawheader
+      @rawheader.to_s
     end
     private
     # ヘッダ部をパースする
-    # return:: true: 継続行あり
     def read_header()
+      @rawheader = @src.scan_until(/^(?=\r?\n)|\z/)
       @header = Header.new(@opt)
-      headers = []
-      @dio.each_line do |line|
-        break if line.chomp.empty?
-        cont = line =~ /^[ \t]/
-        if (cont and headers.empty?) or (!cont and !line.include? ":") then
-          @dio.ungets
-          break
-        end
-        if line =~ /^[ \t]/ then
-          headers[-1] += line    # :keep_raw 時の行破壊を防ぐため`<<'は使わない
+      until @rawheader.eos?
+        if @rawheader.skip(/(.*?)[ \t]*:[ \t]*(.*(\r?\n[ \t].*)*(\r?\n)?)/)
+          name = @rawheader.matched(1).to_s
+          body = @rawheader.matched(2).to_s
+          @header.add(name, body)
         else
-          headers << line
+          @rawheader.skip(/.*\n/) or break
         end
-        @rawheader << line
-      end
-      headers.each do |h|
-        name, body = h.split(/\s*:\s*/n, 2)
-        @header.add(name, body)
-      end
-    end
-    # 本文を読む
-    def read_body()
-      return if type == "multipart" or @dio.eof?
-      unless @opt[:extract_message_type] and type == "message" then
-        if @opt[:skip_body] or (@opt[:text_body_only] and type != "text")
-          @dio.each_line{}         # 本文skip
-          return
-        end
-      end
-      body = ''
-      @dio.each_line do |line|
-        body << line
-      end
-      body.chomp! unless @dio.real_eof?
-      case content_transfer_encoding
-      when "quoted-printable" then @body << RFC2045.qp_decode(body)
-      when "base64" then @body << RFC2045.b64_decode(body)
-      when "uuencode", "x-uuencode", "x-uue" then @body << decode_uuencode(body)
-      else @body << body
-      end
-      @body_preconv = @body
-      if type == 'text' and charset and @opt[:output_charset] then
-        new_body = DataBuffer.new(@opt[:use_file])
-        begin
-          if @opt[:use_file] and @body.size > @opt[:use_file]
-            newline = @opt[:charset_converter].call(@opt[:output_charset], charset, "\n")
-            @body.io.each_line(newline) do |line|
-              new_body << @opt[:charset_converter].call(charset, @opt[:output_charset], line)
-            end
-          else
-            new_body << @opt[:charset_converter].call(charset, @opt[:output_charset], @body.str)
-          end
-          @body = new_body
-        rescue
-          # ignore
-        end
-      end
-      if @opt[:extract_message_type] and type == "message" and not @body.empty? then
-        @message = Message.new(@body.io, @opt)
       end
+      @src.scan(/\r?\n/)        # 空行スキップ
+      @rawbody = @src.rest
     end
     # 各パートの Message オブジェクトの配列を作成
     def read_part()
-      return if type != "multipart" or @dio.eof?
+      return if type != "multipart" or @src.eos?
       b = @header["content-type"][0].params["boundary"]
-      bd = ["--#{b}--", "--#{b}"]
-      last_line = @dio.each_line(bd){}        # skip preamble
-      while last_line and last_line.chomp == bd.last
-        m = Message.new @dio, @opt, @boundary+bd
-        @part << m
-        last_line = @dio.gets                 # read boundary
+      re = /(?:\A|\r?\n)--#{Regexp.escape b}(?:|(--))(?:\r?\n|\z)/
+      @src.scan_until(re) or return  # skip preamble
+      until @src.eos?
+        unless p = @src.scan_until(re)
+          @part.push Message.new(@src.rest, @opt)
+          break
+        end
+        @part.push Message.new(p.peek(p.size-@src.matched.length), @opt)
+        break if @src.matched(1)
       end
-      @dio.each_line{}                        # skip epilogue
     end
     # uuencode のデコード
@@ -434,125 +429,4 @@ module MailParser
     end
   end
-  # 特定の行を EOF とみなして gets が動く IO モドキ
-  class DelimIO
-    # src:: IO または StringIO
-    # delim:: 区切り行の配列
-    # keep:: 全行保存
-    # use_file:: keep_buffer がこのサイズを超えたらメモリではなくファイルを使用する
-    def initialize(src, delim=nil, keep=false, use_file=nil)
-      @src = src
-      @delim_re = delim && !delim.empty? && Regexp.new(delim.map{|d|"\\A#{Regexp.quote(d)}\\r?\\Z"}.join("|"))
-      @keep = keep
-      @keep_buffer = DataBuffer.new(use_file)
-      @line_buffer = nil
-      @eof = false                # delim に達したら真
-      @real_eof = false
-      @last_read_line = nil
-    end
-    attr_reader :keep_buffer
-    # 行毎にブロックを繰り返す。
-    # delim に一致した場合は中断
-    # delim:: 区切り文字列の配列
-    # return:: delimに一致した行 or nil(EOFに達した)
-    def each_line(delim=nil)
-      return if @eof
-      while line = gets
-        return line if delim and delim.include? line.chomp
-        yield line
-      end
-      nil
-    end
-    alias each each_line
-    # 1行読み込む。@delim_re に一致する行で EOF
-    def gets
-      return if @eof
-      if @line_buffer
-        line = @line_buffer
-        @line_buffer = nil
-      else
-        line = @src.gets
-        unless line  # EOF
-          @keep_buffer << @last_read_line if @keep and @last_read_line
-          @eof = @real_eof = true
-          return
-        end
-      end
-      if @delim_re and @delim_re.match line
-        @keep_buffer << @last_read_line if @keep and @last_read_line
-        @src.ungets
-        @eof = true
-        return
-      end
-      @keep_buffer << @last_read_line if @keep and @last_read_line
-      @last_read_line = line
-      line
-    end
-    def ungets
-      raise "preread line nothing" unless @last_read_line
-      @eof = false
-      @line_buffer = @last_read_line
-      @last_read_line = nil
-    end
-    def eof?
-      @eof
-    end
-    def real_eof?
-      @src.is_a?(DelimIO) ? @src.real_eof? : @real_eof
-    end
-  end
-  # 通常はメモリにデータを保持し、それ以上はファイル(Tempfile)に保持するためのクラス
-  class DataBuffer
-    # limit:: データがこのバイト数を超えたらファイルに保持する。nil の場合は無制限。
-    def initialize(limit)
-      @limit = limit
-      @buffer = StringIO.new
-    end
-    # バッファに文字列を追加する
-    def <<(str)
-      if @limit and @buffer.is_a? StringIO and @buffer.size+str.size > @limit
-        file = Tempfile.new 'mailparser_databuffer'
-        file.unlink rescue nil
-        file.write @buffer.string
-        @buffer = file
-      end
-      @buffer << str
-    end
-    # バッファ内のデータを返す
-    def str
-      if @buffer.is_a? StringIO
-        @buffer.string
-      else
-        @buffer.rewind
-        @buffer.read
-      end
-    end
-    # IOオブジェクト(のようなもの)を返す
-    def io
-      @buffer.rewind
-      @buffer
-    end
-    # データの大きさを返す
-    def size
-      @buffer.pos
-    end
-    # バッファが空かどうかを返す
-    def empty?
-      @buffer.pos == 0
-    end
-  end
 end

data/test/test_loose.rb CHANGED

@@ -1,9 +1,10 @@
-# -*- coding: utf-8 -*-
+# coding: ascii-8bit
 # Copyright (C) 2007-2010 TOMITA Masahiro
 # mailto:tommy@tmtm.org
 require "mailparser/loose"
 require "test/unit"
+require "timeout"
 class TC_Loose < Test::Unit::TestCase
   include MailParser::Loose
@@ -45,10 +46,10 @@ class TC_Loose < Test::Unit::TestCase
   end
   def test_parse_phrase_list_mime_charset_converter
-    p = parse_phrase_list("abc =?us-ascii?q?def?=, ghi jkl", :decode_mime_header=>true, :output_charset=>"utf-8", :charset_converter=>proc{"12345"})
+    p = parse_phrase_list("abc =?us-ascii?q?def?=, ghi jkl", :decode_mime_header=>true, :output_charset=>"utf-8", :charset_converter=>proc{|_,_,s| s.upcase})
     assert_equal(2, p.size)
-    assert_equal("abc 12345", p[0])
-    assert_equal("ghi jkl", p[1])
+    assert_equal("ABC DEF", p[0])
+    assert_equal("GHI JKL", p[1])
   end
   def test_parse_received()
@@ -233,12 +234,20 @@ class TC_Loose < Test::Unit::TestCase
   def test_parse_other_header_decode
     s = parse("subject", "=?euc-jp?q?=A4=A2=A4=A4?=", :decode_mime_header=>true)
-    assert_equal "\xa4\xa2\xa4\xa4", s
+    if String.method_defined? :encode
+      assert_equal 'あい'.encode('euc-jp', 'utf-8'), s
+    else
+      assert_equal "\xa4\xa2\xa4\xa4", s
+    end
   end
   def test_parse_other_header_decode_charset
     s = parse("subject", "=?euc-jp?q?=A4=A2=A4=A4?=", :decode_mime_header=>true, :output_charset=>"utf-8")
-    assert_equal "あい", s
+    if String.method_defined? :force_encoding
+      assert_equal "あい".force_encoding('utf-8'), s
+    else
+      assert_equal "あい", s
+    end
   end
   def test_parse_other_header_decode_charset_converter
@@ -270,9 +279,9 @@ class TC_Loose < Test::Unit::TestCase
   end
   def test_mailbox_charset_converter
-    ml = mailbox_list("hoge =?us-ascii?q?hoge?= <hoge.hoge@example.com>", {:decode_mime_header=>true, :output_charset=>"us-ascii", :charset_converter=>proc{"fuga"}})
+    ml = mailbox_list("hoge =?us-ascii?q?hoge?= <hoge.hoge@example.com>", {:decode_mime_header=>true, :output_charset=>"us-ascii", :charset_converter=>proc{|_,_,s| s.upcase}})
     assert_equal(1, ml.size)
-    assert_equal("hoge fuga", ml[0].phrase)
+    assert_equal("HOGE HOGE", ml[0].phrase)
     assert_equal("hoge.hoge", ml[0].addr_spec.local_part)
     assert_equal("example.com", ml[0].addr_spec.domain)
   end