mailparser 0.4.22a → 0.5.0.beta1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,4 +1,4 @@
1
- # -*- coding: utf-8 -*-
1
+ # coding: ascii-8bit
2
2
  # Copyright (C) 2006-2010 TOMITA Masahiro
3
3
  # mailto:tommy@tmtm.org
4
4
 
@@ -25,40 +25,40 @@ class MailParser::RFC2822::Scanner
25
25
  def scan()
26
26
  until @ss.eos?
27
27
  case
28
- when s = @ss.scan(/\s*\(/nmo)
28
+ when s = @ss.scan(/\s*\(/)
29
29
  @token << cfws(@ss)
30
30
  @token_idx[@token.last.object_id] = @token.size-1
31
- when s = @ss.scan(/\s+/nmo)
31
+ when s = @ss.scan(/\s+/)
32
32
  @token << s
33
33
  @token_idx[s.object_id] = @token.size-1
34
- when s = @ss.scan(/\"(\\[#{TEXT_RE}]|[#{QTEXT_RE}])*\"/no)
34
+ when s = @ss.scan(/\"(\\[#{TEXT_RE}]|[#{QTEXT_RE}])*\"/o)
35
35
  @token << s
36
36
  @token_idx[s.object_id] = @token.size-1
37
- yield :NO_FOLD_QUOTE, s
38
- when s = @ss.scan(/\"(\s*(\\[#{TEXT_RE}]|[#{QTEXT_RE}]))*\s*\"/nmo)
37
+ yield [:NO_FOLD_QUOTE, s]
38
+ when s = @ss.scan(/\"(\s*(\\[#{TEXT_RE}]|[#{QTEXT_RE}]))*\s*\"/o)
39
39
  @token << s
40
40
  @token_idx[s.object_id] = @token.size-1
41
- yield :QUOTED_STRING, s
42
- when s = @ss.scan(/\[(\\[#{TEXT_RE}]|[#{DTEXT_RE}])*\]/no)
41
+ yield [:QUOTED_STRING, s]
42
+ when s = @ss.scan(/\[(\\[#{TEXT_RE}]|[#{DTEXT_RE}])*\]/o)
43
43
  @token << s
44
44
  @token_idx[s.object_id] = @token.size-1
45
- yield :NO_FOLD_LITERAL, s
46
- when s = @ss.scan(/\[(\s*(\\[#{TEXT_RE}]|[#{DTEXT_RE}]))*\s*\]/nmo)
45
+ yield [:NO_FOLD_LITERAL, s]
46
+ when s = @ss.scan(/\[(\s*(\\[#{TEXT_RE}]|[#{DTEXT_RE}]))*\s*\]/o)
47
47
  @token << s
48
48
  @token_idx[s.object_id] = @token.size-1
49
- yield :DOMAIN_LITERAL, s
50
- when s = @ss.scan(/[#{ATEXT_RE}]+/no)
49
+ yield [:DOMAIN_LITERAL, s]
50
+ when s = @ss.scan(/[#{ATEXT_RE}]+/o)
51
51
  @token << s
52
52
  @token_idx[s.object_id] = @token.size-1
53
53
  if s =~ /\A\d+\z/ then
54
- yield :DIGIT, s
54
+ yield [:DIGIT, s]
55
55
  else
56
- yield :ATOM, s
56
+ yield [:ATOM, s]
57
57
  end
58
- when s = @ss.scan(/./no)
58
+ when s = @ss.scan(/./)
59
59
  @token << s
60
60
  @token_idx[s.object_id] = @token.size-1
61
- yield s, s
61
+ yield [s, s]
62
62
  end
63
63
  end
64
64
  yield nil
@@ -73,9 +73,9 @@ class MailParser::RFC2822::Scanner
73
73
  comments = []
74
74
  while true
75
75
  c = cfws_sub(ss)
76
- ss.skip(/\s+/nmo)
76
+ ss.skip(/\s+/)
77
77
  comments << "(#{c})"
78
- break unless @ss.scan(/\(/no)
78
+ break unless @ss.scan(/\(/)
79
79
  end
80
80
  @comments.concat comments
81
81
  return comments.join
@@ -86,12 +86,12 @@ class MailParser::RFC2822::Scanner
86
86
  def cfws_sub(ss)
87
87
  ret = ""
88
88
  until ss.eos? do
89
- if ss.scan(/(\s*(\\[#{TEXT_RE}]|[#{CTEXT_RE}]))*\s*/nmo) then
89
+ if ss.scan(/(\s*(\\[#{TEXT_RE}]|[#{CTEXT_RE}]))*\s*/o) then
90
90
  ret << ss.matched
91
91
  end
92
- if ss.scan(/\)/no) then # 「)」が来たら復帰
92
+ if ss.scan(/\)/) then # 「)」が来たら復帰
93
93
  return ret
94
- elsif ss.scan(/\(/no) then # 「(」が来たら再帰
94
+ elsif ss.scan(/\(/) then # 「(」が来たら再帰
95
95
  c = cfws_sub(ss)
96
96
  break if c.nil?
97
97
  ret << "(" << c << ")"
@@ -1,3 +1,4 @@
1
+ # coding: ascii-8bit
1
2
  # Copyright (C) 2006-2010 TOMITA Masahiro
2
3
  # mailto:tommy@tmtm.org
3
4
 
data/lib/mailparser.rb CHANGED
@@ -1,5 +1,5 @@
1
- # -*- coding: utf-8 -*-
2
- # Copyright (C) 2006-2010 TOMITA Masahiro
1
+ # coding: ascii-8bit
2
+ # Copyright (C) 2006-2011 TOMITA Masahiro
3
3
  # mailto:tommy@tmtm.org
4
4
 
5
5
  require "mailparser/error"
@@ -10,6 +10,7 @@ require "mailparser/rfc2231"
10
10
  require "mailparser/rfc2822"
11
11
  require "mailparser/loose"
12
12
  require "mailparser/conv_charset"
13
+ require "mmapscanner"
13
14
 
14
15
  require "stringio"
15
16
  require "tempfile"
@@ -164,45 +165,87 @@ module MailParser
164
165
  # メール全体またはひとつのパートを表すクラス
165
166
  class Message
166
167
  # src からヘッダ部を読み込み Header オブジェクトに保持する
167
- # src:: gets メソッドを持つオブジェクト(ex. IO, StringIO)
168
+ # src:: String / File / MmapScanner / read メソッドを持つオブジェクト
168
169
  # opt:: オプション(Hash)
169
- # :skip_body:: 本文をスキップする
170
- # :text_body_only:: text/* type 以外の本文をスキップする
171
170
  # :extract_message_type:: message/* type を展開する
172
171
  # :decode_mime_header:: MIMEヘッダをデコードする
173
172
  # :decode_mime_filename:: ファイル名を MIME デコードする
174
173
  # :output_charset:: デコード出力文字コード(デフォルト: 変換しない)
175
174
  # :strict:: RFC違反時に ParseError 例外を発生する
176
- # :keep_raw:: 生メッセージを保持する
177
175
  # :charset_converter:: 文字コード変換用 Proc または Method
178
- # :use_file:: body, raw がこのサイズを超えたらメモリではなくファイルを使用する
179
- # boundary:: このパートの終わりを表す文字列の配列
180
- def initialize(src, opt={}, boundary=[])
181
- src = src.is_a?(String) ? StringIO.new(src) : src
182
- @dio = DelimIO.new(src, boundary, opt[:keep_raw], opt[:use_file])
176
+ def initialize(src, opt={})
177
+ if src.is_a? String
178
+ @src = MmapScanner.new src
179
+ elsif src.is_a? File and src.stat.ftype == 'file'
180
+ @src = MmapScanner.new src
181
+ elsif src.is_a? StringIO
182
+ @src = MmapScanner.new src.string
183
+ elsif src.is_a? MmapScanner
184
+ @src = src
185
+ else
186
+ tmpf = Tempfile.new 'mailparser'
187
+ buf = ''
188
+ while src.read(4096, buf)
189
+ tmpf.write buf
190
+ end
191
+ tmpf.close
192
+ @src = File.open(tmpf.path){|f| MmapScanner.new f}
193
+ File.unlink tmpf.path
194
+ end
195
+
183
196
  @opt = opt
184
- @boundary = boundary
185
197
  @from = @to = @cc = @subject = nil
186
198
  @type = @subtype = @charset = @content_transfer_encoding = @filename = nil
187
- @rawheader = ''
188
- @message = nil
189
- @body = @body_preconv = DataBuffer.new(opt[:use_file])
199
+ @rawheader = nil
200
+ @rawbody = nil
190
201
  @part = []
191
202
  opt[:charset_converter] ||= ConvCharset.method(:conv_charset)
192
203
 
193
204
  read_header
194
- read_body
195
205
  read_part
196
206
  end
197
207
 
198
- attr_reader :header, :part, :message
208
+ attr_reader :header, :part
199
209
 
210
+ # charset 変換後の本文を返す
200
211
  def body
201
- @body.str
212
+ body = body_preconv
213
+ if type == 'text' and charset and @opt[:output_charset]
214
+ begin
215
+ body = @opt[:charset_converter].call(charset, @opt[:output_charset], body)
216
+ rescue
217
+ # ignore
218
+ end
219
+ end
220
+ body
202
221
  end
203
222
 
223
+ # charset 変換前の本文を返す
204
224
  def body_preconv
205
- @body_preconv.str
225
+ return '' if type == 'multipart' or type == 'message'
226
+ body = @rawbody.to_s
227
+ ret = case content_transfer_encoding
228
+ when "quoted-printable" then RFC2045.qp_decode(body)
229
+ when "base64" then RFC2045.b64_decode(body)
230
+ when "uuencode", "x-uuencode", "x-uue" then decode_uuencode(body)
231
+ else body
232
+ end
233
+ if type == 'text' and charset
234
+ ret.force_encoding(charset) rescue nil
235
+ end
236
+ ret
237
+ end
238
+
239
+ # Content-Type が message の時 Message を返す。そうでなければ nil を返す。
240
+ def message
241
+ unless @opt[:extract_message_type] and type == "message"
242
+ return nil
243
+ end
244
+ if ['7bit', '8bit'].include? content_transfer_encoding
245
+ @rawbody.pos = 0
246
+ return Message.new(@rawbody, @opt)
247
+ end
248
+ return Message.new(body_preconv, @opt)
206
249
  end
207
250
 
208
251
  # From ヘッダがあれば Mailbox を返す。
@@ -324,95 +367,47 @@ module MailParser
324
367
 
325
368
  # 生メッセージを返す
326
369
  def raw
327
- @dio.keep_buffer.str
370
+ return @src.to_s
328
371
  end
329
372
 
330
373
  # 生ヘッダを返す
331
374
  def rawheader
332
- @rawheader
375
+ @rawheader.to_s
333
376
  end
334
377
 
335
378
  private
336
379
 
337
380
  # ヘッダ部をパースする
338
- # return:: true: 継続行あり
339
381
  def read_header()
382
+ @rawheader = @src.scan_until(/^(?=\r?\n)|\z/)
340
383
  @header = Header.new(@opt)
341
- headers = []
342
- @dio.each_line do |line|
343
- break if line.chomp.empty?
344
- cont = line =~ /^[ \t]/
345
- if (cont and headers.empty?) or (!cont and !line.include? ":") then
346
- @dio.ungets
347
- break
348
- end
349
- if line =~ /^[ \t]/ then
350
- headers[-1] += line # :keep_raw 時の行破壊を防ぐため`<<'は使わない
384
+ until @rawheader.eos?
385
+ if @rawheader.skip(/(.*?)[ \t]*:[ \t]*(.*(\r?\n[ \t].*)*(\r?\n)?)/)
386
+ name = @rawheader.matched(1).to_s
387
+ body = @rawheader.matched(2).to_s
388
+ @header.add(name, body)
351
389
  else
352
- headers << line
390
+ @rawheader.skip(/.*\n/) or break
353
391
  end
354
- @rawheader << line
355
- end
356
- headers.each do |h|
357
- name, body = h.split(/\s*:\s*/n, 2)
358
- @header.add(name, body)
359
- end
360
- end
361
-
362
- # 本文を読む
363
- def read_body()
364
- return if type == "multipart" or @dio.eof?
365
- unless @opt[:extract_message_type] and type == "message" then
366
- if @opt[:skip_body] or (@opt[:text_body_only] and type != "text")
367
- @dio.each_line{} # 本文skip
368
- return
369
- end
370
- end
371
- body = ''
372
- @dio.each_line do |line|
373
- body << line
374
- end
375
- body.chomp! unless @dio.real_eof?
376
- case content_transfer_encoding
377
- when "quoted-printable" then @body << RFC2045.qp_decode(body)
378
- when "base64" then @body << RFC2045.b64_decode(body)
379
- when "uuencode", "x-uuencode", "x-uue" then @body << decode_uuencode(body)
380
- else @body << body
381
- end
382
- @body_preconv = @body
383
- if type == 'text' and charset and @opt[:output_charset] then
384
- new_body = DataBuffer.new(@opt[:use_file])
385
- begin
386
- if @opt[:use_file] and @body.size > @opt[:use_file]
387
- newline = @opt[:charset_converter].call(@opt[:output_charset], charset, "\n")
388
- @body.io.each_line(newline) do |line|
389
- new_body << @opt[:charset_converter].call(charset, @opt[:output_charset], line)
390
- end
391
- else
392
- new_body << @opt[:charset_converter].call(charset, @opt[:output_charset], @body.str)
393
- end
394
- @body = new_body
395
- rescue
396
- # ignore
397
- end
398
- end
399
- if @opt[:extract_message_type] and type == "message" and not @body.empty? then
400
- @message = Message.new(@body.io, @opt)
401
392
  end
393
+ @src.scan(/\r?\n/) # 空行スキップ
394
+ @rawbody = @src.rest
402
395
  end
403
396
 
404
397
  # 各パートの Message オブジェクトの配列を作成
405
398
  def read_part()
406
- return if type != "multipart" or @dio.eof?
399
+ return if type != "multipart" or @src.eos?
407
400
  b = @header["content-type"][0].params["boundary"]
408
- bd = ["--#{b}--", "--#{b}"]
409
- last_line = @dio.each_line(bd){} # skip preamble
410
- while last_line and last_line.chomp == bd.last
411
- m = Message.new @dio, @opt, @boundary+bd
412
- @part << m
413
- last_line = @dio.gets # read boundary
401
+ re = /(?:\A|\r?\n)--#{Regexp.escape b}(?:|(--))(?:\r?\n|\z)/
402
+ @src.scan_until(re) or return # skip preamble
403
+ until @src.eos?
404
+ unless p = @src.scan_until(re)
405
+ @part.push Message.new(@src.rest, @opt)
406
+ break
407
+ end
408
+ @part.push Message.new(p.peek(p.size-@src.matched.length), @opt)
409
+ break if @src.matched(1)
414
410
  end
415
- @dio.each_line{} # skip epilogue
416
411
  end
417
412
 
418
413
  # uuencode のデコード
@@ -434,125 +429,4 @@ module MailParser
434
429
  end
435
430
 
436
431
  end
437
-
438
- # 特定の行を EOF とみなして gets が動く IO モドキ
439
- class DelimIO
440
- # src:: IO または StringIO
441
- # delim:: 区切り行の配列
442
- # keep:: 全行保存
443
- # use_file:: keep_buffer がこのサイズを超えたらメモリではなくファイルを使用する
444
- def initialize(src, delim=nil, keep=false, use_file=nil)
445
- @src = src
446
- @delim_re = delim && !delim.empty? && Regexp.new(delim.map{|d|"\\A#{Regexp.quote(d)}\\r?\\Z"}.join("|"))
447
- @keep = keep
448
- @keep_buffer = DataBuffer.new(use_file)
449
- @line_buffer = nil
450
- @eof = false # delim に達したら真
451
- @real_eof = false
452
- @last_read_line = nil
453
- end
454
-
455
- attr_reader :keep_buffer
456
-
457
- # 行毎にブロックを繰り返す。
458
- # delim に一致した場合は中断
459
- # delim:: 区切り文字列の配列
460
- # return:: delimに一致した行 or nil(EOFに達した)
461
- def each_line(delim=nil)
462
- return if @eof
463
- while line = gets
464
- return line if delim and delim.include? line.chomp
465
- yield line
466
- end
467
- nil
468
- end
469
- alias each each_line
470
-
471
- # 1行読み込む。@delim_re に一致する行で EOF
472
- def gets
473
- return if @eof
474
- if @line_buffer
475
- line = @line_buffer
476
- @line_buffer = nil
477
- else
478
- line = @src.gets
479
- unless line # EOF
480
- @keep_buffer << @last_read_line if @keep and @last_read_line
481
- @eof = @real_eof = true
482
- return
483
- end
484
- end
485
- if @delim_re and @delim_re.match line
486
- @keep_buffer << @last_read_line if @keep and @last_read_line
487
- @src.ungets
488
- @eof = true
489
- return
490
- end
491
- @keep_buffer << @last_read_line if @keep and @last_read_line
492
- @last_read_line = line
493
- line
494
- end
495
-
496
- def ungets
497
- raise "preread line nothing" unless @last_read_line
498
- @eof = false
499
- @line_buffer = @last_read_line
500
- @last_read_line = nil
501
- end
502
-
503
- def eof?
504
- @eof
505
- end
506
-
507
- def real_eof?
508
- @src.is_a?(DelimIO) ? @src.real_eof? : @real_eof
509
- end
510
-
511
- end
512
-
513
- # 通常はメモリにデータを保持し、それ以上はファイル(Tempfile)に保持するためのクラス
514
- class DataBuffer
515
- # limit:: データがこのバイト数を超えたらファイルに保持する。nil の場合は無制限。
516
- def initialize(limit)
517
- @limit = limit
518
- @buffer = StringIO.new
519
- end
520
-
521
- # バッファに文字列を追加する
522
- def <<(str)
523
- if @limit and @buffer.is_a? StringIO and @buffer.size+str.size > @limit
524
- file = Tempfile.new 'mailparser_databuffer'
525
- file.unlink rescue nil
526
- file.write @buffer.string
527
- @buffer = file
528
- end
529
- @buffer << str
530
- end
531
-
532
- # バッファ内のデータを返す
533
- def str
534
- if @buffer.is_a? StringIO
535
- @buffer.string
536
- else
537
- @buffer.rewind
538
- @buffer.read
539
- end
540
- end
541
-
542
- # IOオブジェクト(のようなもの)を返す
543
- def io
544
- @buffer.rewind
545
- @buffer
546
- end
547
-
548
- # データの大きさを返す
549
- def size
550
- @buffer.pos
551
- end
552
-
553
- # バッファが空かどうかを返す
554
- def empty?
555
- @buffer.pos == 0
556
- end
557
- end
558
432
  end
data/test/test_loose.rb CHANGED
@@ -1,9 +1,10 @@
1
- # -*- coding: utf-8 -*-
1
+ # coding: ascii-8bit
2
2
  # Copyright (C) 2007-2010 TOMITA Masahiro
3
3
  # mailto:tommy@tmtm.org
4
4
 
5
5
  require "mailparser/loose"
6
6
  require "test/unit"
7
+ require "timeout"
7
8
 
8
9
  class TC_Loose < Test::Unit::TestCase
9
10
  include MailParser::Loose
@@ -45,10 +46,10 @@ class TC_Loose < Test::Unit::TestCase
45
46
  end
46
47
 
47
48
  def test_parse_phrase_list_mime_charset_converter
48
- p = parse_phrase_list("abc =?us-ascii?q?def?=, ghi jkl", :decode_mime_header=>true, :output_charset=>"utf-8", :charset_converter=>proc{"12345"})
49
+ p = parse_phrase_list("abc =?us-ascii?q?def?=, ghi jkl", :decode_mime_header=>true, :output_charset=>"utf-8", :charset_converter=>proc{|_,_,s| s.upcase})
49
50
  assert_equal(2, p.size)
50
- assert_equal("abc 12345", p[0])
51
- assert_equal("ghi jkl", p[1])
51
+ assert_equal("ABC DEF", p[0])
52
+ assert_equal("GHI JKL", p[1])
52
53
  end
53
54
 
54
55
  def test_parse_received()
@@ -233,12 +234,20 @@ class TC_Loose < Test::Unit::TestCase
233
234
 
234
235
  def test_parse_other_header_decode
235
236
  s = parse("subject", "=?euc-jp?q?=A4=A2=A4=A4?=", :decode_mime_header=>true)
236
- assert_equal "\xa4\xa2\xa4\xa4", s
237
+ if String.method_defined? :encode
238
+ assert_equal 'あい'.encode('euc-jp', 'utf-8'), s
239
+ else
240
+ assert_equal "\xa4\xa2\xa4\xa4", s
241
+ end
237
242
  end
238
243
 
239
244
  def test_parse_other_header_decode_charset
240
245
  s = parse("subject", "=?euc-jp?q?=A4=A2=A4=A4?=", :decode_mime_header=>true, :output_charset=>"utf-8")
241
- assert_equal "あい", s
246
+ if String.method_defined? :force_encoding
247
+ assert_equal "あい".force_encoding('utf-8'), s
248
+ else
249
+ assert_equal "あい", s
250
+ end
242
251
  end
243
252
 
244
253
  def test_parse_other_header_decode_charset_converter
@@ -270,9 +279,9 @@ class TC_Loose < Test::Unit::TestCase
270
279
  end
271
280
 
272
281
  def test_mailbox_charset_converter
273
- ml = mailbox_list("hoge =?us-ascii?q?hoge?= <hoge.hoge@example.com>", {:decode_mime_header=>true, :output_charset=>"us-ascii", :charset_converter=>proc{"fuga"}})
282
+ ml = mailbox_list("hoge =?us-ascii?q?hoge?= <hoge.hoge@example.com>", {:decode_mime_header=>true, :output_charset=>"us-ascii", :charset_converter=>proc{|_,_,s| s.upcase}})
274
283
  assert_equal(1, ml.size)
275
- assert_equal("hoge fuga", ml[0].phrase)
284
+ assert_equal("HOGE HOGE", ml[0].phrase)
276
285
  assert_equal("hoge.hoge", ml[0].addr_spec.local_part)
277
286
  assert_equal("example.com", ml[0].addr_spec.domain)
278
287
  end