mailparser 0.4.22a → 0.5.0.beta1

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- # -*- coding: utf-8 -*-
1
+ # coding: ascii-8bit
2
2
  # Copyright (C) 2006-2010 TOMITA Masahiro
3
3
  # mailto:tommy@tmtm.org
4
4
 
@@ -25,40 +25,40 @@ class MailParser::RFC2822::Scanner
25
25
  def scan()
26
26
  until @ss.eos?
27
27
  case
28
- when s = @ss.scan(/\s*\(/nmo)
28
+ when s = @ss.scan(/\s*\(/)
29
29
  @token << cfws(@ss)
30
30
  @token_idx[@token.last.object_id] = @token.size-1
31
- when s = @ss.scan(/\s+/nmo)
31
+ when s = @ss.scan(/\s+/)
32
32
  @token << s
33
33
  @token_idx[s.object_id] = @token.size-1
34
- when s = @ss.scan(/\"(\\[#{TEXT_RE}]|[#{QTEXT_RE}])*\"/no)
34
+ when s = @ss.scan(/\"(\\[#{TEXT_RE}]|[#{QTEXT_RE}])*\"/o)
35
35
  @token << s
36
36
  @token_idx[s.object_id] = @token.size-1
37
- yield :NO_FOLD_QUOTE, s
38
- when s = @ss.scan(/\"(\s*(\\[#{TEXT_RE}]|[#{QTEXT_RE}]))*\s*\"/nmo)
37
+ yield [:NO_FOLD_QUOTE, s]
38
+ when s = @ss.scan(/\"(\s*(\\[#{TEXT_RE}]|[#{QTEXT_RE}]))*\s*\"/o)
39
39
  @token << s
40
40
  @token_idx[s.object_id] = @token.size-1
41
- yield :QUOTED_STRING, s
42
- when s = @ss.scan(/\[(\\[#{TEXT_RE}]|[#{DTEXT_RE}])*\]/no)
41
+ yield [:QUOTED_STRING, s]
42
+ when s = @ss.scan(/\[(\\[#{TEXT_RE}]|[#{DTEXT_RE}])*\]/o)
43
43
  @token << s
44
44
  @token_idx[s.object_id] = @token.size-1
45
- yield :NO_FOLD_LITERAL, s
46
- when s = @ss.scan(/\[(\s*(\\[#{TEXT_RE}]|[#{DTEXT_RE}]))*\s*\]/nmo)
45
+ yield [:NO_FOLD_LITERAL, s]
46
+ when s = @ss.scan(/\[(\s*(\\[#{TEXT_RE}]|[#{DTEXT_RE}]))*\s*\]/o)
47
47
  @token << s
48
48
  @token_idx[s.object_id] = @token.size-1
49
- yield :DOMAIN_LITERAL, s
50
- when s = @ss.scan(/[#{ATEXT_RE}]+/no)
49
+ yield [:DOMAIN_LITERAL, s]
50
+ when s = @ss.scan(/[#{ATEXT_RE}]+/o)
51
51
  @token << s
52
52
  @token_idx[s.object_id] = @token.size-1
53
53
  if s =~ /\A\d+\z/ then
54
- yield :DIGIT, s
54
+ yield [:DIGIT, s]
55
55
  else
56
- yield :ATOM, s
56
+ yield [:ATOM, s]
57
57
  end
58
- when s = @ss.scan(/./no)
58
+ when s = @ss.scan(/./)
59
59
  @token << s
60
60
  @token_idx[s.object_id] = @token.size-1
61
- yield s, s
61
+ yield [s, s]
62
62
  end
63
63
  end
64
64
  yield nil
@@ -73,9 +73,9 @@ class MailParser::RFC2822::Scanner
73
73
  comments = []
74
74
  while true
75
75
  c = cfws_sub(ss)
76
- ss.skip(/\s+/nmo)
76
+ ss.skip(/\s+/)
77
77
  comments << "(#{c})"
78
- break unless @ss.scan(/\(/no)
78
+ break unless @ss.scan(/\(/)
79
79
  end
80
80
  @comments.concat comments
81
81
  return comments.join
@@ -86,12 +86,12 @@ class MailParser::RFC2822::Scanner
86
86
  def cfws_sub(ss)
87
87
  ret = ""
88
88
  until ss.eos? do
89
- if ss.scan(/(\s*(\\[#{TEXT_RE}]|[#{CTEXT_RE}]))*\s*/nmo) then
89
+ if ss.scan(/(\s*(\\[#{TEXT_RE}]|[#{CTEXT_RE}]))*\s*/o) then
90
90
  ret << ss.matched
91
91
  end
92
- if ss.scan(/\)/no) then # 「)」が来たら復帰
92
+ if ss.scan(/\)/) then # 「)」が来たら復帰
93
93
  return ret
94
- elsif ss.scan(/\(/no) then # 「(」が来たら再帰
94
+ elsif ss.scan(/\(/) then # 「(」が来たら再帰
95
95
  c = cfws_sub(ss)
96
96
  break if c.nil?
97
97
  ret << "(" << c << ")"
@@ -1,3 +1,4 @@
1
+ # coding: ascii-8bit
1
2
  # Copyright (C) 2006-2010 TOMITA Masahiro
2
3
  # mailto:tommy@tmtm.org
3
4
 
data/lib/mailparser.rb CHANGED
@@ -1,5 +1,5 @@
1
- # -*- coding: utf-8 -*-
2
- # Copyright (C) 2006-2010 TOMITA Masahiro
1
+ # coding: ascii-8bit
2
+ # Copyright (C) 2006-2011 TOMITA Masahiro
3
3
  # mailto:tommy@tmtm.org
4
4
 
5
5
  require "mailparser/error"
@@ -10,6 +10,7 @@ require "mailparser/rfc2231"
10
10
  require "mailparser/rfc2822"
11
11
  require "mailparser/loose"
12
12
  require "mailparser/conv_charset"
13
+ require "mmapscanner"
13
14
 
14
15
  require "stringio"
15
16
  require "tempfile"
@@ -164,45 +165,87 @@ module MailParser
164
165
  # メール全体またはひとつのパートを表すクラス
165
166
  class Message
166
167
  # src からヘッダ部を読み込み Header オブジェクトに保持する
167
- # src:: gets メソッドを持つオブジェクト(ex. IO, StringIO)
168
+ # src:: String / File / MmapScanner / read メソッドを持つオブジェクト
168
169
  # opt:: オプション(Hash)
169
- # :skip_body:: 本文をスキップする
170
- # :text_body_only:: text/* type 以外の本文をスキップする
171
170
  # :extract_message_type:: message/* type を展開する
172
171
  # :decode_mime_header:: MIMEヘッダをデコードする
173
172
  # :decode_mime_filename:: ファイル名を MIME デコードする
174
173
  # :output_charset:: デコード出力文字コード(デフォルト: 変換しない)
175
174
  # :strict:: RFC違反時に ParseError 例外を発生する
176
- # :keep_raw:: 生メッセージを保持する
177
175
  # :charset_converter:: 文字コード変換用 Proc または Method
178
- # :use_file:: body, raw がこのサイズを超えたらメモリではなくファイルを使用する
179
- # boundary:: このパートの終わりを表す文字列の配列
180
- def initialize(src, opt={}, boundary=[])
181
- src = src.is_a?(String) ? StringIO.new(src) : src
182
- @dio = DelimIO.new(src, boundary, opt[:keep_raw], opt[:use_file])
176
+ def initialize(src, opt={})
177
+ if src.is_a? String
178
+ @src = MmapScanner.new src
179
+ elsif src.is_a? File and src.stat.ftype == 'file'
180
+ @src = MmapScanner.new src
181
+ elsif src.is_a? StringIO
182
+ @src = MmapScanner.new src.string
183
+ elsif src.is_a? MmapScanner
184
+ @src = src
185
+ else
186
+ tmpf = Tempfile.new 'mailparser'
187
+ buf = ''
188
+ while src.read(4096, buf)
189
+ tmpf.write buf
190
+ end
191
+ tmpf.close
192
+ @src = File.open(tmpf.path){|f| MmapScanner.new f}
193
+ File.unlink tmpf.path
194
+ end
195
+
183
196
  @opt = opt
184
- @boundary = boundary
185
197
  @from = @to = @cc = @subject = nil
186
198
  @type = @subtype = @charset = @content_transfer_encoding = @filename = nil
187
- @rawheader = ''
188
- @message = nil
189
- @body = @body_preconv = DataBuffer.new(opt[:use_file])
199
+ @rawheader = nil
200
+ @rawbody = nil
190
201
  @part = []
191
202
  opt[:charset_converter] ||= ConvCharset.method(:conv_charset)
192
203
 
193
204
  read_header
194
- read_body
195
205
  read_part
196
206
  end
197
207
 
198
- attr_reader :header, :part, :message
208
+ attr_reader :header, :part
199
209
 
210
+ # charset 変換後の本文を返す
200
211
  def body
201
- @body.str
212
+ body = body_preconv
213
+ if type == 'text' and charset and @opt[:output_charset]
214
+ begin
215
+ body = @opt[:charset_converter].call(charset, @opt[:output_charset], body)
216
+ rescue
217
+ # ignore
218
+ end
219
+ end
220
+ body
202
221
  end
203
222
 
223
+ # charset 変換前の本文を返す
204
224
  def body_preconv
205
- @body_preconv.str
225
+ return '' if type == 'multipart' or type == 'message'
226
+ body = @rawbody.to_s
227
+ ret = case content_transfer_encoding
228
+ when "quoted-printable" then RFC2045.qp_decode(body)
229
+ when "base64" then RFC2045.b64_decode(body)
230
+ when "uuencode", "x-uuencode", "x-uue" then decode_uuencode(body)
231
+ else body
232
+ end
233
+ if type == 'text' and charset
234
+ ret.force_encoding(charset) rescue nil
235
+ end
236
+ ret
237
+ end
238
+
239
+ # Content-Type が message の時 Message を返す。そうでなければ nil を返す。
240
+ def message
241
+ unless @opt[:extract_message_type] and type == "message"
242
+ return nil
243
+ end
244
+ if ['7bit', '8bit'].include? content_transfer_encoding
245
+ @rawbody.pos = 0
246
+ return Message.new(@rawbody, @opt)
247
+ end
248
+ return Message.new(body_preconv, @opt)
206
249
  end
207
250
 
208
251
  # From ヘッダがあれば Mailbox を返す。
@@ -324,95 +367,47 @@ module MailParser
324
367
 
325
368
  # 生メッセージを返す
326
369
  def raw
327
- @dio.keep_buffer.str
370
+ return @src.to_s
328
371
  end
329
372
 
330
373
  # 生ヘッダを返す
331
374
  def rawheader
332
- @rawheader
375
+ @rawheader.to_s
333
376
  end
334
377
 
335
378
  private
336
379
 
337
380
  # ヘッダ部をパースする
338
- # return:: true: 継続行あり
339
381
  def read_header()
382
+ @rawheader = @src.scan_until(/^(?=\r?\n)|\z/)
340
383
  @header = Header.new(@opt)
341
- headers = []
342
- @dio.each_line do |line|
343
- break if line.chomp.empty?
344
- cont = line =~ /^[ \t]/
345
- if (cont and headers.empty?) or (!cont and !line.include? ":") then
346
- @dio.ungets
347
- break
348
- end
349
- if line =~ /^[ \t]/ then
350
- headers[-1] += line # :keep_raw 時の行破壊を防ぐため`<<'は使わない
384
+ until @rawheader.eos?
385
+ if @rawheader.skip(/(.*?)[ \t]*:[ \t]*(.*(\r?\n[ \t].*)*(\r?\n)?)/)
386
+ name = @rawheader.matched(1).to_s
387
+ body = @rawheader.matched(2).to_s
388
+ @header.add(name, body)
351
389
  else
352
- headers << line
390
+ @rawheader.skip(/.*\n/) or break
353
391
  end
354
- @rawheader << line
355
- end
356
- headers.each do |h|
357
- name, body = h.split(/\s*:\s*/n, 2)
358
- @header.add(name, body)
359
- end
360
- end
361
-
362
- # 本文を読む
363
- def read_body()
364
- return if type == "multipart" or @dio.eof?
365
- unless @opt[:extract_message_type] and type == "message" then
366
- if @opt[:skip_body] or (@opt[:text_body_only] and type != "text")
367
- @dio.each_line{} # 本文skip
368
- return
369
- end
370
- end
371
- body = ''
372
- @dio.each_line do |line|
373
- body << line
374
- end
375
- body.chomp! unless @dio.real_eof?
376
- case content_transfer_encoding
377
- when "quoted-printable" then @body << RFC2045.qp_decode(body)
378
- when "base64" then @body << RFC2045.b64_decode(body)
379
- when "uuencode", "x-uuencode", "x-uue" then @body << decode_uuencode(body)
380
- else @body << body
381
- end
382
- @body_preconv = @body
383
- if type == 'text' and charset and @opt[:output_charset] then
384
- new_body = DataBuffer.new(@opt[:use_file])
385
- begin
386
- if @opt[:use_file] and @body.size > @opt[:use_file]
387
- newline = @opt[:charset_converter].call(@opt[:output_charset], charset, "\n")
388
- @body.io.each_line(newline) do |line|
389
- new_body << @opt[:charset_converter].call(charset, @opt[:output_charset], line)
390
- end
391
- else
392
- new_body << @opt[:charset_converter].call(charset, @opt[:output_charset], @body.str)
393
- end
394
- @body = new_body
395
- rescue
396
- # ignore
397
- end
398
- end
399
- if @opt[:extract_message_type] and type == "message" and not @body.empty? then
400
- @message = Message.new(@body.io, @opt)
401
392
  end
393
+ @src.scan(/\r?\n/) # 空行スキップ
394
+ @rawbody = @src.rest
402
395
  end
403
396
 
404
397
  # 各パートの Message オブジェクトの配列を作成
405
398
  def read_part()
406
- return if type != "multipart" or @dio.eof?
399
+ return if type != "multipart" or @src.eos?
407
400
  b = @header["content-type"][0].params["boundary"]
408
- bd = ["--#{b}--", "--#{b}"]
409
- last_line = @dio.each_line(bd){} # skip preamble
410
- while last_line and last_line.chomp == bd.last
411
- m = Message.new @dio, @opt, @boundary+bd
412
- @part << m
413
- last_line = @dio.gets # read boundary
401
+ re = /(?:\A|\r?\n)--#{Regexp.escape b}(?:|(--))(?:\r?\n|\z)/
402
+ @src.scan_until(re) or return # skip preamble
403
+ until @src.eos?
404
+ unless p = @src.scan_until(re)
405
+ @part.push Message.new(@src.rest, @opt)
406
+ break
407
+ end
408
+ @part.push Message.new(p.peek(p.size-@src.matched.length), @opt)
409
+ break if @src.matched(1)
414
410
  end
415
- @dio.each_line{} # skip epilogue
416
411
  end
417
412
 
418
413
  # uuencode のデコード
@@ -434,125 +429,4 @@ module MailParser
434
429
  end
435
430
 
436
431
  end
437
-
438
- # 特定の行を EOF とみなして gets が動く IO モドキ
439
- class DelimIO
440
- # src:: IO または StringIO
441
- # delim:: 区切り行の配列
442
- # keep:: 全行保存
443
- # use_file:: keep_buffer がこのサイズを超えたらメモリではなくファイルを使用する
444
- def initialize(src, delim=nil, keep=false, use_file=nil)
445
- @src = src
446
- @delim_re = delim && !delim.empty? && Regexp.new(delim.map{|d|"\\A#{Regexp.quote(d)}\\r?\\Z"}.join("|"))
447
- @keep = keep
448
- @keep_buffer = DataBuffer.new(use_file)
449
- @line_buffer = nil
450
- @eof = false # delim に達したら真
451
- @real_eof = false
452
- @last_read_line = nil
453
- end
454
-
455
- attr_reader :keep_buffer
456
-
457
- # 行毎にブロックを繰り返す。
458
- # delim に一致した場合は中断
459
- # delim:: 区切り文字列の配列
460
- # return:: delimに一致した行 or nil(EOFに達した)
461
- def each_line(delim=nil)
462
- return if @eof
463
- while line = gets
464
- return line if delim and delim.include? line.chomp
465
- yield line
466
- end
467
- nil
468
- end
469
- alias each each_line
470
-
471
- # 1行読み込む。@delim_re に一致する行で EOF
472
- def gets
473
- return if @eof
474
- if @line_buffer
475
- line = @line_buffer
476
- @line_buffer = nil
477
- else
478
- line = @src.gets
479
- unless line # EOF
480
- @keep_buffer << @last_read_line if @keep and @last_read_line
481
- @eof = @real_eof = true
482
- return
483
- end
484
- end
485
- if @delim_re and @delim_re.match line
486
- @keep_buffer << @last_read_line if @keep and @last_read_line
487
- @src.ungets
488
- @eof = true
489
- return
490
- end
491
- @keep_buffer << @last_read_line if @keep and @last_read_line
492
- @last_read_line = line
493
- line
494
- end
495
-
496
- def ungets
497
- raise "preread line nothing" unless @last_read_line
498
- @eof = false
499
- @line_buffer = @last_read_line
500
- @last_read_line = nil
501
- end
502
-
503
- def eof?
504
- @eof
505
- end
506
-
507
- def real_eof?
508
- @src.is_a?(DelimIO) ? @src.real_eof? : @real_eof
509
- end
510
-
511
- end
512
-
513
- # 通常はメモリにデータを保持し、それ以上はファイル(Tempfile)に保持するためのクラス
514
- class DataBuffer
515
- # limit:: データがこのバイト数を超えたらファイルに保持する。nil の場合は無制限。
516
- def initialize(limit)
517
- @limit = limit
518
- @buffer = StringIO.new
519
- end
520
-
521
- # バッファに文字列を追加する
522
- def <<(str)
523
- if @limit and @buffer.is_a? StringIO and @buffer.size+str.size > @limit
524
- file = Tempfile.new 'mailparser_databuffer'
525
- file.unlink rescue nil
526
- file.write @buffer.string
527
- @buffer = file
528
- end
529
- @buffer << str
530
- end
531
-
532
- # バッファ内のデータを返す
533
- def str
534
- if @buffer.is_a? StringIO
535
- @buffer.string
536
- else
537
- @buffer.rewind
538
- @buffer.read
539
- end
540
- end
541
-
542
- # IOオブジェクト(のようなもの)を返す
543
- def io
544
- @buffer.rewind
545
- @buffer
546
- end
547
-
548
- # データの大きさを返す
549
- def size
550
- @buffer.pos
551
- end
552
-
553
- # バッファが空かどうかを返す
554
- def empty?
555
- @buffer.pos == 0
556
- end
557
- end
558
432
  end
data/test/test_loose.rb CHANGED
@@ -1,9 +1,10 @@
1
- # -*- coding: utf-8 -*-
1
+ # coding: ascii-8bit
2
2
  # Copyright (C) 2007-2010 TOMITA Masahiro
3
3
  # mailto:tommy@tmtm.org
4
4
 
5
5
  require "mailparser/loose"
6
6
  require "test/unit"
7
+ require "timeout"
7
8
 
8
9
  class TC_Loose < Test::Unit::TestCase
9
10
  include MailParser::Loose
@@ -45,10 +46,10 @@ class TC_Loose < Test::Unit::TestCase
45
46
  end
46
47
 
47
48
  def test_parse_phrase_list_mime_charset_converter
48
- p = parse_phrase_list("abc =?us-ascii?q?def?=, ghi jkl", :decode_mime_header=>true, :output_charset=>"utf-8", :charset_converter=>proc{"12345"})
49
+ p = parse_phrase_list("abc =?us-ascii?q?def?=, ghi jkl", :decode_mime_header=>true, :output_charset=>"utf-8", :charset_converter=>proc{|_,_,s| s.upcase})
49
50
  assert_equal(2, p.size)
50
- assert_equal("abc 12345", p[0])
51
- assert_equal("ghi jkl", p[1])
51
+ assert_equal("ABC DEF", p[0])
52
+ assert_equal("GHI JKL", p[1])
52
53
  end
53
54
 
54
55
  def test_parse_received()
@@ -233,12 +234,20 @@ class TC_Loose < Test::Unit::TestCase
233
234
 
234
235
  def test_parse_other_header_decode
235
236
  s = parse("subject", "=?euc-jp?q?=A4=A2=A4=A4?=", :decode_mime_header=>true)
236
- assert_equal "\xa4\xa2\xa4\xa4", s
237
+ if String.method_defined? :encode
238
+ assert_equal 'あい'.encode('euc-jp', 'utf-8'), s
239
+ else
240
+ assert_equal "\xa4\xa2\xa4\xa4", s
241
+ end
237
242
  end
238
243
 
239
244
  def test_parse_other_header_decode_charset
240
245
  s = parse("subject", "=?euc-jp?q?=A4=A2=A4=A4?=", :decode_mime_header=>true, :output_charset=>"utf-8")
241
- assert_equal "あい", s
246
+ if String.method_defined? :force_encoding
247
+ assert_equal "あい".force_encoding('utf-8'), s
248
+ else
249
+ assert_equal "あい", s
250
+ end
242
251
  end
243
252
 
244
253
  def test_parse_other_header_decode_charset_converter
@@ -270,9 +279,9 @@ class TC_Loose < Test::Unit::TestCase
270
279
  end
271
280
 
272
281
  def test_mailbox_charset_converter
273
- ml = mailbox_list("hoge =?us-ascii?q?hoge?= <hoge.hoge@example.com>", {:decode_mime_header=>true, :output_charset=>"us-ascii", :charset_converter=>proc{"fuga"}})
282
+ ml = mailbox_list("hoge =?us-ascii?q?hoge?= <hoge.hoge@example.com>", {:decode_mime_header=>true, :output_charset=>"us-ascii", :charset_converter=>proc{|_,_,s| s.upcase}})
274
283
  assert_equal(1, ml.size)
275
- assert_equal("hoge fuga", ml[0].phrase)
284
+ assert_equal("HOGE HOGE", ml[0].phrase)
276
285
  assert_equal("hoge.hoge", ml[0].addr_spec.local_part)
277
286
  assert_equal("example.com", ml[0].addr_spec.domain)
278
287
  end