mailparser 0.4.22a

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,119 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (C) 2006-2010 TOMITA Masahiro
3
+ # mailto:tommy@tmtm.org
4
+
5
+ require "strscan"
6
+
7
# Lexical scanner for structured header field bodies.
#
# #scan walks the input string and yields one (type, value) pair per token.
# Whitespace runs and comments are never yielded, but every token --
# including whitespace and comments -- is stored in @token so that comments
# located between two yielded tokens can be retrieved afterwards with
# #get_comment / #get_comment_by_id.
class MailParser::RFC2822::Scanner
  # Character-class bodies (without the surrounding brackets) that are
  # interpolated into the regexps below.
  TEXT_RE = '\x00-\x7f'
  QTEXT_RE = '\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f'
  ATEXT_RE = 'A-Za-z0-9\!\#\$\%\&\'\*\+\\-\/\=\?\^\_\`\{\|\}\~'
  CTEXT_RE = '\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x27\x2a-\x5b\x5d-\x7f'
  # Not referenced inside this class; kept because other code may use it.
  UTEXT_RE = '\x00-\x7f'
  DTEXT_RE = '\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x5e-\x7e'

  # header_type:: stored as-is in @header_type (not used by this class itself)
  # str:: the string to tokenize
  def initialize(header_type, str)
    @header_type = header_type
    @comments = []    # every top-level comment seen so far, as "(...)" strings
    @token = []       # every token in input order (whitespace and comments included)
    @token_idx = {}   # token object_id => index into @token
    @ss = StringScanner.new(str)
  end

  attr_reader :comments

  # Tokenizes the whole input, yielding (type, value) for each token:
  #
  # * :NO_FOLD_QUOTE / :QUOTED_STRING -- a double-quoted string without /
  #   with embedded whitespace between its characters
  # * :NO_FOLD_LITERAL / :DOMAIN_LITERAL -- a bracketed literal without /
  #   with embedded whitespace between its characters
  # * :DIGIT -- a run of ATEXT_RE characters consisting only of digits
  # * :ATOM -- any other run of ATEXT_RE characters
  # * the character itself -- for any other single character
  #
  # Whitespace and comments are recorded but not yielded. After the last
  # token a single nil is yielded to signal the end of input.
  #
  # The order of the branches matters: the "no fold" patterns must be tried
  # before their more permissive counterparts.
  #
  # Raises MailParser::ParseError (via #cfws) on an unterminated comment.
  def scan
    until @ss.eos?
      case
      when @ss.scan(/\s*\(/nmo)
        record_token(cfws(@ss))
      when s = @ss.scan(/\s+/nmo)
        record_token(s)
      when s = @ss.scan(/\"(\\[#{TEXT_RE}]|[#{QTEXT_RE}])*\"/no)
        record_token(s)
        yield :NO_FOLD_QUOTE, s
      when s = @ss.scan(/\"(\s*(\\[#{TEXT_RE}]|[#{QTEXT_RE}]))*\s*\"/nmo)
        record_token(s)
        yield :QUOTED_STRING, s
      when s = @ss.scan(/\[(\\[#{TEXT_RE}]|[#{DTEXT_RE}])*\]/no)
        record_token(s)
        yield :NO_FOLD_LITERAL, s
      when s = @ss.scan(/\[(\s*(\\[#{TEXT_RE}]|[#{DTEXT_RE}]))*\s*\]/nmo)
        record_token(s)
        yield :DOMAIN_LITERAL, s
      when s = @ss.scan(/[#{ATEXT_RE}]+/no)
        record_token(s)
        if s =~ /\A\d+\z/
          yield :DIGIT, s
        else
          yield :ATOM, s
        end
      when s = @ss.scan(/./no)
        record_token(s)
        yield s, s
      end
    end
    yield nil
  end

  # Returns the part of the input that has not been scanned yet.
  def rest
    @ss.rest
  end

  # Scans from just after an opening "(" to the end of the comment section.
  # Consecutive comments separated only by whitespace are consumed together.
  #
  # Each top-level comment is appended to @comments as "(...)".
  # Returns the consumed comments joined into one string.
  # Raises MailParser::ParseError if a comment is not terminated.
  def cfws(ss)
    comments = []
    loop do
      c = cfws_sub(ss)
      ss.skip(/\s+/nmo)
      comments << "(#{c})"
      # Look for a following comment on the scanner we were given (the
      # original consulted @ss here, which only worked because every
      # visible caller passes @ss).
      break unless ss.scan(/\(/no)
    end
    @comments.concat comments
    comments.join
  end

  # Consumes the body of one comment; the opening "(" has already been read.
  # Nested comments are handled recursively and kept in the result.
  #
  # Returns the comment text without the enclosing parentheses.
  # Raises MailParser::ParseError if the closing ")" is missing or an
  # unexpected character is found.
  def cfws_sub(ss)
    ret = ""
    until ss.eos?
      text = ss.scan(/(\s*(\\[#{TEXT_RE}]|[#{CTEXT_RE}]))*\s*/nmo)
      ret << text if text
      # ")" closes this comment.
      return ret if ss.scan(/\)/no)
      # Anything other than a nested "(" at this point is a syntax error.
      raise MailParser::ParseError, ss.rest unless ss.scan(/\(/no)
      # "(" opens a nested comment: recurse. cfws_sub either returns a
      # String or raises, so no nil check is needed.
      nested = cfws_sub(ss)
      ret << "(" << nested << ")"
    end
    # Ran out of input without seeing ")".
    raise MailParser::ParseError, ss.rest
  end

  # Returns the comment strings among the tokens at positions s..e
  # (inclusive) of @token, each stripped of surrounding whitespace.
  def get_comment(s, e)
    # Anchor with \A, not ^: a folded quoted string may contain a line that
    # starts with "(", and must not be mistaken for a comment token.
    @token[s..e].select{|i| i =~ /\A\s*\(/}.map{|i| i.strip}
  end

  # Same as #get_comment, but the range is given by the object_id of the
  # first and last token. A nil s_id means "from the first token"; a nil
  # e_id means "up to the last token".
  def get_comment_by_id(s_id, e_id)
    s = s_id ? @token_idx[s_id] : 0
    e = e_id ? @token_idx[e_id] : -1
    get_comment(s, e)
  end

  private

  # Appends tok to @token and remembers its position under its object_id so
  # that #get_comment_by_id can find it later. Returns tok.
  def record_token(tok)
    @token << tok
    @token_idx[tok.object_id] = @token.size-1
    tok
  end

end
data/test.rb ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (C) 2007-2010 TOMITA Masahiro
2
+ # mailto:tommy@tmtm.org
3
+
4
+ require "mailparser"
5
+
6
# With file names on the command line, parse each file as a mail message and
# report parse errors instead of running the unit tests.
ARGV.each do |path|
  last_header = nil   # "name: raw value" of the header most recently visited
  begin
    File.open(path) do |io|
      puts path
      msg = MailParser::Message.new(io, :decode_mime_header=>true, :output_charset=>"UTF-8")
      msg.header.keys.each do |key|
        last_header = "#{key}: #{msg.header.raw(key)}"
        msg.header[key]   # access the parsed header; may raise ParseError
      end
      msg.body
    end
  rescue MailParser::ParseError => err
    puts path
    puts last_header
    p err
  end
end
exit unless ARGV.empty?

# No arguments: run every test found under the "test" directory.
require "test/unit"
Test::Unit::AutoRunner.run(true, "test")
@@ -0,0 +1,371 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (C) 2007-2010 TOMITA Masahiro
3
+ # mailto:tommy@tmtm.org
4
+
5
require "mailparser/loose"
require "test/unit"
require "timeout"
7
+
8
# Unit tests for the header parsing helpers mixed in from MailParser::Loose.
class TC_Loose < Test::Unit::TestCase
  include MailParser::Loose

  def setup()
  end
  def teardown()
  end

  # Runs the given block with ENV["TZ"] set to "GMT" and restores the
  # previous value afterwards, even if the block raises.
  def with_gmt
    saved_tz = ENV["TZ"]
    ENV["TZ"] = "GMT"
    yield
  ensure
    ENV["TZ"] = saved_tz
  end
  private :with_gmt

  # Asserts that date_time equals the current time, field by field, with a
  # "+0000" zone.
  # NOTE: the comparison goes down to the second, so it can fail if the
  # clock ticks between the parse call and this check.
  def assert_date_time_is_now(date_time)
    now = Time.now
    assert_equal(now.year, date_time.year)
    assert_equal(now.month, date_time.month)
    assert_equal(now.day, date_time.day)
    assert_equal(now.hour, date_time.hour)
    assert_equal(now.min, date_time.min)
    assert_equal(now.sec, date_time.sec)
    assert_equal("+0000", date_time.zone)
  end
  private :assert_date_time_is_now

  def test_parse_date()
    with_gmt do
      d = parse_date("Wed, 10 Jan 2007 12:53:55 +0900")
      assert_equal(2007, d.year)
      assert_equal(1, d.month)
      assert_equal(10, d.day)
      assert_equal(3, d.hour)
      assert_equal(53, d.min)
      assert_equal(55, d.sec)
      assert_equal("+0000", d.zone)
    end
  end

  def test_parse_phrase_list()
    phrases = parse_phrase_list("abc def, ghi jkl")
    assert_equal(2, phrases.size)
    assert_equal("abc def", phrases[0])
    assert_equal("ghi jkl", phrases[1])
  end

  def test_parse_phrase_list_mime()
    phrases = parse_phrase_list("abc =?us-ascii?q?def?=, ghi jkl", :decode_mime_header=>true)
    assert_equal(2, phrases.size)
    assert_equal("abc def", phrases[0])
    assert_equal("ghi jkl", phrases[1])
  end

  def test_parse_phrase_list_mime_charset_converter
    phrases = parse_phrase_list("abc =?us-ascii?q?def?=, ghi jkl", :decode_mime_header=>true, :output_charset=>"utf-8", :charset_converter=>proc{"12345"})
    assert_equal(2, phrases.size)
    assert_equal("abc 12345", phrases[0])
    assert_equal("ghi jkl", phrases[1])
  end

  def test_parse_received()
    with_gmt do
      r = parse_received("from host.example.com by my.server for <user@domain.name>; Wed, 10 Jan 2007 12:09:55 +0900")
      assert_equal(2007, r.date_time.year)
      assert_equal(1, r.date_time.month)
      assert_equal(10, r.date_time.day)
      assert_equal(3, r.date_time.hour)
      assert_equal(9, r.date_time.min)
      assert_equal(55, r.date_time.sec)
      assert_equal("+0000", r.date_time.zone)
      assert_equal("host.example.com", r.name_val["from"])
      assert_equal("my.server", r.name_val["by"])
      assert_equal("<user@domain.name>", r.name_val["for"])
    end
  end

  def test_parse_received_empty()
    with_gmt do
      r = parse_received("")
      assert_date_time_is_now(r.date_time)
      assert_equal({}, r.name_val)
    end
  end

  def test_parse_received_no_semicolon()
    with_gmt do
      r = parse_received("from host.example.com by my.server for <user@domain.name>")
      assert_date_time_is_now(r.date_time)
      assert_equal("host.example.com", r.name_val["from"])
      assert_equal("my.server", r.name_val["by"])
      assert_equal("<user@domain.name>", r.name_val["for"])
    end
  end

  def test_parse_received_only_semicolon()
    with_gmt do
      r = parse_received(";")
      assert_date_time_is_now(r.date_time)
      assert_equal({}, r.name_val)
    end
  end

  def test_parse_received_odd_param()
    with_gmt do
      r = parse_received("a b c;")
      assert_date_time_is_now(r.date_time)
      assert_equal({"a"=>"b", "c"=>nil}, r.name_val)
    end
  end

  def test_parse_content_type()
    ct = parse_content_type("text/plain; charset=iso-2022-jp")
    assert_equal("text", ct.type)
    assert_equal("plain", ct.subtype)
    assert_equal({"charset"=>"iso-2022-jp"}, ct.params)
  end

  def test_parse_content_type_miss()
    ct = parse_content_type("text")
    assert_equal("text", ct.type)
    assert_equal("plain", ct.subtype)
    assert_equal({}, ct.params)
  end

  def test_parse_content_type_name()
    ct = parse_content_type("text/plain; name=hoge.txt")
    assert_equal("text", ct.type)
    assert_equal("plain", ct.subtype)
    assert_equal({"name"=>"hoge.txt"}, ct.params)
  end

  def test_parse_content_type_empty
    ct = parse_content_type("")
    assert_equal("text", ct.type)
    assert_equal("plain", ct.subtype)
  end

  def test_parse_content_type_name_quoted()
    ct = parse_content_type("text/plain; name=\"hoge.txt\"")
    assert_equal("text", ct.type)
    assert_equal("plain", ct.subtype)
    assert_equal({"name"=>"hoge.txt"}, ct.params)
  end

  def test_parse_content_type_other
    ct = parse_content_type("other")
    assert_equal("other", ct.type)
    assert_equal("", ct.subtype)
  end

  def test_parse_content_transfer_encoding
    cte = parse_content_transfer_encoding("7BIT")
    assert_equal "7bit", cte.mechanism
  end

  def test_parse_content_transfer_encoding_empty
    cte = parse_content_transfer_encoding("")
    assert_equal "", cte.mechanism
  end

  def test_parse_mime_version
    assert_equal "1.0", parse_mime_version("1.0")
    assert_equal "1.0", parse_mime_version("1 . 0")
    assert_equal "1.0", parse_mime_version("1. 0")
  end

  def test_parse_mime_version_empty
    assert_equal "", parse_mime_version("")
  end

  def test_parse_content_disposition()
    c = parse_content_disposition("attachment; filename=hoge.txt")
    assert_equal("attachment", c.type)
    assert_equal({"filename"=>"hoge.txt"}, c.params)
  end

  def test_parse_content_disposition_quoted()
    c = parse_content_disposition("attachment; filename=\"hoge.txt\"")
    assert_equal("attachment", c.type)
    assert_equal({"filename"=>"hoge.txt"}, c.params)
  end

  def test_parse_content_disposition_empty
    c = parse_content_disposition("")
    assert_equal "", c.type
  end

  def test_parse_other_header
    s = parse("subject", "=?euc-jp?q?=A4=A2=A4=A4?=")
    assert_equal "=?euc-jp?q?=A4=A2=A4=A4?=", s
  end

  def test_parse_other_header_decode
    s = parse("subject", "=?euc-jp?q?=A4=A2=A4=A4?=", :decode_mime_header=>true)
    assert_equal "\xa4\xa2\xa4\xa4", s
  end

  def test_parse_other_header_decode_charset
    s = parse("subject", "=?euc-jp?q?=A4=A2=A4=A4?=", :decode_mime_header=>true, :output_charset=>"utf-8")
    assert_equal "あい", s
  end

  def test_parse_other_header_decode_charset_converter
    s = parse("subject", "=?euc-jp?q?=A4=A2=A4=A4?=", :decode_mime_header=>true, :output_charset=>"utf-8", :charset_converter=>proc{"abcdefg"})
    assert_equal "abcdefg", s
  end

  def test_split_by()
    assert_equal([["aa","bb"],["cc"],["dd"]], split_by(%w(aa bb , cc , dd), ","))
  end

  def test_mailbox_list()
    ml = mailbox_list("hoge <hoge@example.com>, fuga@example.net", {})
    assert_equal(2, ml.size)
    assert_equal("hoge", ml[0].phrase)
    assert_equal("hoge", ml[0].addr_spec.local_part)
    assert_equal("example.com", ml[0].addr_spec.domain)
    assert_equal("", ml[1].phrase)
    assert_equal("fuga", ml[1].addr_spec.local_part)
    assert_equal("example.net", ml[1].addr_spec.domain)
  end

  def test_mailbox_list2()
    ml = mailbox_list("hoge hoge (comment) <hoge.hoge@example.com>", {})
    assert_equal(1, ml.size)
    assert_equal("hoge hoge", ml[0].phrase)
    assert_equal("hoge.hoge", ml[0].addr_spec.local_part)
    assert_equal("example.com", ml[0].addr_spec.domain)
  end

  def test_mailbox_charset_converter
    ml = mailbox_list("hoge =?us-ascii?q?hoge?= <hoge.hoge@example.com>", {:decode_mime_header=>true, :output_charset=>"us-ascii", :charset_converter=>proc{"fuga"}})
    assert_equal(1, ml.size)
    assert_equal("hoge fuga", ml[0].phrase)
    assert_equal("hoge.hoge", ml[0].addr_spec.local_part)
    assert_equal("example.com", ml[0].addr_spec.domain)
  end

  def test_msg_id_list_old_in_reply_to()
    m = msg_id_list "hoge@hoge.hoge message <local-part@domain.name>"
    assert_equal 1, m.size
    assert_equal "local-part@domain.name", m[0].msg_id
  end

  def test_msg_id_list_multiple()
    m = msg_id_list "<aa@bb.cc> <dd@ee.ff>"
    assert_equal 2, m.size
    assert_equal "aa@bb.cc", m[0].msg_id
    assert_equal "dd@ee.ff", m[1].msg_id
  end

  def test_msg_id_list_nobracket()
    m = msg_id_list "aa@bb.cc"
    assert_equal 1, m.size
    assert_equal "aa@bb.cc", m[0].msg_id
  end

  def test_msg_id_invalid_with_space()
    m = msg_id_list " aa bb "
    assert_equal 2, m.size
    assert_equal "aa", m[0].msg_id
    assert_equal "bb", m[1].msg_id
  end

  def test_msg_id_empty()
    m = msg_id_list ""
    # Expected value goes first so a failure message reads correctly.
    assert_equal [], m
  end

end
312
+
313
# Unit tests for the Tokenizer helper reached through MailParser::Loose.
class TC_Loose_Tokenizer < Test::Unit::TestCase
  include MailParser::Loose

  def setup()
  end
  def teardown()
  end

  def test_token()
    assert_equal(["a",",","b",",","c"], Tokenizer.token("a,b,c"))
  end

  def test_token2()
    assert_equal(["a/b/c"], Tokenizer.token("a/b/c"))
  end

  def test_token_quoted_string()
    assert_equal(["\"a,b,c\"",",","d",",","e"], Tokenizer.token("\"a,b,c\",d,e"))
  end

  def test_token_quoted_string2()
    assert_equal(["\"ab\\\"c\"",",","d",",","e"], Tokenizer.token("\"ab\\\"c\",d,e"))
  end

  def test_token_comment()
    assert_equal(["aa",",","cc",",","ee"], Tokenizer.token("aa(bb),cc(dd),ee"))
  end

  def test_token_nested_comment()
    assert_equal(["aa",",","cc",",","ee"], Tokenizer.token("aa(bb(xx)),cc(dd),ee"))
  end

  def test_token_invalid_comment()
    assert_equal(["aa","(","bb","(","xx",",","cc",",","ee"], Tokenizer.token("aa(bb(xx,cc(dd),ee"))
  end

  def test_token_received()
    assert_equal(["aa","bb","cc"], Tokenizer.token_received("aa bb cc"))
  end

  def test_token_received_comment()
    assert_equal(["a","b","c"], Tokenizer.token_received("a(hoge)b(hoge)c"))
  end

  def test_token_received_quotedstring()
    assert_equal(["\"a b c\"", "<a@b.c>"], Tokenizer.token_received("\"a b c\" <a@b.c>"))
  end

  def test_token_received_semicolon()
    assert_equal(["a","b",";","d","e"], Tokenizer.token_received("a b;d e"))
  end

  # A vertical tab must be handled within the time limit; the 2 second
  # guard turns a hang in the tokenizer into a test error.
  def test_token_received_VT()
    # Timeout.timeout replaces the bare Kernel-level timeout(), which is
    # deprecated and no longer available on current Ruby versions.
    Timeout.timeout(2) do
      assert_equal(["a","b","c","d"], Tokenizer.token_received("a b\vc d"))
    end
  end

end