ruby-msg 1.3.1 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +108 -113
- data/Rakefile +42 -28
- data/bin/mapitool +195 -0
- data/lib/mapi.rb +109 -0
- data/lib/mapi/convert.rb +61 -0
- data/lib/mapi/convert/contact.rb +142 -0
- data/lib/mapi/convert/note-mime.rb +274 -0
- data/lib/mapi/convert/note-tmail.rb +287 -0
- data/lib/mapi/msg.rb +440 -0
- data/lib/mapi/property_set.rb +269 -0
- data/lib/mapi/pst.rb +1806 -0
- data/lib/mapi/rtf.rb +169 -0
- data/lib/mapi/types.rb +51 -0
- data/lib/rtf.rb +0 -9
- data/test/test_convert_contact.rb +60 -0
- data/test/test_convert_note.rb +66 -0
- data/test/test_mime.rb +4 -2
- data/test/test_msg.rb +29 -0
- data/test/test_property_set.rb +116 -0
- data/test/test_types.rb +17 -0
- metadata +78 -48
- data/bin/msgtool +0 -65
- data/lib/msg.rb +0 -522
- data/lib/msg/properties.rb +0 -532
- data/lib/msg/rtf.rb +0 -236
@@ -0,0 +1,287 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'tmail'
|
3
|
+
|
4
|
+
# these will be removed later
|
5
|
+
require 'time'
|
6
|
+
require 'mime'
|
7
|
+
|
8
|
+
# there is some Msg specific stuff in here.
|
9
|
+
|
10
|
+
# Reopens TMail's mail class to add a raw body writer used by the
# MAPI -> TMail conversion further down in this file.
class TMail::Mail
  # Writes +str+ directly to the mail's body port and evaluates to +str+.
  def quoted_body=(str)
    body_port.wopen do |port|
      port.write(str)
    end
    str
  end
end
|
16
|
+
|
17
|
+
module Mapi
|
18
|
+
class Message
|
19
|
+
# Returns the Mime object backing this message, building it on first use.
#
# The object is seeded from the transport_message_headers property (an
# empty string when the property is missing), then populate_headers fills
# in whatever can be derived from the MAPI properties. The result is
# memoized in @mime.
def mime
  unless @mime
    # The transport headers can be helpful when they exist at all. They may
    # describe an application/ms-tnef root (little more than headers), they
    # may be absent, or - for mail received from outside - they may be the
    # complete set, boundaries included. Sometimes they claim multipart
    # without any boundary, which used to raise.
    raw_headers = props.transport_message_headers.to_s
    # NOTE(review): the second argument presumably makes Mime more forgiving
    # about such malformed input - confirm against Mime.new.
    @mime = Mime.new(raw_headers, true)
    populate_headers
  end
  @mime
end
|
32
|
+
|
33
|
+
# Shortcut to the header collection of the (lazily built) mime object.
def headers
  mime.headers
end
|
36
|
+
|
37
|
+
# copy data from msg properties storage to standard mime. headers
|
38
|
+
# i've now seen it where the existing headers had heaps of stuff, and the msg#props had
|
39
|
+
# practically nothing. think it was because it was a tnef - msg conversion done by exchange.
|
40
|
+
# Copies data from the MAPI property storage into the standard mime headers.
#
# Fills in, in this order: From, To/Cc/Bcc, Subject, Date (only when no Date
# header exists yet) and a handful of simple one-to-one mappings
# (Message-ID, In-Reply-To, Importance, ...). Values are assigned as arrays
# holding a single string.
#
# Returns the internal header map array (the value of the final #each).
def populate_headers
  # --- From -----------------------------------------------------------
  # should this only be done when headers don't exist already? maybe not:
  # a mail that is sent, then modified and saved, could carry wrong headers.
  # on the other hand, for an internal sender the only place the smtp
  # address exists may be the transport headers, so From is only rebuilt
  # from the properties when the sender address type is SMTP.
  name, email = props.sender_name, props.sender_email_address
  if props.sender_addrtype == 'SMTP'
    headers['From'] = if name and email and name != email
      [%{"#{name}" <#{email}>}]
    else
      [email || name]
    end
  elsif !headers.has_key?('From')
    # some messages were never sent, so the sender properties aren't filled
    # out and something has to be made up.
    if email
      # only a non-smtp (X.400 style) address is available: fabricate one.
      # this is crude; the logic was picked so that it produces correct
      # addresses for the original author's organisation.
      user = name ? name.sub(/(.*), (.*)/, "\\2.\\1") : email[/\w+$/].downcase
      domain = (email[%r{^/O=([^/]+)}i, 1].downcase + '.com' rescue email)
      headers['From'] = [name ? %{"#{name}" <#{user}@#{domain}>} : "<#{user}@#{domain}>" ]
    elsif name
      # we only have a name
      headers['From'] = [%{"#{name}"}]
    end
    # with neither name nor email there is nothing to add.
  end
  # otherwise the From of the transport message headers is left in place.

  # --- To / Cc / Bcc --------------------------------------------------
  # all of this is assigned as utf8 strings; quoting / encoded words are
  # left to the mime layer.
  recips_by_type = recipients.group_by { |r| r.type }
  # the types are wanted in a specific order.
  [:to, :cc, :bcc].each do |type|
    # Enumerable#group_by returns a plain Hash, so a recipient type that
    # does not occur yields nil here; treat that as "no recipients" rather
    # than crashing on nil.empty? below.
    recips = recips_by_type[type] || []
    # if possible, sort recipients by the numerical part of the ole name,
    # otherwise leave the order alone.
    recips = (recips.sort_by { |r| r.obj.name[/\d{8}$/].hex } rescue recips)
    # uses , for separation, not ;. see issue #4
    headers[type.to_s.sub(/^(.)/) { $1.upcase }] = [recips.join(', ')] unless recips.empty?
  end
  headers['Subject'] = [props.subject] if props.subject

  # --- Date -----------------------------------------------------------
  # by default an existing value is not touched.
  if !headers.has_key?('Date')
    # take the first of these properties that has a value, in this
    # preference order.
    keys = %w[message_delivery_time client_submit_time last_modification_time creation_time]
    time = nil
    keys.each do |key|
      time = props.send(key)
      break if time
    end
    # anything that is not a Date (or subclass) is ignored.
    time = nil unless Date === time

    # no timezone information is available here; the value is converted
    # as-is. (.localtime could be added, but that feels wrong.)
    headers['Date'] = [Time.iso8601(time.to_s).rfc2822] if time
  end

  # --- simple property -> header mappings ------------------------------
  # each entry is [mapi property, header name, optional filter]. a filter
  # returning nil/false suppresses the header.
  # any of these could be causing duplicates due to case issues.
  mapi_header_map = [
    [:internet_message_id, 'Message-ID'],
    [:in_reply_to_id, 'In-Reply-To'],
    # don't set these values if they're equal to the defaults anyway
    [:importance,  'Importance',  proc { |val| val.to_s == '1' ? nil : val }],
    [:priority,    'Priority',    proc { |val| val.to_s == '1' ? nil : val }],
    [:sensitivity, 'Sensitivity', proc { |val| val.to_s == '0' ? nil : val }],
    # yeah?
    [:conversation_topic, 'Thread-Topic'],
    # not sure of the distinction here
    # :originator_delivery_report_requested ??
    [:read_receipt_requested, 'Disposition-Notification-To', proc { |val| from }]
  ]
  mapi_header_map.each do |mapi, header_name, *filter|
    val = props.send(mapi)
    # without a property value there is nothing to copy. in particular an
    # existing transport header must not be replaced by an empty string.
    next unless val
    next if filter[0] and !(val = filter[0].call(val))
    headers[header_name] = [val.to_s]
  end
end
|
145
|
+
|
146
|
+
# redundant?
|
147
|
+
# Lower-cased part of the message class after the "IPM." prefix, or nil
# when it cannot be determined (any StandardError raised while looking it
# up is swallowed).
# redundant?
def type
  suffix = props.message_class[/IPM\.(.*)/, 1]
  suffix.downcase
rescue StandardError
  nil
end
|
150
|
+
|
151
|
+
# shortcuts to some things from the headers
|
152
|
+
# Defines #from, #to, #cc, #bcc and #subject. Each returns the values of the
# corresponding header joined with a single space, or nil when the header
# is not present.
%w[From To Cc Bcc Subject].each do |header_name|
  define_method(header_name.downcase) do
    headers.has_key?(header_name) ? headers[header_name].join(' ') : nil
  end
end
|
155
|
+
|
156
|
+
# Builds the TMail object for the message body.
#
# When an rtf or html body exists the result is a multipart/alternative
# container holding the plain text part (if any) followed by the html part
# (if any). Otherwise a single text/plain mail is returned.
def body_to_tmail
  unless props.body_rtf or props.body_html
    # check no header case. content type? etc?
    Log.debug "taking that other path"
    # body can be nil, hence the to_s
    return TMail::Mail.parse("Content-Type: text/plain\r\n\r\n" + props.body.to_s)
  end

  # should have some options about serializing rtf, and possibly about
  # checking the rtf for rtf2html conversion.
  # should plain come first?
  container = TMail::Mail.new
  # its actually possible for plain body to be empty, but the others not.
  if props.body
    container.parts << TMail::Mail.parse("Content-Type: text/plain\r\n\r\n" + props.body)
  end
  # this may be automatically unwrapped from the rtf if the rtf includes the html
  if props.body_html
    container.parts << TMail::Mail.parse("Content-Type: text/html\r\n\r\n" + props.body_html)
  end
  # the rtf part is temporarily disabled; it only showed up as an attachment
  # anyway. it is thus currently possible to get no body at all if the only
  # body is rtf. that is not really acceptable FIXME
  container['Content-Type'] = 'multipart/alternative'
  container
end
|
183
|
+
|
184
|
+
# Converts the message to a TMail object: the body from body_to_tmail with
# the message headers mixed in.
#
# Intended for IPM.Note, the email type; a warning is logged for any other
# message class (YMMV).
#
# Raises NotImplementedError when the message has attachments. The
# non-TMail converter wraps body and attachments in a multipart/mixed (or
# multipart/related, for inline content) root; that has not been ported.
def to_tmail
  Log.warn "to_tmail used on a #{props.message_class}" unless props.message_class == 'IPM.Note'
  # we always have a body
  mail = body_to_tmail

  unless attachments.empty?
    raise NotImplementedError, 'messages with attachments are not supported by to_tmail yet'
  end

  # at this point, mail is either
  # - a single text/plain, consisting of the body ('taking that other path'. rare)
  # - a multipart/alternative, consisting of a few bodies (plain and html body. common)
  # we add this standard preamble if its multipart.
  mail.quoted_body = "This is a multi-part message in MIME format.\r\n" if mail.multipart?

  # now that we have a root, we can mix in all our headers
  headers.each do |key, vals|
    # don't overwrite the content-type, encoding style stuff
    next if mail[key]
    # some new temporary hacks
    next if key =~ /content-type/i and vals[0] =~ /base64/
    mail[key] = vals.first
  end

  mail
end
|
234
|
+
end
|
235
|
+
|
236
|
+
class Attachment
|
237
|
+
# Converts the attachment to a base64 encoded TMail part.
#
# The content type comes from the attach_mime_tag property, falling back to
# application/octet-stream. Content-Location and Content-ID are copied when
# the corresponding properties are set.
#
# Raises NotImplementedError for embedded messages and embedded ole
# objects; only plain stream attachments are handled.
def to_tmail
  # TODO: smarter mime typing.
  mimetype = props.attach_mime_tag || 'application/octet-stream'
  part = TMail::Mail.parse "Content-Type: #{mimetype}\r\n\r\n"
  part['Content-Disposition'] = %{attachment; filename="#{filename}"}
  part['Content-Transfer-Encoding'] = 'base64'
  part['Content-Location'] = props.attach_content_location if props.attach_content_location
  part['Content-ID'] = props.attach_content_id if props.attach_content_id

  if @embedded_msg
    # would become a message/rfc822 part holding the converted message
    raise NotImplementedError, 'embedded messages are not supported by to_tmail yet'
  elsif @embedded_ole
    # would become a serialized copy of the embedded ole storage
    raise NotImplementedError, 'embedded ole objects are not supported by to_tmail yet'
  end

  # data.to_s for now. data was nil for some reason.
  # have to use read here, which assumes that the data is a stream; if the
  # attachment data is a string this won't work. possible?
  data_str = data.read.to_s
  # pack('m') produces the same output as Base64.encode64, without needing
  # the base64 library to be loaded.
  part.body = [data_str].pack('m').gsub(/\n/, "\r\n")
  part
end
|
273
|
+
end
|
274
|
+
|
275
|
+
class Msg < Message
|
276
|
+
# Extends the generic header population with an extra source for the Date
# header: when no Date header exists after the generic pass, the latest
# timestamp found on the dirents of the ole storage is used (modify time,
# falling back to create time, per dirent). This is in a similar vein to
# msgconvert.pl.
def populate_headers
  super
  return if headers.has_key?('Date')

  stamps = @root.ole.dirents.map { |dirent| dirent.modify_time || dirent.create_time }
  latest = stamps.compact.sort.last
  headers['Date'] = [Time.iso8601(latest.to_s).rfc2822] if latest
end
|
285
|
+
end
|
286
|
+
end
|
287
|
+
|
data/lib/mapi/msg.rb
ADDED
@@ -0,0 +1,440 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'ole/storage'
|
3
|
+
require 'mapi'
|
4
|
+
require 'mapi/rtf'
|
5
|
+
|
6
|
+
module Mapi
|
7
|
+
#
|
8
|
+
# = Introduction
|
9
|
+
#
|
10
|
+
# Primary class interface to the vagaries of .msg files.
|
11
|
+
#
|
12
|
+
# The core of the work is done by the <tt>Msg::PropertyStore</tt> class.
|
13
|
+
#
|
14
|
+
class Msg < Message
|
15
|
+
#
|
16
|
+
# = Introduction
|
17
|
+
#
|
18
|
+
# A big component of +Msg+ files is the property store, which holds
|
19
|
+
# all the key/value pairs of properties. The message itself, and all
|
20
|
+
# its <tt>Attachment</tt>s and <tt>Recipient</tt>s have an instance of
|
21
|
+
# this class.
|
22
|
+
#
|
23
|
+
# = Storage model
|
24
|
+
#
|
25
|
+
# Property keys (tags?) can be either simple hex numbers, in the
|
26
|
+
# range 0x0000 - 0xffff, or they can be named properties. In fact,
|
27
|
+
# properties in the range 0x0000 to 0x7fff are supposed to be the non-
|
28
|
+
# named properties, and can be considered to be in the +PS_MAPI+
|
29
|
+
# namespace. (correct?)
|
30
|
+
#
|
31
|
+
# Named properties are serialized in the 0x8000 to 0xffff range,
|
32
|
+
# and are referenced as a guid and long/string pair.
|
33
|
+
#
|
34
|
+
# There are key ranges, which can be used to imply things generally
|
35
|
+
# about keys.
|
36
|
+
#
|
37
|
+
# Further, we can give symbolic names to most keys, coming from
|
38
|
+
# constants in various places. Eg:
|
39
|
+
#
|
40
|
+
# 0x0037 => subject
|
41
|
+
# {00062002-0000-0000-C000-000000000046}/0x8218 => response_status
|
42
|
+
# # displayed as categories in outlook
|
43
|
+
# {00020329-0000-0000-C000-000000000046}/"Keywords" => categories
|
44
|
+
#
|
45
|
+
# Further, there are completely different names, coming from other
|
46
|
+
# object models that get mapped to these things (CDO's model,
|
47
|
+
# Outlook's model etc). Eg "urn:schemas:httpmail:subject"
|
48
|
+
# I think these can be ignored though, as they aren't defined clearly
|
49
|
+
# in terms of mapi properties, and i'm really just trying to make
|
50
|
+
# a mapi property store. (It should also be relatively easy to
|
51
|
+
# support them later.)
|
52
|
+
#
|
53
|
+
# = Usage
|
54
|
+
#
|
55
|
+
# The api is driven by a desire to have the simple stuff "just work", ie
|
56
|
+
#
|
57
|
+
# properties.subject
|
58
|
+
# properties.display_name
|
59
|
+
#
|
60
|
+
# There also needs to be a way to look up properties more specifically:
|
61
|
+
#
|
62
|
+
# properties[0x0037] # => gets the subject
|
63
|
+
# properties[0x0037, PS_MAPI] # => still gets the subject
|
64
|
+
# properties['Keywords', PS_PUBLIC_STRINGS] # => gets outlook's categories array
|
65
|
+
#
|
66
|
+
# The abbreviated versions work by "resolving" the symbols to full keys:
|
67
|
+
#
|
68
|
+
# # the guid here is just PS_PUBLIC_STRINGS
|
69
|
+
# properties.resolve :keywords # => #<Key {00020329-0000-0000-c000-000000000046}/"Keywords">
|
70
|
+
# # the result here is actually also a key
|
71
|
+
# k = properties.resolve :subject # => 0x0037
|
72
|
+
# # it has a guid
|
73
|
+
# k.guid == Msg::Properties::PS_MAPI # => true
|
74
|
+
#
|
75
|
+
# = Parsing
|
76
|
+
#
|
77
|
+
# There are three objects that need to be parsed to load a +Msg+ property store:
|
78
|
+
#
|
79
|
+
# 1. The +nameid+ directory (<tt>PropertyStore.parse_nameid</tt>)
|
80
|
+
# 2. The many +substg+ objects, whose names should match <tt>PropertyStore::SUBSTG_RX</tt>
|
81
|
+
# (<tt>PropertyStore#parse_substg</tt>)
|
82
|
+
# 3. The +properties+ file (<tt>PropertyStore#parse_properties</tt>)
|
83
|
+
#
|
84
|
+
# Understanding of the formats is by no means perfect.
|
85
|
+
#
|
86
|
+
# = TODO
|
87
|
+
#
|
88
|
+
# * While the key objects are sufficient, the value objects are just plain
|
89
|
+
# ruby types. It currently isn't possible to write to the values, or to know
|
90
|
+
# which encoding the value had.
|
91
|
+
# * Update this doc.
|
92
|
+
# * Perhaps change from eager loading, to be load-on-demand.
|
93
|
+
#
|
94
|
+
class PropertyStore
|
95
|
+
include PropertySet::Constants
|
96
|
+
Key = PropertySet::Key
|
97
|
+
|
98
|
+
# note that binary and default both use obj.open. not the block form. this means we should
|
99
|
+
# #close it later, which we don't. as we're only reading though, it shouldn't matter right?
|
100
|
+
# not really good though FIXME
|
101
|
+
# change these to use mapi symbolic const names
|
102
|
+
# Decoders for the payload of a +substg+ object, keyed by the encoding code
# taken from the object's name. Each decoder receives the object and
# returns the value to store; :default is the fallback used for encoding
# codes that have no entry of their own.
ENCODINGS = {
  # seems to be used when its going to be a directory instead of a file. eg
  # nested ole. 3701 usually. in which case we shouldn't get here right?
  0x000d => proc { |dirent| dirent },
  # unicode
  0x001f => proc { |dirent| Ole::Types::FROM_UTF16.iconv(dirent.read) },
  # ascii
  # FIXME hack did a[0..-2] before, seems right sometimes, but for some
  # others it chopped the text. chomp
  0x001e => proc { |dirent| dirent.read.chomp(0.chr) },
  # binary?
  0x0102 => proc { |dirent| dirent.open },
  :default => proc { |dirent| dirent.open }
}

# Name of a +substg+ object: property code, encoding code, and an optional
# 8 digit hex suffix.
SUBSTG_RX = /^__substg1\.0_([0-9A-F]{4})([0-9A-F]{4})(?:-([0-9A-F]{8}))?$/
# Name of the +properties+ file.
PROPERTIES_RX = /^__properties_version1\.0$/
# Name of the +nameid+ directory.
NAMEID_RX = /^__nameid_version1\.0$/
# Any of the three names above.
VALID_RX = /#{SUBSTG_RX}|#{PROPERTIES_RX}|#{NAMEID_RX}/
|
116
|
+
|
117
|
+
attr_reader :nameid
|
118
|
+
|
119
|
+
# Creates an empty store: no properties yet, and no nameid map.
def initialize
  # holds every loaded property (not exactly a cache currently)
  @cache = {}
  @nameid = nil
end
|
124
|
+
|
125
|
+
#--
|
126
|
+
# The parsing methods
|
127
|
+
#++
|
128
|
+
|
129
|
+
# Convenience constructor: creates a new store, loads the properties found
# under +obj+ into it, and returns the store.
def self.load(obj)
  store = new
  store.load(obj)
  store
end
|
134
|
+
|
135
|
+
# Parse properties from the +Dirent+ obj
|
136
|
+
# Parse properties from the +Dirent+ obj.
#
# The nameid directory is handled first, because it provides the map used
# for named properties later on. Afterwards every file child that is either
# the properties file or a substg object is parsed.
def load(obj)
  nameid_obj = obj.children.find { |child| child.name =~ NAMEID_RX }
  if nameid_obj
    @nameid = PropertyStore.parse_nameid(nameid_obj)
    # hack to make it available to all msg files from the same ole storage object
    # FIXME - come up with a neater way
    class << obj.ole
      attr_accessor :msg_nameid
    end
    obj.ole.msg_nameid = @nameid
  elsif obj.ole
    # reuse a map stashed on the storage by an earlier load, if there is one
    @nameid = begin
      obj.ole.msg_nameid
    rescue StandardError
      nil
    end
  end

  # now parse the actual properties. dirs that match the substg pattern
  # should probably be decoded as properties too (0x000d is just the dir
  # encoding), but currently only files are considered.
  obj.children.each do |child|
    next unless child.file?
    if PROPERTIES_RX === child.name
      parse_properties(child)
    elsif (match = SUBSTG_RX.match(child.name))
      # captures are property code, encoding code and the optional suffix
      # (nil when absent), all in hex
      numbers = match.captures.map { |num| num.nil? ? nil : num.hex }
      parse_substg(*(numbers + [child]))
    end
  end
end
|
162
|
+
|
163
|
+
# Read nameid from the +Dirent+ obj, which is used for mapping of named properties keys to
|
164
|
+
# proxy keys in the 0x8000 - 0xffff range.
|
165
|
+
# Returns a hash of integer -> Key.
|
166
|
+
# Read nameid from the +Dirent+ obj, which is used for mapping of named
# property keys to proxy keys in the 0x8000 - 0xffff range.
#
# Three children of +obj+ are read: the guid stream
# (__substg1.0_00020102), the property entry stream (__substg1.0_00030102)
# and the string name stream (__substg1.0_00040102).
#
# Returns a hash of integer -> Key.
def self.parse_nameid obj
  remaining = obj.children.dup
  guids_obj, props_obj, names_obj =
    %w[__substg1.0_00020102 __substg1.0_00030102 __substg1.0_00040102].map do |name|
      remaining.delete(obj / name)
    end

  # parse guids
  # these are the guids for named properties (other than builtin ones).
  # the list is prefixed with PS_PUBLIC_STRINGS so that a guid index of 2
  # maps to it and 3 maps to the first guid of the stream.
  stream_guids = guids_obj.read.scan(/.{16}/m).map { |str| Ole::Types.load_guid str }
  guids = [PS_PUBLIC_STRINGS] + stream_guids

  # the string ids for named properties. they are not parsed up front, as
  # they're referred to by offset not index: each is packed as a long
  # giving the string length, then the string, padded to a 4 byte multiple.
  names_data = names_obj.read

  # parse actual props. each entry is 8 bytes: a long (numeric id, or offset
  # into the names stream), a short of flags and a short giving the index.
  props = props_obj.read.scan(/.{8}/m).map do |str|
    flags, offset = str[4..-1].unpack 'v2'
    # the property will be serialised as this pseudo property, mapping it to this named property
    pseudo_prop = 0x8000 + offset
    named = (flags & 1) == 1
    prop = if named
      # NB: `x = *array` yields an Array on ruby >= 1.9, hence #first
      str_off = str.unpack('V').first
      len = names_data[str_off, 4].unpack('V').first
      Ole::Types::FROM_UTF16.iconv names_data[str_off + 4, len]
    else
      a, b = str.unpack('v2')
      Log.debug "b not 0" if b != 0
      a
    end
    # a bit sus
    guid_off = flags >> 1
    key = if guid_off < 2
      # these indexes lie below the guid list (missing a few builtin PS_*).
      # indexing with guid_off - 2 would wrap around to the end of the
      # list and pick an unrelated guid, so fall back to the Key's own
      # default guid instead.
      # NOTE(review): assumes Key's single argument form (as used by
      # add_property) selects the builtin namespace - confirm.
      Log.debug "guid off < 2 (#{guid_off})"
      Key.new(prop)
    else
      Key.new(prop, guids[guid_off - 2])
    end
    [pseudo_prop, key]
  end

  #Log.warn "* ignoring #{remaining.length} objects in nameid" unless remaining.empty?
  # this leaves a bunch of other unknown chunks of data with completely unknown meaning.
  Hash[*props.flatten]
end
|
217
|
+
|
218
|
+
# Parse an +Dirent+, as per <tt>msgconvert.pl</tt>. This is how larger properties, such
|
219
|
+
# as strings, binary blobs, and other ole sub-directories (eg nested Msg) are stored.
|
220
|
+
# Parse a +Dirent+, as per <tt>msgconvert.pl</tt>. This is how larger
# properties, such as strings, binary blobs, and other ole sub-directories
# (eg nested Msg) are stored.
#
# +key+ and +encoding+ are the numbers taken from the object's name,
# +offset+ is the numeric suffix (nil when absent) and +obj+ the object
# itself. The 0x1000 bit of +encoding+ marks a multivalue property, for
# which +offset+ is the position of this piece within the value array.
def parse_substg(key, encoding, offset, obj)
  multivalue = (encoding & 0x1000) != 0
  if multivalue
    # there is typically one object with no offset first, whose data is a
    # series of numbers equal to the lengths of all the sub parts. it gives
    # an implied array size, but is not needed, so it is ignored.
    return unless offset
    # remove multivalue flag for individual pieces
    encoding &= ~0x1000
  else
    Log.warn "offset specified for non-multivalue encoding #{obj.name}" if offset
    offset = nil
  end

  decoder = ENCODINGS[encoding]
  unless decoder
    Log.warn "unknown encoding #{encoding}"
    decoder = ENCODINGS[:default]
  end
  add_property(key, decoder[obj], offset)
end
|
246
|
+
|
247
|
+
# For parsing the +properties+ file. Smaller properties are serialized in one chunk,
|
248
|
+
# such as longs, bools, times etc. The parsing has problems.
|
249
|
+
# For parsing the +properties+ file. Smaller properties are serialized in
# one chunk, such as longs, bools, times etc. The parsing has problems.
#
# The data is consumed as 16 byte records following an initial padding of
# (data length modulo 16) bytes. Only long (0003), boolean (000b) and
# systime (0040) records are stored; everything else is skipped.
def parse_properties(obj)
  data = obj.read
  # don't really understand this that well...
  pad = data.length % 16
  padding_ok = (pad == 0 || pad == 8) && data[0...pad] == "\000" * pad
  unless padding_ok
    Log.warn "padding was not as expected #{pad} (#{data.length}) -> #{data[0...pad].inspect}"
  end

  data[pad..-1].scan(/.{16}/m).each do |record|
    # the leading long, as 8 hex digits: property code, then encoding
    property, encoding = ('%08x' % record.unpack('V')).scan(/.{4}/)
    # doesn't make any sense to me. probably because its a serialization of
    # some internal outlook structure...
    next if property == '0000'
    key = property.hex

    case encoding
    when '0003' # long
      # don't know what all the other data is for
      add_property key, *record[8, 4].unpack('V')
    when '000b' # boolean
      # again, heaps more data than needed. and its not always 0 or 1.
      # they are in fact quite big numbers. this is wrong.
      add_property key, record[8, 4].unpack('V')[0] != 0
    when '0040' # systime
      # seems to work:
      add_property key, Ole::Types.load_time(record[8..-1])
    end
    # '0102', '001e', '001f', '101e', '101f' and '000d' are ignored on
    # purpose (not sure what they are for here); any other encoding is
    # silently skipped as well.
  end
end
|
283
|
+
|
284
|
+
# Stores +value+ under +key+.
#
# Integer keys in the named property range (>= 0x8000) are mapped through
# the nameid table when possible; every other key is wrapped in a Key.
# With +pos+ given, the value becomes element +pos+ of an array stored
# under the key (multivalue properties); otherwise it replaces whatever was
# stored before, with a warning.
def add_property key, value, pos=nil
  resolved = nil
  if Integer === key and key >= 0x8000
    if @nameid
      resolved = @nameid[key]
      # i think i hit these when i have a named property, in the PS_MAPI guid
      Log.warn "property in named range not in nameid #{key.inspect}" unless resolved
    else
      Log.warn "no nameid section yet named properties used"
    end
  end
  key = resolved || Key.new(key)

  unless pos
    # take the last.
    Log.warn "duplicate property #{key.inspect}" if @cache[key]
    return @cache[key] = value
  end

  @cache[key] ||= []
  # this is actually a trickier problem. the issue is more that they must
  # all be of the same type.
  Log.warn "duplicate property" unless Array === @cache[key]
  @cache[key][pos] = value
end
|
313
|
+
|
314
|
+
# delegate to cache
|
315
|
+
# Delegates every unknown method to the underlying property hash (@cache).
def method_missing(name, *args, &block)
  @cache.send name, *args, &block
end

# Keeps #respond_to? consistent with the delegation done by method_missing.
def respond_to_missing?(name, include_private = false)
  (@cache && @cache.respond_to?(name, include_private)) || super
end
|
318
|
+
end
|
319
|
+
|
320
|
+
# Names of the child directories that hold recipients and attachments.
# The real names have the form 1\.0_#([0-9A-Z]{8}), where $1 is the 0 based
# index number in hex. That index is not parsed at the moment; objects are
# returned in file order. It should probably be used later...
RECIP_RX = /^__recip_version1\.0_.*/
ATTACH_RX = /^__attach_version1\.0_.*/
# Every child name that is expected directly under a msg root.
VALID_RX = /#{PropertyStore::VALID_RX}|#{ATTACH_RX}|#{RECIP_RX}/
|
327
|
+
|
328
|
+
attr_reader :root
|
329
|
+
attr_accessor :close_parent
|
330
|
+
|
331
|
+
# Alternate constructor, to create an +Msg+ directly from +arg+ and +mode+, passed
|
332
|
+
# directly to Ole::Storage (ie either filename or seekable IO object).
|
333
|
+
# Alternate constructor, to create an +Msg+ directly from +arg+ and +mode+,
# which are passed straight to Ole::Storage.open (ie either filename or
# seekable IO object).
#
# With a block, the msg is yielded and closed afterwards (even if the block
# raises), and the block's value is returned. Without a block the open msg
# is returned.
def self.open(arg, mode=nil)
  msg = new(Ole::Storage.open(arg, mode).root)
  # we will close the ole when we are #closed
  msg.close_parent = true
  return msg unless block_given?

  begin
    yield msg
  ensure
    msg.close
  end
end
|
344
|
+
|
345
|
+
# Create an Msg from +root+, an <tt>Ole::Storage::Dirent</tt> object
|
346
|
+
# Create an Msg from +root+, an <tt>Ole::Storage::Dirent</tt> object.
# The properties are loaded from +root+ right away, and a warning is logged
# for children of +root+ with unrecognised names.
def initialize(root)
  @close_parent = false
  @root = root
  super(PropertySet.new(PropertyStore.load(@root)))
  Msg.warn_unknown(@root)
end
|
352
|
+
|
353
|
+
# Bit of validation: logs a warning with the number of children of +obj+
# whose names do not match VALID_RX. Extra objects are not important, though
# it would be interesting to know what they are. Doesn't check dir/file
# stuff.
def self.warn_unknown(obj)
  unknown = obj.children.select { |child| !(child.name =~ VALID_RX) }
  return if unknown.empty?

  Log.warn "skipped #{unknown.length} unknown msg object(s)"
end
|
359
|
+
|
360
|
+
# Closes the ole storage the root belongs to, but only when close_parent is
# set (as Msg.open does).
def close
  return unless @close_parent

  @root.ole.close
end
|
363
|
+
|
364
|
+
# The valid attachments of the message: one Attachment per child directory
# of the root whose name matches ATTACH_RX, minus those that report
# themselves as not valid. Memoized.
def attachments
  @attachments ||= begin
    dirs = @root.children.select { |child| child.dir? and child.name =~ ATTACH_RX }
    dirs.map { |dir| Attachment.new(dir) }.select { |attachment| attachment.valid? }
  end
end
|
370
|
+
|
371
|
+
# The recipients of the message: one Recipient per child directory of the
# root whose name matches RECIP_RX. Memoized.
def recipients
  @recipients ||= begin
    dirs = @root.children.select { |child| child.dir? and child.name =~ RECIP_RX }
    dirs.map { |dir| Recipient.new(dir) }
  end
end
|
376
|
+
|
377
|
+
# An attachment stored in an +attach+ directory of the .msg. Besides its
# properties it detects an embedded ole object (property 3701, encoding
# 000d, stored as a directory), and wraps it in a Msg when that object
# looks like an Outlook message.
class Attachment < Mapi::Attachment
  attr_reader :obj, :properties
  alias props :properties

  # +obj+ is the attachment's directory object.
  def initialize(obj)
    @obj = obj
    @embedded_ole = nil
    @embedded_msg = nil

    super(PropertySet.new(PropertyStore.load(@obj)))
    Msg.warn_unknown(@obj)

    @obj.children.each do |child|
      # temp hack. PropertyStore doesn't do directory properties atm - FIXME
      next unless child.dir?
      match = PropertyStore::SUBSTG_RX.match(child.name)
      next unless match and match[1] == '3701' and match[2].downcase == '000d'

      @embedded_ole = child
      class << @embedded_ole
        # The text following the first 32 bytes of the "\001CompObj"
        # stream, up to the first NUL; nil without such a stream.
        def compobj
          stream = self["\001CompObj"]
          return nil unless stream
          stream.read[/^.{32}([^\x00]+)/m, 1]
        end

        # The compobj text when there is one; otherwise a guess based on
        # the names of the children, or nil.
        def embedded_type
          described = compobj
          return described if described
          # try to guess more
          msg_like = children.select { |entry| entry.name =~ /__(substg|properties|recip|attach|nameid)/ }
          return 'Microsoft Office Outlook Message' if msg_like.length > 2
          nil
        end
      end

      if @embedded_ole.embedded_type == 'Microsoft Office Outlook Message'
        @embedded_msg = Msg.new(@embedded_ole)
      end
    end
  end

  # False for the particularly strange case of an attachment that has no
  # properties at all, something noticed when handling embedded ole object
  # attachments.
  def valid?
    !props.raw.keys.empty?
  end
end
|
422
|
+
|
423
|
+
#
|
424
|
+
# +Recipient+ serves as a container for the +recip+ directories in the .msg.
|
425
|
+
# It has things like office_location, business_telephone_number, but I don't
|
426
|
+
# think enough to make a vCard out of?
|
427
|
+
#
|
428
|
+
# Wraps one +recip+ directory of the .msg and the properties stored in it.
class Recipient < Mapi::Recipient
  attr_reader :obj, :properties
  alias props :properties

  # +obj+ is the recipient's directory object. Its properties are loaded
  # immediately, and unrecognised children are reported via
  # Msg.warn_unknown.
  def initialize(obj)
    @obj = obj
    super(PropertySet.new(PropertyStore.load(@obj)))
    Msg.warn_unknown(@obj)
  end
end
|
438
|
+
end
|
439
|
+
end
|
440
|
+
|