ruby-msg-nx 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/mapi/msg.rb ADDED
@@ -0,0 +1,516 @@
1
+ require 'ole/storage'
2
+ require 'mapi'
3
+ require 'mapi/rtf'
4
+ require 'mapi/helper'
5
+
6
+ module Mapi
7
+ #
8
+ # # Introduction
9
+ #
10
+ # Primary class interface to the vagaries of .msg files.
11
+ #
12
+ # The core of the work is done by the {Msg::PropertyStore} class.
13
+ #
14
+ class Msg < Message
15
+ #
16
+ # # Introduction
17
+ #
18
+ # A big component of {Msg} files is the property store, which holds
19
+ # all the key/value pairs of properties. The message itself, and all
20
+ # its {Attachment}s and {Recipient}s have an instance of
21
+ # this class.
22
+ #
23
+ # # Storage model
24
+ #
25
+ # Property keys (tags?) can be either simple hex numbers, in the
26
+ # range 0x0000 - 0xffff, or they can be named properties. In fact,
27
+ # properties in the range 0x0000 to 0x7fff are supposed to be the non-
28
+ # named properties, and can be considered to be in the `PS_MAPI`
29
+ # namespace. (correct?)
30
+ #
31
+ # Named properties are serialized in the 0x8000 to 0xffff range,
32
+ # and are referenced as a guid and long/string pair.
33
+ #
34
+ # There are key ranges, which can be used to imply things generally
35
+ # about keys.
36
+ #
37
+ # Further, we can give symbolic names to most keys, coming from
38
+ # constants in various places. Eg:
39
+ #
40
+ # ```
41
+ # 0x0037 => subject
42
+ # {00062002-0000-0000-C000-000000000046}/0x8218 => response_status
43
+ # # displayed as categories in outlook
44
+ # {00020329-0000-0000-C000-000000000046}/"Keywords" => categories
45
+ # ```
46
+ #
47
+ # Further, there are completely different names, coming from other
48
+ # object models that get mapped to these things (CDO's model,
49
+ # Outlook's model etc). Eg "urn:schemas:httpmail:subject"
50
+ # I think these can be ignored though, as they aren't defined clearly
51
+ # in terms of mapi properties, and i'm really just trying to make
52
+ # a mapi property store. (It should also be relatively easy to
53
+ # support them later.)
54
+ #
55
+ # # Usage
56
+ #
57
+ # The api is driven by a desire to have the simple stuff "just work", ie
58
+ #
59
+ # ```
60
+ # properties.subject
61
+ # properties.display_name
62
+ # ```
63
+ #
64
+ # There also needs to be a way to look up properties more specifically:
65
+ #
66
+ # ```
67
+ # properties[0x0037] # => gets the subject
68
+ # properties[0x0037, PS_MAPI] # => still gets the subject
69
+ # properties['Keywords', PS_PUBLIC_STRINGS] # => gets outlook's categories array
70
+ # ```
71
+ #
72
+ # The abbreviated versions work by "resolving" the symbols to full keys:
73
+ #
74
+ # ```
75
+ # # the guid here is just PS_PUBLIC_STRINGS
76
+ # properties.resolve :keywords # => #<Key {00020329-0000-0000-c000-000000000046}/"Keywords">
77
+ # # the result here is actually also a key
78
+ # k = properties.resolve :subject # => 0x0037
79
+ # # it has a guid
80
+ # k.guid == Msg::Properties::PS_MAPI # => true
81
+ # ```
82
+ #
83
+ # # Parsing
84
+ #
85
+ # There are three objects that need to be parsed to load a +Msg+ property store:
86
+ #
87
+ # 1. The `nameid` directory (`Properties.parse_nameid`)
88
+ # 2. The many `substg` objects, whose names should match `Properties::SUBSTG_RX`
89
+ # (`Properties#parse_substg`)
90
+ # 3. The `properties` file (`Properties#parse_properties`)
91
+ #
92
+ # Understanding of the formats is by no means perfect.
93
+ #
94
+ # # TODO
95
+ #
96
+ # * While the key objects are sufficient, the value objects are just plain
97
+ # ruby types. It currently isn't possible to write to the values, or to know
98
+ # which encoding the value had.
99
+ # * Update this doc.
100
+ # * Perhaps change from eager loading, to be load-on-demand.
101
+ #
102
+ # @private
103
+ class PropertyStore
104
+ include PropertySet::Constants
105
+ Key = PropertySet::Key
106
+
107
# Decoders for `substg` payloads, keyed by the numeric encoding taken from the
# entry name, with a `:default` fallback. Every decoder is called with the
# dirent and the helper and returns the decoded value.
#
# FIXME: the binary and default decoders use obj.open without a block, so the
# returned stream should be #closed later, which we don't do. As we're only
# reading it probably doesn't matter, but it is not really good.
# TODO: change the keys to use mapi symbolic const names.
ENCODINGS = {
  # seems to be used when its going to be a directory instead of a file. eg
  # nested ole. 3701 usually. in which case we shouldn't get here right?
  0x000d => proc { |dirent, _helper| dirent },
  # unicode
  0x001f => proc { |dirent, _helper| Ole::Types::FROM_UTF16.iconv(dirent.read) },
  # ascii
  # FIXME hack did a[0..-2] before, seems right sometimes, but for some others
  # it chopped the text. chomp
  0x001e => proc { |dirent, helper|
    helper.convert_ansi_str(dirent.read.chomp(0.chr))
  },
  # binary?
  0x0102 => proc { |dirent, _helper| dirent.open },
  :default => proc { |dirent, _helper| dirent.open }
}
120
+
121
+ SUBSTG_RX = /^__substg1\.0_([0-9A-F]{4})([0-9A-F]{4})(?:-([0-9A-F]{8}))?$/
122
+ PROPERTIES_RX = /^__properties_version1\.0$/
123
+ NAMEID_RX = /^__nameid_version1\.0$/
124
+ VALID_RX = /#{SUBSTG_RX}|#{PROPERTIES_RX}|#{NAMEID_RX}/
125
+
126
+ # @return [Hash]
127
+ attr_reader :nameid
128
+ # @return [Helper]
129
+ attr_reader :helper
130
+
131
# Sets up an empty store. Nothing is parsed here; see {#load}.
#
# @param helper [Helper]
def initialize(helper)
  @helper = helper
  # stays nil until a nameid mapping has been loaded
  @nameid = nil
  # not exactly a cache currently
  @cache = {}
end
138
+
139
# Builds a new store and immediately parses `obj` into it.
#
# @param obj [Ole::Storage::Dirent]
# @param helper [Helper]
# @return [PropertyStore]
def self.load(obj, helper)
  new(helper).tap { |store| store.load(obj) }
end
149
+
150
# Parse properties from the +Dirent+ obj.
#
# The nameid directory (if `obj` has one) is parsed first, because it provides
# the map needed for user defined (named) properties. The parsed map is also
# stored on the ole storage object, so that other stores loaded from the same
# storage (which have no nameid directory of their own) can pick it up.
#
# @param obj [Ole::Storage::Dirent]
def load obj
  ole = obj.ole
  if nameid_obj = obj.children.find { |child| child.name =~ NAMEID_RX }
    @nameid = PropertyStore.parse_nameid nameid_obj
    # hack to make it available to all msg files from the same ole storage object
    # FIXME - come up with a neater way
    unless ole.respond_to?(:msg_nameid=)
      ole.singleton_class.send(:attr_accessor, :msg_nameid)
    end
    ole.msg_nameid = @nameid
  elsif ole
    # the accessor only exists once some store has parsed a nameid directory
    # for this storage; check for it instead of rescuing every error
    @nameid = ole.respond_to?(:msg_nameid) ? ole.msg_nameid : nil
  end

  # now parse the actual properties. i think dirs that match the substg should be decoded
  # as properties to. 0x000d is just another encoding, the dir encoding. it should match
  # whether the object is file / dir. currently only example is embedded msgs anyway
  obj.children.each do |child|
    next unless child.file?
    case child.name
    when PROPERTIES_RX
      parse_properties child
    when SUBSTG_RX
      # captures are key, encoding and the optional multivalue offset, all in
      # hex; the offset capture is nil when the name has no "-XXXXXXXX" suffix
      key, encoding, offset = Regexp.last_match.captures.map { |hex| hex && hex.hex }
      parse_substg key, encoding, offset, child
    end
  end
end
179
+
180
# Read nameid from the +Dirent+ obj, which is used for mapping of named properties keys to
# proxy keys in the 0x8000 - 0xffff range.
# Returns a hash of integer -> Key.
#
# Three streams inside `obj` are read: the guid stream (`..._00020102`), the
# entry stream (`..._00030102`) and the string stream (`..._00040102`).
#
# @param obj [Ole::Storage::Dirent]
# @return [Hash{Integer => Key}]
def self.parse_nameid obj
  guids_obj, props_obj, names_obj =
    %w[__substg1.0_00020102 __substg1.0_00030102 __substg1.0_00040102].map { |name| obj / name }

  # parse guids
  # this is the guids for named properties (other than builtin ones)
  # i think PS_PUBLIC_STRINGS, and PS_MAPI are builtin.
  # Scan using an ascii pattern - it's binary data we're looking
  # at, so we don't want to look for unicode characters
  guids = [PS_PUBLIC_STRINGS] + guids_obj.read.scan(/.{16}/mn).map do |str|
    Ole::Types.load_guid str
  end

  # parse names.
  # the string ids for named properties
  # they are no longer parsed, as they're referred to by offset not
  # index. they are simply sequentially packed, as a long, giving
  # the string length, then padding to 4 byte multiple, and repeat.
  names_data = names_obj.read

  # parse actual props.
  # not sure about any of this stuff really.
  # should flip a few bits in the real msg, to get a better understanding of how this works.
  # Scan using an ascii pattern - it's binary data we're looking
  # at, so we don't want to look for unicode characters
  props = props_obj.read.scan(/.{8}/mn).map do |str|
    flags, offset = str[4..-1].unpack 'v2'
    # the property will be serialised as this pseudo property, mapping it to this named property
    pseudo_prop = 0x8000 + offset
    named = flags & 1 == 1
    prop = if named
      # string-named: the first long is an offset into the names stream,
      # where a long length prefix precedes the UTF-16 name
      str_off = str.unpack('V').first
      len = names_data[str_off, 4].unpack('V').first
      Ole::Types::FROM_UTF16.iconv names_data[str_off + 4, len]
    else
      a, b = str.unpack('v2')
      Log.debug "b not 0" if b != 0
      a
    end
    # a bit sus
    guid_off = flags >> 1
    guid = if guid_off < 2
      # Indexes below 2 have no slot in `guids` (index 2 is PS_PUBLIC_STRINGS,
      # 3 and up are the stream guids). Indexing with `guid_off - 2` would wrap
      # around to the end of the array, so use PS_MAPI instead.
      # NOTE(review): index 1 is understood to mean PS_MAPI in the .msg
      # format; index 0 is not expected - confirm against MS-OXMSG.
      Log.debug "guid off < 2 (#{guid_off})"
      PS_MAPI
    else
      guids[guid_off - 2]
    end
    [pseudo_prop, Key.new(prop, guid)]
  end

  # this leaves a bunch of other unknown chunks of data in the nameid
  # directory with completely unknown meaning; they are ignored.
  Hash[props]
end
240
+
241
# Parse a +Dirent+, as per <tt>msgconvert.pl</tt>. This is how larger properties, such
# as strings, binary blobs, and other ole sub-directories (eg nested Msg) are stored.
#
# Encodings with the 0x1000 bit set are multivalue: each piece arrives as its
# own entry carrying an `offset`, and is decoded with the flag stripped.
#
# @param key [Number]
# @param encoding [Number]
# @param offset [Number, nil] position within a multivalue property
# @param obj [Ole::Storage::Dirent]
def parse_substg(key, encoding, offset, obj)
  multivalue = (encoding & 0x1000) != 0
  if multivalue
    # there is typically one with no offset first, whose data is a series of numbers
    # equal to the lengths of all the sub parts. gives an implied array size i suppose.
    # the sizes are the same as all the ole object sizes anyway. ignore this one
    return unless offset
    # remove multivalue flag for individual pieces
    encoding &= ~0x1000
  else
    Log.warn "offset specified for non-multivalue encoding #{obj.name}" if offset
    # offset is for multivalue encodings only.
    offset = nil
  end

  decoder = ENCODINGS[encoding]
  if !decoder
    Log.warn "unknown encoding #{encoding}"
    decoder = ENCODINGS[:default]
  end
  add_property(key, decoder.call(obj, @helper), offset)
end
274
+
275
# For parsing the +properties+ file. Smaller properties are serialized in one chunk,
# such as longs, bools, times etc. The parsing has problems.
#
# The data is walked in 16 byte entries (after an optional 8 byte lead-in).
# The first long of each entry holds the property id in its high 16 bits and
# the encoding in its low 16 bits; the value is read from byte 8 onwards.
#
# @param obj [Ole::Storage::Dirent]
def parse_properties obj
  data = obj.read
  # don't really understand this that well...

  pad = data.length % 16
  unless (pad == 0 || pad == 8) && data[0...pad] == "\000" * pad
    Log.warn "padding was not as expected #{pad} (#{data.length}) -> #{data[0...pad].inspect}"
  end
  # Scan using an ascii pattern - it's binary data we're looking
  # at, so we don't want to look for unicode characters
  data[pad..-1].scan(/.{16}/mn).each do |entry|
    property, encoding = ('%08x' % entry.unpack('V')).scan(/.{4}/)
    # doesn't make any sense to me. probably because its a serialization of some internal
    # outlook structure...
    next if property == '0000'
    key = property.hex
    case encoding
    when '0102', '001e', '001f', '101e', '101f', '000d'
      # ignore on purpose. not sure what its for
      # multivalue versions ignored also
    when '0003' # long
      # don't know what all the other data is for
      add_property key, entry[8, 4].unpack('V').first
    when '000b' # boolean
      # Only the first value byte is read. The remaining bytes of the value
      # field are not always zero, so decoding 4 bytes (as was done before)
      # could turn a false flag into true.
      # NOTE(review): the boolean is assumed to occupy one byte - confirm
      # against MS-OXMSG / MS-OXCDATA.
      add_property key, entry[8, 1].unpack('C').first != 0
    when '0040' # systime
      # seems to work:
      add_property key, Ole::Types.load_time(entry[8..-1])
    else
      # other encodings in the properties section are silently ignored
    end
  end
end
316
+
317
# Stores `value` under `key`, turning the raw key into a {Key} first.
#
# Integer keys of 0x8000 and above are looked up in the nameid map; every
# other key (and any key the map does not know) is wrapped with `Key.new`.
# When `pos` is given, the value is stored as element `pos` of an array
# (a multivalue property); otherwise it replaces whatever was stored before.
#
# @param key [Integer]
# @param value [Object]
# @param pos [Integer, nil]
# @return [Object] the stored value
def add_property key, value, pos=nil
  # map keys in the named property range through nameid
  named_range = Integer === key && key >= 0x8000
  if named_range && !@nameid
    Log.warn "no nameid section yet named properties used"
    key = Key.new key
  elsif named_range && (real_key = @nameid[key])
    key = real_key
  else
    # i think i hit these when i have a named property, in the PS_MAPI
    # guid
    Log.warn "property in named range not in nameid #{key.inspect}" if named_range
    key = Key.new key
  end

  if pos
    unless Array === @cache[key]
      # A non-array value is already stored under this key. Indexing into it
      # would raise (Integer) or silently edit it (String), so warn and start
      # a fresh array instead.
      # ^ this is actually a trickier problem. the issue is more that they must all be of
      # the same type.
      Log.warn "duplicate property" if @cache.key?(key)
      @cache[key] = []
    end
    @cache[key][pos] = value
  else
    # take the last. key? is used so that a stored false/nil also counts
    Log.warn "duplicate property #{key.inspect}" if @cache.key?(key)
    @cache[key] = value
  end
end
349
+
350
# delegate to cache: any method this class does not define itself is
# forwarded, with its arguments and block, to the internal hash.
def method_missing name, *args, &block
  @cache.send name, *args, &block
end

# Keeps #respond_to? in step with the delegation done by #method_missing.
def respond_to_missing? name, include_private=false
  @cache.respond_to?(name, include_private) || super
end
354
+ end
355
+
356
+ # these 2 will actually be of the form
357
+ # `1\.0_#([0-9A-Z]{8})`, where `$1` is the 0 based index number in hex
358
+ # should i parse that and use it as an index, or just return in
359
+ # file order? probably should use it later...
360
+
361
+ # @private
362
+ ATTACH_RX = /^__attach_version1\.0_.*/
363
+ # @private
364
+ RECIP_RX = /^__recip_version1\.0_.*/
365
+ # @private
366
+ VALID_RX = /#{PropertyStore::VALID_RX}|#{ATTACH_RX}|#{RECIP_RX}/
367
+
368
+ # @return [Ole::Storage::Dirent]
369
+ # @private
370
+ attr_reader :root
371
+ # @return [Helper]
372
+ # @private
373
+ attr_reader :helper
374
+ # @return [Boolean]
375
+ attr_accessor :close_parent
376
+
377
# Alternate constructor, to create an {Msg} directly from `arg` and `mode`, passed
# directly to {Ole::Storage} (ie either filename or seekable IO object).
#
# With a block, the message is yielded and closed afterwards, and the block's
# result is returned. Without a block the open message is returned and the
# caller is expected to #close it.
#
# @param arg [Object]
# @param mode [Object]
# @param helper [Helper] a new `Helper` is created when omitted
# @return [Msg, Object] the message, or the block's result when a block is given
def self.open arg, mode=nil, helper=nil
  ole = Ole::Storage.open(arg, mode)
  msg = nil
  begin
    msg = new ole.root, helper || Helper.new
  ensure
    # don't leak the storage if the message could not be constructed
    ole.close unless msg
  end
  # we will close the ole when we are #closed
  msg.close_parent = true
  return msg unless block_given?

  begin
    yield msg
  ensure
    msg.close
  end
end
395
+
396
# Create an Msg from `root`, an {Ole::Storage::Dirent} object.
#
# The properties are loaded from `root` straight away, and children of
# `root` with unrecognised names are reported via {Msg.warn_unknown}.
#
# @param root [Ole::Storage::Dirent]
# @param helper [Helper]
def initialize(root, helper)
  @root, @helper = root, helper
  @close_parent = false
  store = PropertyStore.load(@root, helper)
  super(PropertySet.new(store))
  Msg.warn_unknown(@root)
end
407
+
408
# Logs a warning with the number of children of `obj` whose names do not
# match `VALID_RX`.
#
# bit of validation. not important if there is extra stuff, though would be
# interested to know what it is. doesn't check dir/file stuff.
#
# @param obj [Ole::Storage::Dirent]
# @private
def self.warn_unknown(obj)
  skipped = obj.children.count { |child| child.name !~ VALID_RX }
  return if skipped.zero?

  Log.warn "skipped #{skipped} unknown msg object(s)"
end
416
+
417
# Closes the ole storage behind the root dirent, but only when
# `close_parent` is set (as {Msg.open} does).
def close
  return unless @close_parent

  @root.ole.close
end
420
+
421
# The valid attachments, built (once, then memoized) from the directories
# under the root whose names match `ATTACH_RX`.
#
# @return [Array<Attachment>]
def attachments
  @attachments ||= begin
    dirs = @root.children.select { |child| child.dir? && child.name =~ ATTACH_RX }
    dirs.map { |dir| Attachment.new(dir, helper) }.select(&:valid?)
  end
end
428
+
429
# The recipients, built (once, then memoized) from the directories under the
# root whose names match `RECIP_RX`.
#
# @return [Array<Recipient>]
def recipients
  @recipients ||= begin
    dirs = @root.children.select { |child| child.dir? && child.name =~ RECIP_RX }
    dirs.map { |dir| Recipient.new(dir, helper) }
  end
end
435
+
436
#
# `Attachment` wraps one of the `attach` directories in the `.msg`.
#
# On construction it also looks for an embedded ole object (a `3701`/`000d`
# substg directory) and, when that looks like an Outlook message, wraps it
# in a nested {Msg}.
#
class Attachment < Mapi::Attachment
  # @return [Ole::Storage::Dirent]
  # @private
  attr_reader :obj

  # @return [PropertySet]
  attr_reader :properties

  alias props :properties

  # @param obj [Ole::Storage::Dirent]
  # @param helper [Helper]
  def initialize(obj, helper)
    @obj = obj
    @embedded_ole = nil
    @embedded_msg = nil

    store = PropertyStore.load(@obj, helper)
    super(PropertySet.new(store))
    Msg.warn_unknown(@obj)

    @obj.children.each do |child|
      # temp hack. PropertyStore doesn't do directory properties atm - FIXME
      next unless child.dir?

      parts = PropertyStore::SUBSTG_RX.match(child.name)
      next unless parts && parts[1] == '3701' && parts[2].downcase == '000d'

      @embedded_ole = child
      class << @embedded_ole
        # The text found after the first 32 bytes of the "\001CompObj"
        # stream (up to the first NUL), or nil when there is no such stream.
        def compobj
          stream = self["\001CompObj"]
          return nil unless stream

          stream.read[/^.{32}([^\x00]+)/m, 1]
        end

        # The compobj text when present; otherwise a guess based on how
        # many children carry msg-style names; otherwise nil.
        def embedded_type
          declared = compobj
          return declared if declared

          # try to guess more
          msg_like = children.count { |entry| entry.name =~ /__(substg|properties|recip|attach|nameid)/ }
          msg_like > 2 ? 'Microsoft Office Outlook Message' : nil
        end
      end

      if @embedded_ole.embedded_type == 'Microsoft Office Outlook Message'
        @embedded_msg = Msg.new(@embedded_ole, helper)
      end
    end
  end

  # @return [Boolean]
  def valid?
    # something i started to notice when handling embedded ole object attachments is
    # the particularly strange case where there are empty attachments
    !props.raw.keys.empty?
  end
end
490
+
491
#
# `Recipient` serves as a container for the `recip` directories in the `.msg`.
#
# It has things like office_location, business_telephone_number, but I don't
# think enough to make a vCard out of?
#
class Recipient < Mapi::Recipient
  # @return [Ole::Storage::Dirent]
  # @private
  attr_reader :obj

  attr_reader :properties

  alias props :properties

  # Loads the properties stored in `obj` and reports children of `obj`
  # with unrecognised names.
  #
  # @param obj [Ole::Storage::Dirent]
  # @param helper [Helper]
  def initialize(obj, helper)
    @obj = obj
    store = PropertyStore.load(@obj, helper)
    super(PropertySet.new(store))
    Msg.warn_unknown(@obj)
  end
end
514
+ end
515
+ end
516
+