libis-mapi 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/mapi/msg.rb ADDED
@@ -0,0 +1,516 @@
1
+ require 'ole/storage'
2
+ require 'mapi'
3
+ require 'mapi/rtf'
4
+ require 'mapi/helper'
5
+
6
+ module Mapi
7
+ #
8
+ # # Introduction
9
+ #
10
+ # Primary class interface to the vagaries of .msg files.
11
+ #
12
+ # The core of the work is done by the {Msg::PropertyStore} class.
13
+ #
14
+ class Msg < Message
15
+ #
16
+ # # Introduction
17
+ #
18
+ # A big component of {Msg} files is the property store, which holds
19
+ # all the key/value pairs of properties. The message itself, and all
20
+ # its {Attachment}s and {Recipient}s have an instance of
21
+ # this class.
22
+ #
23
+ # # Storage model
24
+ #
25
+ # Property keys (tags?) can be either simple hex numbers, in the
26
+ # range 0x0000 - 0xffff, or they can be named properties. In fact,
27
+ # properties in the range 0x0000 to 0x7fff are supposed to be the non-
28
+ # named properties, and can be considered to be in the `PS_MAPI`
29
+ # namespace. (correct?)
30
+ #
31
+ # Named properties are serialized in the 0x8000 to 0xffff range,
32
+ # and are referenced as a guid and long/string pair.
33
+ #
34
+ # There are key ranges, which can be used to imply things generally
35
+ # about keys.
36
+ #
37
+ # Further, we can give symbolic names to most keys, coming from
38
+ # constants in various places. Eg:
39
+ #
40
+ # ```
41
+ # 0x0037 => subject
42
+ # {00062002-0000-0000-C000-000000000046}/0x8218 => response_status
43
+ # # displayed as categories in outlook
44
+ # {00020329-0000-0000-C000-000000000046}/"Keywords" => categories
45
+ # ```
46
+ #
47
+ # Further, there are completely different names, coming from other
48
+ # object models that get mapped to these things (CDO's model,
49
+ # Outlook's model etc). Eg "urn:schemas:httpmail:subject"
50
+ # I think these can be ignored though, as they aren't defined clearly
51
+ # in terms of mapi properties, and i'm really just trying to make
52
+ # a mapi property store. (It should also be relatively easy to
53
+ # support them later.)
54
+ #
55
+ # # Usage
56
+ #
57
+ # The api is driven by a desire to have the simple stuff "just work", ie
58
+ #
59
+ # ```
60
+ # properties.subject
61
+ # properties.display_name
62
+ # ```
63
+ #
64
+ # There also needs to be a way to look up properties more specifically:
65
+ #
66
+ # ```
67
+ # properties[0x0037] # => gets the subject
68
+ # properties[0x0037, PS_MAPI] # => still gets the subject
69
+ # properties['Keywords', PS_PUBLIC_STRINGS] # => gets outlook's categories array
70
+ # ```
71
+ #
72
+ # The abbreviated versions work by "resolving" the symbols to full keys:
73
+ #
74
+ # ```
75
+ # # the guid here is just PS_PUBLIC_STRINGS
76
+ # properties.resolve :keywords # => #<Key {00020329-0000-0000-c000-000000000046}/"Keywords">
77
+ # # the result here is actually also a key
78
+ # k = properties.resolve :subject # => 0x0037
79
+ # # it has a guid
80
+ # k.guid == Msg::Properties::PS_MAPI # => true
81
+ # ```
82
+ #
83
+ # # Parsing
84
+ #
85
+ # There are three objects that need to be parsed to load a +Msg+ property store:
86
+ #
87
+ # 1. The `nameid` directory (`Properties.parse_nameid`)
88
+ # 2. The many `substg` objects, whose names should match `Properties::SUBSTG_RX`
89
+ # (`Properties#parse_substg`)
90
+ # 3. The `properties` file (`Properties#parse_properties`)
91
+ #
92
+ # Understanding of the formats is by no means perfect.
93
+ #
94
+ # # TODO
95
+ #
96
+ # * While the key objects are sufficient, the value objects are just plain
97
+ # ruby types. It currently isn't possible to write to the values, or to know
98
+ # which encoding the value had.
99
+ # * Update this doc.
100
+ # * Perhaps change from eager loading, to be load-on-demand.
101
+ #
102
+ # @private
103
+ class PropertyStore
104
+ include PropertySet::Constants
105
+ Key = PropertySet::Key
106
+
107
# Decoders for +substg+ streams, keyed by the numeric encoding taken from the
# stream name. Each proc is called with the stream's dirent and the helper and
# returns the decoded value. +:default+ is the fallback for unknown encodings.
#
# FIXME: the binary and default decoders use obj.open without a block, so the
# returned io is never explicitly closed. As we're only reading it shouldn't
# matter much, but it isn't really good.
# TODO: change the keys to use mapi symbolic const names.
ENCODINGS = {
  # seems to be used when it's going to be a directory instead of a file, eg
  # nested ole (3701 usually) - in which case we shouldn't get here, right?
  0x000d => proc { |obj, _helper| obj },
  # unicode
  0x001f => proc { |obj, _helper| Ole::Types::FROM_UTF16.iconv obj.read },
  # ascii
  # FIXME hack: did a[0..-2] before, which seemed right sometimes but chopped
  # the text for some others, so chomp a single trailing NUL instead.
  0x001e => proc { |obj, helper| helper.convert_ansi_str(obj.read.chomp(0.chr)) },
  # binary?
  0x0102 => proc { |obj, _helper| obj.open },
  :default => proc { |obj, _helper| obj.open }
}
120
+
121
+ SUBSTG_RX = /^__substg1\.0_([0-9A-F]{4})([0-9A-F]{4})(?:-([0-9A-F]{8}))?$/
122
+ PROPERTIES_RX = /^__properties_version1\.0$/
123
+ NAMEID_RX = /^__nameid_version1\.0$/
124
+ VALID_RX = /#{SUBSTG_RX}|#{PROPERTIES_RX}|#{NAMEID_RX}/
125
+
126
+ # @return [Hash]
127
+ attr_reader :nameid
128
+ # @return [Helper]
129
+ attr_reader :helper
130
+
131
# Creates an empty store; call {#load} to fill it from a dirent.
#
# @param helper [Helper]
def initialize helper
  @helper = helper
  # filled in by #load when a nameid mapping is available
  @nameid = nil
  # not exactly a cache currently - it is the actual key => value storage
  @cache = {}
end
138
+
139
# Builds a new store and parses +obj+ into it.
#
# @param obj [Ole::Storage::Dirent]
# @param helper [Helper]
# @return [PropertyStore] the populated store
def self.load obj, helper
  new(helper).tap { |store| store.load obj }
end
149
+
150
# Parse properties from the +Dirent+ obj.
#
# The nameid mapping is resolved first, then every file child whose name
# matches PROPERTIES_RX or SUBSTG_RX is handed to the matching parser.
#
# @param obj [Ole::Storage::Dirent]
def load obj
  # we need to do the nameid first, as it provides the map for later user defined properties
  nameid_obj = obj.children.find { |child| child.name =~ NAMEID_RX }
  if nameid_obj
    @nameid = PropertyStore.parse_nameid nameid_obj
    # hack to make it available to all msg files from the same ole storage object
    # FIXME - come up with a neater way
    ole = obj.ole
    ole.singleton_class.send(:attr_accessor, :msg_nameid) unless ole.respond_to?(:msg_nameid=)
    ole.msg_nameid = @nameid
  elsif obj.ole
    # the accessor only exists once some store on this ole object has parsed a
    # nameid directory; check for it rather than rescuing every error
    @nameid = obj.ole.respond_to?(:msg_nameid) ? obj.ole.msg_nameid : nil
  end
  # now parse the actual properties. i think dirs that match the substg should be decoded
  # as properties too. 0x000d is just another encoding, the dir encoding. it should match
  # whether the object is file / dir. currently only example is embedded msgs anyway
  obj.children.each do |child|
    next unless child.file?
    case child.name
    when PROPERTIES_RX
      parse_properties child
    when SUBSTG_RX
      # captures are key, encoding and the optional multivalue offset, all in
      # hex; the offset capture is nil when the name has no "-XXXXXXXX" suffix
      key, encoding, offset = Regexp.last_match.captures.map { |hex| hex && hex.hex }
      parse_substg key, encoding, offset, child
    end
  end
end
179
+
180
# Read nameid from the +Dirent+ obj, which is used for mapping of named properties keys to
# proxy keys in the 0x8000 - 0xffff range.
#
# Three child streams are read: the guids (+__substg1.0_00020102+), the
# property entries (+__substg1.0_00030102+) and the string names
# (+__substg1.0_00040102+). A missing stream is treated as empty.
#
# @param obj [Ole::Storage::Dirent]
# @return [Hash{Integer => Key}] pseudo property id => real key
def self.parse_nameid obj
  guids_obj, props_obj, names_obj =
    %w[__substg1.0_00020102 __substg1.0_00030102 __substg1.0_00040102].map { |name| obj/name }
  contents = proc { |stream| stream ? stream.read : '' }

  # parse guids
  # these are the guids for named properties (other than builtin ones).
  # Scan using an ascii pattern - it's binary data we're looking
  # at, so we don't want to look for unicode characters
  guids = [PS_PUBLIC_STRINGS] + contents[guids_obj].scan(/.{16}/mn).map { |str| Ole::Types.load_guid str }

  # the string ids for named properties are not parsed up front, as they're
  # referred to by offset not index. they are simply sequentially packed, as
  # a long giving the string length, then the string, padded to a 4 byte multiple.
  names_data = contents[names_obj]

  # parse actual props: 8 byte entries of <name id or string offset (4)> <flags (2)> <index (2)>
  contents[props_obj].scan(/.{8}/mn).each_with_object({}) do |str, nameid|
    flags, offset = str[4..-1].unpack 'v2'
    # the property will be serialised as this pseudo property, mapping it to this named property
    pseudo_prop = 0x8000 + offset
    # the low bit of the flags selects string names over numeric ids
    if flags & 1 == 1
      str_off = str.unpack('V').first
      len_bytes = names_data[str_off, 4]
      unless len_bytes && len_bytes.length == 4
        Log.warn "named property string offset #{str_off} is outside the names stream"
        next
      end
      len = len_bytes.unpack('V').first
      prop = Ole::Types::FROM_UTF16.iconv names_data[str_off + 4, len]
    else
      prop, high = str.unpack('v2')
      Log.debug "b not 0" if high != 0
    end
    # the remaining flag bits hold the guid index: 2 is PS_PUBLIC_STRINGS and 3
    # onwards index the guid stream, which is how +guids+ is laid out from 2
    guid_off = flags >> 1
    nameid[pseudo_prop] =
      if guid_off < 2
        # Index 1 is PS_MAPI per MS-OXMSG; 0 is not defined there. Indexing
        # +guids+ with guid_off - 2 would wrap around to the end of the array
        # and pick an unrelated guid, so use Key's default guid instead
        # (presumably PS_MAPI - confirm against PropertySet::Key).
        Log.debug "guid off < 2 (#{guid_off})"
        Key.new(prop)
      else
        Key.new(prop, guids[guid_off - 2])
      end
  end
end
240
+
241
# Parse a +Dirent+, as per <tt>msgconvert.pl</tt>. This is how larger properties, such
# as strings, binary blobs, and other ole sub-directories (eg nested Msg) are stored.
#
# @param key [Number] property id taken from the stream name
# @param encoding [Number] encoding taken from the stream name; bit 0x1000 marks a multivalue
# @param offset [Number, nil] position within a multivalue property, nil otherwise
# @param obj [Ole::Storage::Dirent]
def parse_substg key, encoding, offset, obj
  multivalue = (encoding & 0x1000) != 0
  if multivalue
    # there is typically one stream with no offset first, whose data is a series of
    # numbers equal to the lengths of all the sub parts. it gives an implied array
    # size, but those sizes match the ole object sizes anyway, so ignore this one.
    return unless offset
    # remove multivalue flag for individual pieces
    encoding &= ~0x1000
  else
    Log.warn "offset specified for non-multivalue encoding #{obj.name}" if offset
    # offset is only meaningful for multivalue encodings
    offset = nil
  end

  encoder = ENCODINGS[encoding]
  if encoder.nil?
    Log.warn "unknown encoding #{encoding}"
    encoder = ENCODINGS[:default]
  end
  add_property key, encoder[obj, @helper], offset
end
274
+
275
# For parsing the +properties+ file. Smaller properties are serialized in one chunk,
# such as longs, bools, times etc. The parsing has problems.
#
# The data is read as 16 byte entries. The first 4 bytes are a little-endian
# tag whose high word is the property id and whose low word is the encoding;
# the value is taken from bytes 8 onwards. Only long (0x0003), boolean (0x000b)
# and systime (0x0040) entries are stored; everything else is skipped.
#
# @param obj [Ole::Storage::Dirent]
def parse_properties obj
  data = obj.read
  # don't really understand this that well...
  # whatever doesn't fit a whole number of entries is treated as leading padding
  pad = data.length % 16
  unless (pad == 0 || pad == 8) && data[0...pad] == "\000" * pad
    Log.warn "padding was not as expected #{pad} (#{data.length}) -> #{data[0...pad].inspect}"
  end
  # Scan using an ascii pattern - it's binary data we're looking
  # at, so we don't want to look for unicode characters
  data[pad..-1].scan(/.{16}/mn).each do |entry|
    tag = entry.unpack('V').first
    key = tag >> 16
    encoding = tag & 0xffff
    # entries with a zero property id don't make any sense to me. probably because
    # it's a serialization of some internal outlook structure...
    next if key == 0
    case encoding
    when 0x0003 # long
      # don't know what all the other data is for
      add_property key, entry[8, 4].unpack('V').first
    when 0x000b # boolean
      # again, heaps more data than needed. and it's not always 0 or 1.
      # they are in fact quite big numbers. this is wrong.
      add_property key, entry[8, 4].unpack('V').first != 0
    when 0x0040 # systime
      # seems to work:
      add_property key, Ole::Types.load_time(entry[8..-1])
    end
    # 0x0102, 0x001e, 0x001f, 0x101e, 0x101f and 0x000d are ignored on purpose
    # (not sure what the entries are for); any other encoding is skipped too.
  end
end
316
+
317
# Stores +value+ under +key+, translating ids in the named property range
# (0x8000 and up) through the nameid mapping when one is available.
#
# With +pos+ the value is one piece of a multivalue property and is stored at
# that index of an array. Without it the value replaces whatever was stored.
#
# @param key [Integer]
# @param value [Object]
# @param pos [Integer, nil]
def add_property key, value, pos=nil
  # map keys in the named property range through nameid
  key =
    if Integer === key && key >= 0x8000
      if !@nameid
        Log.warn "no nameid section yet named properties used"
        Key.new key
      elsif (real_key = @nameid[key])
        real_key
      else
        # i think i hit these when i have a named property, in the PS_MAPI
        # guid
        Log.warn "property in named range not in nameid #{key.inspect}"
        Key.new key
      end
    else
      Key.new key
    end
  if pos
    unless Array === @cache[key]
      # A non-array value here came from a single-valued property with the same
      # key. Writing value[pos] into it would corrupt a string or raise on
      # other types, so take the last, as the single-valued branch does.
      # (the trickier problem is really that all pieces must be of the same type.)
      Log.warn "duplicate property #{key.inspect}" if @cache.key?(key)
      @cache[key] = []
    end
    @cache[key][pos] = value
  else
    # take the last. key? rather than truthiness, so a stored false still counts
    Log.warn "duplicate property #{key.inspect}" if @cache.key?(key)
    @cache[key] = value
  end
end
349
+
350
# Delegate everything else to the hash that holds the parsed properties, so
# the store can be used like a Hash of key => value.
def method_missing name, *args, &block
  @cache.send name, *args, &block
end

# Keeps respond_to? in line with the delegation done by method_missing.
def respond_to_missing? name, include_private = false
  @cache.respond_to?(name, include_private) || super
end
354
+ end
355
+
356
+ # these 2 will actually be of the form
357
+ # `1\.0_#([0-9A-Z]{8})`, where `$1` is the 0 based index number in hex
358
+ # should i parse that and use it as an index, or just return in
359
+ # file order? probably should use it later...
360
+
361
+ # @private
362
+ ATTACH_RX = /^__attach_version1\.0_.*/
363
+ # @private
364
+ RECIP_RX = /^__recip_version1\.0_.*/
365
+ # @private
366
+ VALID_RX = /#{PropertyStore::VALID_RX}|#{ATTACH_RX}|#{RECIP_RX}/
367
+
368
+ # @return [Ole::Storage::Dirent]
369
+ # @private
370
+ attr_reader :root
371
+ # @return [Helper]
372
+ # @private
373
+ attr_reader :helper
374
+ # @return [Boolean]
375
+ attr_accessor :close_parent
376
+
377
# Alternate constructor, to create an {Msg} directly from `arg` and `mode`, passed
# directly to {Ole::Storage} (ie either filename or seekable IO object).
#
# The created message owns the ole storage, which is closed when the message
# is closed. With a block, the message is yielded and closed when the block
# returns or raises. If the message cannot be constructed, the storage is
# closed before the error is re-raised.
#
# @param arg [Object]
# @param mode [Object]
# @param helper [Helper] defaults to a new {Helper}
# @yieldparam msg [Msg]
# @return [Msg, Object] the message, or the block's result when a block is given
def self.open arg, mode=nil, helper=nil
  ole = Ole::Storage.open(arg, mode)
  begin
    msg = new ole.root, helper || Helper.new
  rescue
    # nothing else holds the storage yet, so it would never be closed
    ole.close
    raise
  end
  # we will close the ole when we are #closed
  msg.close_parent = true
  return msg unless block_given?
  begin
    yield msg
  ensure
    msg.close
  end
end
395
+
396
# Create an Msg from `root`, an {Ole::Storage::Dirent} object. The properties
# are parsed immediately, and unrecognised children of `root` are reported
# through {Msg.warn_unknown}.
#
# @param root [Ole::Storage::Dirent]
# @param helper [Helper]
def initialize root, helper
  @root, @helper, @close_parent = root, helper, false
  store = PropertyStore.load(@root, helper)
  super PropertySet.new(store)
  Msg.warn_unknown @root
end
407
+
408
# Logs a warning with the number of children of +obj+ whose names don't match
# VALID_RX. A bit of validation: not important if there is extra stuff, though
# it would be interesting to know what it is. Doesn't check dir/file stuff.
#
# @param obj [Ole::Storage::Dirent]
# @private
def self.warn_unknown obj
  unknown_count = obj.children.count { |child| child.name !~ VALID_RX }
  Log.warn "skipped #{unknown_count} unknown msg object(s)" if unknown_count > 0
end
416
+
417
# Closes the ole storage behind the root dirent, but only when
# {#close_parent} is set (as {Msg.open} does).
def close
  return unless @close_parent
  @root.ole.close
end
420
+
421
# The valid attachments, built on first use from the directory children of the
# root whose names match ATTACH_RX.
#
# @return [Array<Attachment>]
def attachments
  @attachments ||= @root.children.each_with_object([]) do |child, found|
    next unless child.dir? && child.name =~ ATTACH_RX
    attach = Attachment.new child, helper
    found << attach if attach.valid?
  end
end
428
+
429
# The recipients, built on first use from the directory children of the root
# whose names match RECIP_RX.
#
# @return [Array<Recipient>]
def recipients
  @recipients ||= begin
    recip_dirs = @root.children.select { |child| child.dir? && child.name =~ RECIP_RX }
    recip_dirs.map { |dir| Recipient.new dir, helper }
  end
end
435
+
436
# An attachment of a {Msg}, backed by one of its +__attach_version1.0_*+
# directories. When the directory holds an embedded ole object (a 3701 property
# with the 000d encoding stored as a directory), it is remembered in
# +@embedded_ole+, and additionally wrapped as a nested {Msg} in
# +@embedded_msg+ when it looks like an Outlook message.
class Attachment < Mapi::Attachment
  # Extra behaviour given to the dirent holding an embedded ole object.
  #
  # @private
  module EmbeddedOle
    # @return [String, nil] text captured from the "\001CompObj" stream, or
    #   nil when there is no such stream or the pattern doesn't match
    def compobj
      stream = self["\001CompObj"]
      return nil unless stream
      stream.read[/^.{32}([^\x00]+)/m, 1]
    end

    # @return [String, nil] the {#compobj} value when there is one; otherwise
    #   a guess based on the names of the children
    def embedded_type
      declared = compobj
      return declared if declared
      # try to guess more
      msg_like = children.count { |child| child.name =~ /__(substg|properties|recip|attach|nameid)/ }
      msg_like > 2 ? 'Microsoft Office Outlook Message' : nil
    end
  end

  # @return [Ole::Storage::Dirent]
  # @private
  attr_reader :obj

  # @return [PropertySet]
  attr_reader :properties

  alias props :properties

  # @param obj [Ole::Storage::Dirent]
  # @param helper [Helper]
  def initialize obj, helper
    @obj = obj
    @embedded_ole = nil
    @embedded_msg = nil

    super PropertySet.new(PropertyStore.load(@obj, helper))
    Msg.warn_unknown @obj

    # temp hack. PropertyStore doesn't do directory properties atm - FIXME
    @obj.children.each do |child|
      next unless child.dir?
      match = PropertyStore::SUBSTG_RX.match(child.name)
      next unless match && match[1] == '3701' && match[2].downcase == '000d'

      @embedded_ole = child
      @embedded_ole.extend EmbeddedOle
      if @embedded_ole.embedded_type == 'Microsoft Office Outlook Message'
        @embedded_msg = Msg.new @embedded_ole, helper
      end
    end
  end

  # Something noticed when handling embedded ole object attachments is the
  # particularly strange case where there are empty attachments; those have
  # no raw property keys at all.
  #
  # @return [Boolean]
  def valid?
    !props.raw.keys.empty?
  end
end
490
+
491
+ #
492
+ # `Recipient` serves as a container for the `recip` directories in the `.msg`.
493
+ #
494
+ # It has things like office_location, business_telephone_number, but I don't
495
+ # think enough to make a vCard out of?
496
+ #
497
class Recipient < Mapi::Recipient
  # @return [Ole::Storage::Dirent]
  # @private
  attr_reader :obj

  # @return [PropertySet]
  attr_reader :properties

  alias props :properties

  # Parses the properties of the +recip+ directory +obj+ and reports any of
  # its children that aren't recognised.
  #
  # @param obj [Ole::Storage::Dirent]
  # @param helper [Helper]
  def initialize obj, helper
    @obj = obj
    store = PropertyStore.load(@obj, helper)
    super PropertySet.new(store)
    Msg.warn_unknown @obj
  end
end
514
+ end
515
+ end
516
+