ruby-msg 1.2.17

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,515 @@
1
+
2
+ class Msg
3
+ #
4
+ # = Introduction
5
+ #
6
+ # A big component of +Msg+ files is the property store, which holds
7
+ # all the key/value pairs of properties. The message itself, and all
8
+ # its <tt>Attachment</tt>s and <tt>Recipient</tt>s have an instance of
9
+ # this class.
10
+ #
11
+ # = Storage model
12
+ #
13
+ # Property keys (tags?) can be either simple hex numbers, in the
14
+ # range 0x0000 - 0xffff, or they can be named properties. In fact,
15
+ # properties in the range 0x0000 to 0x7fff are supposed to be the non-
16
+ # named properties, and can be considered to be in the +PS_MAPI+
17
+ # namespace. (correct?)
18
+ #
19
+ # Named properties are serialized in the 0x8000 to 0xffff range,
20
+ # and are referenced as a guid and long/string pair.
21
+ #
22
+ # There are key ranges, which can be used to imply things generally
23
+ # about keys.
24
+ #
25
+ # Further, we can give symbolic names to most keys, coming from
26
+ # constants in various places. Eg:
27
+ #
28
+ # 0x0037 => subject
29
+ # {00062002-0000-0000-C000-000000000046}/0x8218 => response_status
30
+ # # displayed as categories in outlook
31
+ # {00020329-0000-0000-C000-000000000046}/"Keywords" => categories
32
+ #
33
+ # Further, there are completely different names, coming from other
34
+ # object models that get mapped to these things (CDO's model,
35
+ # Outlook's model etc). Eg "urn:schemas:httpmail:subject"
36
+ # I think these can be ignored though, as they aren't defined clearly
37
+ # in terms of mapi properties, and i'm really just trying to make
38
+ # a mapi property store. (It should also be relatively easy to
39
+ # support them later.)
40
+ #
41
+ # = Usage
42
+ #
43
+ # The api is driven by a desire to have the simple stuff "just work", ie
44
+ #
45
+ # properties.subject
46
+ # properties.display_name
47
+ #
48
+ # There also needs to be a way to look up properties more specifically:
49
+ #
50
+ # properties[0x0037] # => gets the subject
51
+ # properties[PS_MAPI, 0x0037] # => still gets the subject
52
+ # properties[PS_PUBLIC_STRINGS, 'Keywords'] # => gets the above categories
53
+ #
54
+ # The abbreviated versions work by "resolving" the symbols to full keys:
55
+ #
56
+ # properties.resolve :keywords # => [PS_PUBLIC_STRINGS, 'Keywords']
57
+ # properties.resolve :subject # => [PS_MAPI, 0x0037]
58
+ #
59
+ # = Parsing
60
+ #
61
+ # There are three objects that need to be parsed to load a +Msg+ property store:
62
+ #
63
+ # 1. The +nameid+ directory (<tt>Properties.parse_nameid</tt>)
64
+ # 2. The many +substg+ objects, whose names should match <tt>Properties::SUBSTG_RX</tt>
65
+ # (<tt>Properties#parse_substg</tt>)
66
+ # 3. The +properties+ file (<tt>Properties#parse_properties</tt>)
67
+ #
68
+ # Understanding of the formats is by no means perfect
69
+ #
70
+ # = TODO
71
+ #
72
+ # * Test cases.
73
+ # * While the key objects are sufficient, the value objects are just plain
74
+ # ruby types. It currently isn't possible to write to the values, or to know
75
+ # which encoding the value had.
76
+ # * Consider other MAPI property stores, such as tnef/pst. Similar model?
77
+ # Generalise this one?
78
+ # * Have added IO support to Ole::Storage. now need to fix Properties. can't use
79
+ # current greedy-loading approach. still want strings to work nicely:
80
+ # props.subject
81
+ # but don't want to be loading up large binary blobs, typically attachments, eg
82
+ # props.attach_data.
83
+ # probably the easiest solution is that the binary "encoding", be to return an io
84
+ # object instead. and you must read it if you want it as a string
85
+ # maybe i can avoid the greedy model anyway? rather than parsing the properties completely,
86
+ # have it be load based? you request subject, that translates into, please load the right
87
+ # substg, et voila. maybe redo @raw as a lazy loading hash for substg objects, but do the
88
+ # others straight away. maybe just parse keys so i know what i've got??
89
+ class Properties
90
+ # Directory two levels above the one containing this file; the yaml data files are opened relative to it. Duplicated here for now.
91
+ SUPPORT_DIR = File.dirname(__FILE__) + '/../..'
92
+
93
# Decoders for the contents of a +substg+ object, keyed by the encoding code
# taken from the object's name (see <tt>Properties#parse_substg</tt>). Each
# proc receives the +Dirent+ and returns the ruby value to store.
#
# Note that binary and :default both use obj.open, not the block form. This
# means we should #close it later, which we don't. As we're only reading it
# shouldn't matter much, but it is not really good. FIXME
ENCODINGS = {
  # seems to be used when the property is going to be a directory instead of
  # a file, eg nested ole (usually 3701), so the dirent is passed through.
  0x000d => proc { |obj| obj },
  # unicode
  0x001f => proc { |obj| Ole::Types::FROM_UTF16.iconv obj.read },
  # ascii. Only a single trailing NUL terminator is removed. The previous
  # check compared a[-1] with the integer 0, which can never match once
  # String#[] returns a one character string, and where it did match it
  # dropped the last real character along with the terminator.
  0x001e => proc { |obj| obj.read.chomp "\0" },
  # binary? returned as whatever obj.open gives back rather than a string
  0x0102 => proc { |obj| obj.open },
  # fallback used by parse_substg for encodings not listed above
  :default => proc { |obj| obj.open }
}
105
+
106
+ # Property set guids, used as the +guid+ half of a Key. These won't be strings for much longer.
107
+ # maybe later, the Key#inspect could automatically show symbolic guid names if they
108
+ # are part of this builtin list.
109
+ # FIXME. PS_MAPI is a placeholder, not a real guid (nice that the fake string is the same length though). NOTE(review): confirm the real PS_MAPI guid before replacing it.
110
+ PS_MAPI = '{not-really-sure-what-this-should-say}'
111
+ PS_PUBLIC_STRINGS = '{00020329-0000-0000-c000-000000000046}'
112
+ # string properties in this namespace automatically get added to the internet headers
113
+ PS_INTERNET_HEADERS = '{00020386-0000-0000-c000-000000000046}'
114
+ # there are a bunch of outlook ones i think
115
+ # http://blogs.msdn.com/stephen_griffin/archive/2006/05/10/outlook-2007-beta-documentation-notification-based-indexing-support.aspx
116
+ # IPM.Appointment
117
+ PSETID_Appointment = '{00062002-0000-0000-c000-000000000046}'
118
+ # IPM.Task
119
+ PSETID_Task = '{00062003-0000-0000-c000-000000000046}'
120
+ # used for IPM.Contact
121
+ PSETID_Address = '{00062004-0000-0000-c000-000000000046}'
122
+ PSETID_Common = '{00062008-0000-0000-c000-000000000046}'
123
+ # didn't find a source for this name. it is for IPM.StickyNote
124
+ PSETID_Note = '{0006200e-0000-0000-c000-000000000046}'
125
+ # for IPM.Activity. also called the journal?
126
+ PSETID_Log = '{0006200a-0000-0000-c000-000000000046}'
127
+
128
+ SUBSTG_RX = /__substg1\.0_([0-9A-F]{4})([0-9A-F]{4})(?:-([0-9A-F]{8}))?/ # captures, all hex: property key, encoding, optional multivalue offset
129
+
130
+ # access the underlying raw property hash (Key => value, filled in by add_property)
131
+ attr_reader :raw
132
+ # unused (non-property) objects left over after parsing a +Dirent+.
133
+ attr_reader :unused
134
+ attr_reader :nameid # result of Properties.parse_nameid for the loaded Dirent, or nil when it had no nameid object
135
+
136
# Create an empty property store. Nothing is parsed until #load is called.
def initialize
  @raw    = {}
  @unused = []
  # FIXME: false (rather than nil) marks the body variants as "not computed
  # yet", so that a computed nil can still be cached by the body accessors.
  @body = @body_html = @body_rtf = false
end
142
+
143
+ #--
144
+ # The parsing methods
145
+ #++
146
+
147
# Convenience constructor: build a new +Properties+, load it from the
# +Dirent+ obj, and return the populated store.
def self.load(obj)
  store = Properties.new
  store.load(obj)
  store
end
152
+
153
# Parse properties from the +Dirent+ obj.
#
# The '__nameid_version1.0' child, if there is one, is handled first, as it
# provides the map needed for later user defined properties. Every remaining
# child that is a file is then dispatched on its name: the properties file goes
# to #parse_properties, and names matching SUBSTG_RX go to #parse_substg with
# the hex captures converted to integers (nil for a missing offset). Any other
# file name raises. Children that are not files end up in #unused.
#
# i think dirs that match the substg should be decoded as properties too.
# 0x000d is just another encoding, the dir encoding. it should match whether
# the object is file / dir. currently only example is embedded msgs anyway.
def load(obj)
  entries = obj.children.dup
  nameid_entry = entries.find { |entry| entry.name == '__nameid_version1.0' }
  @nameid =
    if nameid_entry
      entries.delete nameid_entry
      Properties.parse_nameid nameid_entry
    end

  entries.each do |entry|
    unless entry.file?
      @unused << entry
      next
    end
    # errors are deliberately not rescued here (that used to log a warning
    # and push the entry onto @unused instead)
    case entry.name
    when /__properties_version1\.0/
      parse_properties entry
    when SUBSTG_RX
      numbers = Regexp.last_match.captures.map { |num| num && num.hex }
      parse_substg(*(numbers + [entry]))
    else
      raise "bad name for mapi property #{entry.name.inspect}"
    end
  end
end
182
+
183
# Read nameid from the +Dirent+ obj, which is used for mapping of named
# properties keys to proxy keys in the 0x8000 - 0xffff range.
#
# Three children of obj are consumed: the guid stream
# (__substg1.0_00020102), the entry stream (__substg1.0_00030102) and the
# string stream (__substg1.0_00040102). Anything else is only counted in a
# warning. All three are read as little endian data.
#
# Returns a hash of integer -> Key.
def self.parse_nameid(obj)
  remaining = obj.children.dup
  guids_obj, props_obj, names_obj =
    %w[__substg1.0_00020102 __substg1.0_00030102 __substg1.0_00040102].map do |name|
      remaining.delete obj[name]
    end

  # parse guids
  # these are the guids for named properties (other than builtin ones).
  # PS_PUBLIC_STRINGS is put in front so that guid index 2 finds it and
  # index 3 finds the first guid from the stream. PS_MAPI (index 1) is
  # handled separately below.
  guids = [PS_PUBLIC_STRINGS] +
    guids_obj.read.scan(/.{16}/m).map { |str| Ole::Types.load_guid str }

  # the string ids for named properties are not parsed up front, as they're
  # referred to by offset not index. they are simply sequentially packed, as
  # a long giving the string length, then the string, padded to a 4 byte
  # multiple, and repeat.
  names_data = names_obj.read

  # parse actual props. each entry is 8 bytes: a long (numeric id, or
  # offset into the string stream), then two shorts (flags, property index).
  named_props = {}
  props_obj.read.scan(/.{8}/m).each do |str|
    id, flags, offset = str.unpack 'Vvv'
    # the property will be serialised as this pseudo property, mapping it to
    # this named property
    pseudo_prop = 0x8000 + offset
    prop =
      if flags & 1 == 1
        # string named: id is the offset of the length-prefixed utf16 name
        len = names_data[id, 4].unpack('V').first
        Ole::Types::FROM_UTF16.iconv names_data[id + 4, len]
      else
        # numerically named: only the low word is used as the code
        Log.debug "b not 0" if id >> 16 != 0
        id & 0xffff
      end
    guid_off = flags >> 1
    guid =
      if guid_off == 1
        # index 1 is the builtin PS_MAPI set, which is not in the list
        PS_MAPI
      else
        # index 0 is not understood. it is logged and looked up as before.
        Log.debug "guid off < 2 (#{guid_off})" if guid_off < 2
        guids[guid_off - 2]
      end
    named_props[pseudo_prop] = Key.new(prop, guid)
  end

  Log.warn "* ignoring #{remaining.length} objects in nameid" unless remaining.empty?
  # this leaves a bunch of other unknown chunks of data with completely unknown meaning.
  named_props
end
237
+
238
# Parse a +Dirent+, as per <tt>msgconvert.pl</tt>. This is how larger
# properties, such as strings, binary blobs, and other ole sub-directories
# (eg nested Msg) are stored.
#
# +key+, +encoding+ and +offset+ are the numbers taken from the object's name.
# +offset+ is nil when the name carried none. The decoded value is handed to
# #add_property, whose result is returned. Multivalue header objects are
# skipped and give nil.
def parse_substg(key, encoding, offset, obj)
  multivalue = (encoding & 0x1000) != 0
  if multivalue
    # there is typically one with no offset first, whose data is a series of
    # numbers equal to the lengths of all the sub parts. gives an implied
    # array size i suppose, the same as the ole object sizes anyway. that one
    # is ignored.
    return unless offset
    # remove multivalue flag for individual pieces
    encoding &= ~0x1000
  else
    # offset is only meaningful for multivalue encodings
    Log.warn "offset specified for non-multivalue encoding #{obj.name}" if offset
    offset = nil
  end

  encoder = ENCODINGS[encoding]
  if encoder.nil?
    Log.warn "unknown encoding #{encoding}"
    encoder = ENCODINGS[:default]
  end
  add_property key, encoder[obj], offset
end
266
+
267
# For parsing the +properties+ file. Smaller properties are serialized in one
# chunk, such as longs, bools, times etc. The parsing has problems.
#
# The data is walked in 16 byte records after skipping a leading pad of
# <tt>length % 16</tt> bytes. A warning is logged unless that pad is 0 or 8
# bytes of zeros. The first long of a record holds the property in its high
# word and the encoding in its low word. Longs, booleans and systimes are
# passed to #add_property. Other encodings are ignored or logged.
def parse_properties(obj)
  raw = obj.read
  # don't really understand this that well...
  pad = raw.length % 16
  padding_ok = (pad == 0 || pad == 8) && raw[0...pad] == "\000" * pad
  unless padding_ok
    Log.warn "padding was not as expected #{pad} (#{raw.length}) -> #{raw[0...pad].inspect}"
  end

  raw[pad..-1].scan(/.{16}/m).each do |record|
    property, encoding = ('%08x' % record.unpack('L')).scan(/.{4}/)
    # doesn't make any sense to me. probably because its a serialization of
    # some internal outlook structure...
    next if property == '0000'
    key = property.hex
    case encoding
    when '0102', '001e', '001f', '101e', '101f', '000d'
      # ignored on purpose (multivalue versions too). not sure what the
      # record is for with these encodings
    when '0003' # long
      # don't know what all the other data is for
      add_property key, *record[8, 4].unpack('L')
    when '000b' # boolean
      # again, heaps more data than needed. and its not always 0 or 1, they
      # are in fact quite big numbers. this is wrong.
      add_property key, record[8, 4].unpack('L')[0] != 0
    when '0040' # systime
      # seems to work:
      add_property key, Ole::Types.load_time(record[8..-1])
    else
      Log.warn "ignoring data in __properties section, encoding: #{encoding}"
      Log << record.unpack('H*').inspect + "\n"
    end
  end
end
303
+
304
# Store +value+ under +key+ in the raw property hash.
#
# Integer keys from 0x8000 up are mapped through the nameid table. When there
# is no table, or no entry in it, a warning is logged and the number is wrapped
# in a plain Key like any other key. With +pos+ the value is stored at that
# index of an array kept under the key (multivalue properties). Without it the
# value replaces whatever was stored before. Returns +value+.
def add_property(key, value, pos=nil)
  mapped = nil
  if Integer === key && key >= 0x8000
    if @nameid
      mapped = @nameid[key]
      Log.warn "property in named range not in nameid #{key.inspect}" unless mapped
    else
      Log.warn "no nameid section yet named properties used"
    end
  end
  key = mapped || Key.new(key)

  if pos
    slot = (@raw[key] ||= [])
    # this is actually a trickier problem. the issue is more that the pieces
    # must all be of the same type.
    Log.warn "duplicate property" unless Array === slot
    slot[pos] = value
  else
    # take the last.
    Log.warn "duplicate property #{key.inspect}" if @raw[key]
    @raw[key] = value
  end
end
331
+
332
# Resolve an arg (could be key, code, string, or symbol), and possibly a guid,
# to a Key. With a guid the pair is simply wrapped. Otherwise a Key is returned
# as is, an integer is wrapped, and anything else is looked up by symbol in
# #sym_to_key. Raises when nothing could be found.
def resolve(arg, guid=nil)
  key =
    if guid then Key.new(arg, guid)
    elsif Key === arg then arg
    elsif Integer === arg then Key.new(arg)
    else sym_to_key[arg.to_sym]
    end
  key or raise "unable to resolve key from #{[arg, guid].inspect}"
end
343
+
344
+ # Class-wide hash kept just so an easy unique list of keys missing a symbolic name could be built. Nothing in the visible code reads or writes it any more.
345
+ @@quiet_property = {}
346
+
347
# Map from symbolic name to Key for every key in the raw property hash. It is
# built on first use and cached. A key without a symbolic name is logged and
# filed under whatever its #to_sym returned. When two keys share a name a
# warning is logged and the first one wins, unless the later key is in
# PS_MAPI, which is given preference.
def sym_to_key
  return @sym_to_key if @sym_to_key

  @sym_to_key = {}
  @raw.each_key do |key|
    sym = key.to_sym
    # used to use @@quiet_property to only report each of these once
    Log.info "couldn't find symbolic name for key #{key.inspect}" unless Symbol === sym
    if !@sym_to_key[sym]
      @sym_to_key[sym] = key
    else
      Log.warn "duplicate key #{key.inspect}"
      @sym_to_key[sym] = key if key.guid == PS_MAPI
    end
  end
  @sym_to_key
end
367
+
368
+ # accessors
369
+
370
# Look up the value stored for +arg+ (and optional +guid+), which are turned
# into a Key by #resolve. Gives nil when nothing is stored, and also when the
# lookup fails for any reason, such as a key that can't be resolved.
def [](arg, guid=nil)
  begin
    @raw[resolve(arg, guid)]
  rescue StandardError
    nil
  end
end
373
+
374
+ #--
375
+ # for completeness, but it's a moot point until i can write to the ole
376
+ # objects.
377
+ #def []= arg, guid=nil, value
378
+ # @raw[resolve(arg, guid)] = value
379
+ #end
380
+ #++
381
+
382
# Lets properties be read by symbolic name, eg <tt>properties.subject</tt>.
# Any call without arguments whose name doesn't end in "=" is turned into
# <tt>self[name]</tt>. A single argument call ending in "=" is forwarded to
# <tt>self[name] = value</tt>. Everything else goes to super.
def method_missing(name, *args)
  text = name.to_s
  return self[name] if text !~ /\=$/ && args.empty?

  if args.length == 1 && text =~ /(.*)\=$/
    self[$1] = args[0]
  else
    super
  end
end
391
+
392
# Hash of symbolic name => value, covering only the keys of #sym_to_key that
# really are symbols. Each value is read with <tt>self[key]</tt>.
def to_h
  sym_to_key.inject({}) do |hash, (sym, key)|
    hash[sym] = self[key] if Symbol === sym
    hash
  end
end
397
+
398
+ def inspect
399
+ '#<Properties ' + to_h.map do |k, v|
400
+ v = v.inspect
401
+ "#{k}=#{v.length > 32 ? v[0..29] + '..."' : v}"
402
+ end.join(' ') + '>'
403
+ end
404
+
405
+ # -----
406
+
407
+ # temporary pseudo tags
408
+
409
# Plain text body, providing rtf to plain text conversion (later, html to
# text too). The :body property is used when it has anything but whitespace
# in it. Otherwise #body_rtf is converted with ::RTF::Converter.rtf2text. An
# error in either step counts as no result. The answer, nil included, is
# cached.
def body
  return @body unless @body == false

  @body = begin
    self[:body]
  rescue StandardError
    nil
  end
  unless @body && !@body.strip.empty?
    @body = begin
      ::RTF::Converter.rtf2text body_rtf
    rescue StandardError
      nil
    end
  end
  @body
end
416
+
417
# Rtf body, providing rtf decompression: whatever is read from the
# rtf_compressed property is run through RTF.rtfdecompr. Any error gives nil.
# The answer, nil included, is cached.
def body_rtf
  return @body_rtf unless @body_rtf == false

  @body_rtf = begin
    RTF.rtfdecompr rtf_compressed.read
  rescue StandardError
    nil
  end
end
422
+
423
# Html body, providing rtf to html conversion. Three sources are tried in
# turn, moving on while the result is missing or only whitespace: the
# :body_html property (read as an io), Msg::RTF.rtf2html on #body_rtf, and
# as a last resort ::RTF::Converter.rtf2text in :html mode. An error in any
# source counts as no result. The answer, nil included, is cached.
def body_html
  return @body_html unless @body_html == false

  blank = proc { !@body_html || @body_html.strip.empty? }
  @body_html = begin
    self[:body_html].read
  rescue StandardError
    nil
  end
  if blank.call
    @body_html = begin
      Msg::RTF.rtf2html body_rtf
    rescue StandardError
      nil
    end
  end
  # last resort
  if blank.call
    @body_html = begin
      ::RTF::Converter.rtf2text body_rtf, :html
    rescue StandardError
      nil
    end
  end
  @body_html
end
432
+
433
# +Properties+ are accessed by <tt>Key</tt>s, which are coerced to this class.
# Includes a bunch of methods (hash, ==, eql?) to allow it to work as a key in
# a +Hash+.
#
# A key is a +code+ (an integer, or a string for string named properties)
# together with the +guid+ of its property set, PS_MAPI unless given.
#
# Also contains the code that maps keys to symbolic names.
class Key
  attr_reader :code, :guid

  def initialize(code, guid=PS_MAPI)
    @code, @guid = code, guid
  end

  # Symbolic name of the key. Integer codes are looked up in MAPITAGS (for
  # PS_MAPI) or NAMED_MAP (any other guid), and give back the plain integer
  # when that fails in any way. String codes are simply downcased. Any other
  # kind of code gives nil.
  def to_sym
    # hmmm, for some stuff, like, eg, the message class specific range,
    # sym-ification of the key depends on knowing our message class. i don't
    # want to store anything else here though, so if that kind of thing is
    # needed, it can be passed to this function. worry about that when some
    # examples arise.
    case code
    when Integer
      if guid == PS_MAPI # and < 0x8000 ?
        # the hash should be updated now that i've changed the process
        MAPITAGS['%04x' % code].first[/_(.*)/, 1].downcase.to_sym rescue code
      else
        # handle other guids here, like mapping names to outlook properties,
        # based on the outlook object model.
        NAMED_MAP[self].to_sym rescue code
      end
    when String
      # note that named properties don't go through the map at the moment.
      # so #categories doesn't work yet
      code.downcase.to_sym
    end
  end

  def to_s
    to_sym.to_s
  end

  # FIXME implement these. currently always gives nil.
  def transmittable?
    # etc, can go here too
  end

  # this stuff is to allow it to be a useful key
  def hash
    [code, guid].hash
  end

  # Two keys are equal when both their code and guid are. Comparing the
  # fields, rather than just the #hash values as was done before, means that
  # a hash collision, or a plain <tt>[code, guid]</tt> array, is no longer
  # taken for an equal key.
  def ==(other)
    Key === other && code == other.code && guid == other.guid
  end

  alias eql? ==

  # PS_MAPI keys are shown as a plain hex number. Everything else shows the
  # guid along with the code.
  def inspect
    if Integer === code
      hex = '0x%04x' % code
      if guid == PS_MAPI
        # just display as plain hex number
        hex
      else
        "#<Key #{guid}/#{hex}>"
      end
    else
      # display full guid and code
      "#<Key #{guid}/#{code.inspect}>"
    end
  end
end
502
+
503
+ #--
504
+ # YUCK moved here because we need Key
505
+ #++
506
+
507
# Data files that provide the code to symbolic name mapping. Both are read
# once, when the class is loaded.
MAPITAGS = open("#{SUPPORT_DIR}/data/mapitags.yaml") { |file| YAML.load file }
# Each named_map entry has a key whose two parts are a property code and the
# name of one of the guid constants above, so the guids in named_map are
# really constant references. Each key is turned into a Key here.
named_map_data = open("#{SUPPORT_DIR}/data/named_map.yaml") { |file| YAML.load file }
NAMED_MAP = Hash[*named_map_data.map { |key, value|
  [Key.new(key[0], const_get(key[1])), value]
}.flatten]
513
+ end
514
+ end
515
+