ruby-msg 1.2.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,515 @@
1
+
2
+ class Msg
3
+ #
4
+ # = Introduction
5
+ #
6
+ # A big component of +Msg+ files is the property store, which holds
7
+ # all the key/value pairs of properties. The message itself, and all
8
+ # its <tt>Attachment</tt>s and <tt>Recipient</tt>s have an instance of
9
+ # this class.
10
+ #
11
+ # = Storage model
12
+ #
13
+ # Property keys (tags?) can be either simple hex numbers, in the
14
+ # range 0x0000 - 0xffff, or they can be named properties. In fact,
15
+ # properties in the range 0x0000 to 0x7fff are supposed to be the non-
16
+ # named properties, and can be considered to be in the +PS_MAPI+
17
+ # namespace. (correct?)
18
+ #
19
+ # Named properties are serialized in the 0x8000 to 0xffff range,
20
+ # and are referenced as a guid and long/string pair.
21
+ #
22
+ # There are key ranges, which can be used to imply things generally
23
+ # about keys.
24
+ #
25
+ # Further, we can give symbolic names to most keys, coming from
26
+ # constants in various places. Eg:
27
+ #
28
+ # 0x0037 => subject
29
+ # {00062002-0000-0000-C000-000000000046}/0x8218 => response_status
30
+ # # displayed as categories in outlook
31
+ # {00020329-0000-0000-C000-000000000046}/"Keywords" => categories
32
+ #
33
+ # Further, there are completely different names, coming from other
34
+ # object models that get mapped to these things (CDO's model,
35
+ # Outlook's model etc). Eg "urn:schemas:httpmail:subject"
36
+ # I think these can be ignored though, as they aren't defined clearly
37
+ # in terms of mapi properties, and i'm really just trying to make
38
+ # a mapi property store. (It should also be relatively easy to
39
+ # support them later.)
40
+ #
41
+ # = Usage
42
+ #
43
+ # The api is driven by a desire to have the simple stuff "just work", ie
44
+ #
45
+ # properties.subject
46
+ # properties.display_name
47
+ #
48
+ # There also needs to be a way to look up properties more specifically:
49
+ #
50
+ # properties[0x0037] # => gets the subject
51
+ # properties[PS_MAPI, 0x0037] # => still gets the subject
52
+ # properties[PS_PUBLIC_STRINGS, 'Keywords'] # => gets the above categories
53
+ #
54
+ # The abbreviated versions work by "resolving" the symbols to full keys:
55
+ #
56
+ # properties.resolve :keywords # => [PS_PUBLIC_STRINGS, 'Keywords']
57
+ # properties.resolve :subject # => [PS_MAPI, 0x0037]
58
+ #
59
+ # = Parsing
60
+ #
61
+ # There are three objects that need to be parsed to load a +Msg+ property store:
62
+ #
63
+ # 1. The +nameid+ directory (<tt>Properties.parse_nameid</tt>)
64
+ # 2. The many +substg+ objects, whose names should match <tt>Properties::SUBSTG_RX</tt>
65
+ # (<tt>Properties#parse_substg</tt>)
66
+ # 3. The +properties+ file (<tt>Properties#parse_properties</tt>)
67
+ #
68
+ # Understanding of the formats is by no means perfect
69
+ #
70
+ # = TODO
71
+ #
72
+ # * Test cases.
73
+ # * While the key objects are sufficient, the value objects are just plain
74
+ # ruby types. It currently isn't possible to write to the values, or to know
75
+ # which encoding the value had.
76
+ # * Consider other MAPI property stores, such as tnef/pst. Similar model?
77
+ # Generalise this one?
78
+ # * Have added IO support to Ole::Storage. now need to fix Properties. can't use
79
+ # current greedy-loading approach. still want strings to work nicely:
80
+ # props.subject
81
+ # but don't want to be loading up large binary blobs, typically attachments, eg
82
+ # props.attach_data.
83
+ # probably the easiest solution is that the binary "encoding", be to return an io
84
+ # object instead. and you must read it if you want it as a string
85
+ # maybe i can avoid the greedy model anyway? rather than parsing the properties completely,
86
+ # have it be load based? you request subject, that translates into, please load the right
87
+ # substg, et voila. maybe redo @raw as a lazy loading hash for substg objects, but do the
88
+ # others straight away. maybe just parse keys so i know what i've got??
89
+ class Properties
90
+ # duplicated here for now
91
+ SUPPORT_DIR = File.dirname(__FILE__) + '/../..'
92
+
93
# Decoders for values stored in +substg+ objects, keyed by the property type code
# taken from the object name. Each proc receives the ole object and returns the
# decoded ruby value. The <tt>:default</tt> entry is the fallback that +parse_substg+
# uses for type codes it does not recognise.
#
# note that binary and default both use obj.open. not the block form. this means we should
# #close it later, which we don't. as we're only reading though, it shouldn't matter right?
# not really good though FIXME
ENCODINGS = {
  # seems to be used when its going to be a directory instead of a file. eg nested ole.
  # 3701 usually. in which case we shouldn't get here right?
  0x000d => proc { |obj| obj },
  # unicode
  0x001f => proc { |obj| Ole::Types::FROM_UTF16.iconv obj.read },
  # ascii. strip exactly one trailing NUL terminator when there is one. the old test
  # (a[-1] == 0) only works where String#[] returns an integer, and a[0...-2] also cut
  # off the last real character, which is the "chopped the text" problem seen before.
  0x001e => proc { |obj| obj.read.chomp 0.chr },
  # binary?
  0x0102 => proc { |obj| obj.open },
  :default => proc { |obj| obj.open }
}
105
+
106
+ # these won't be strings for much longer.
107
+ # maybe later, the Key#inspect could automatically show symbolic guid names if they
108
+ # are part of this builtin list.
109
+ # FIXME. hey, nice that my fake string is the same length though :)
110
+ PS_MAPI = '{not-really-sure-what-this-should-say}'
111
+ PS_PUBLIC_STRINGS = '{00020329-0000-0000-c000-000000000046}'
112
+ # string properties in this namespace automatically get added to the internet headers
113
+ PS_INTERNET_HEADERS = '{00020386-0000-0000-c000-000000000046}'
114
+ # there are a bunch of outlook ones i think
115
+ # http://blogs.msdn.com/stephen_griffin/archive/2006/05/10/outlook-2007-beta-documentation-notification-based-indexing-support.aspx
116
+ # IPM.Appointment
117
+ PSETID_Appointment = '{00062002-0000-0000-c000-000000000046}'
118
+ # IPM.Task
119
+ PSETID_Task = '{00062003-0000-0000-c000-000000000046}'
120
+ # used for IPM.Contact
121
+ PSETID_Address = '{00062004-0000-0000-c000-000000000046}'
122
+ PSETID_Common = '{00062008-0000-0000-c000-000000000046}'
123
+ # didn't find a source for this name. it is for IPM.StickyNote
124
+ PSETID_Note = '{0006200e-0000-0000-c000-000000000046}'
125
+ # for IPM.Activity. also called the journal?
126
+ PSETID_Log = '{0006200a-0000-0000-c000-000000000046}'
127
+
128
+ SUBSTG_RX = /__substg1\.0_([0-9A-F]{4})([0-9A-F]{4})(?:-([0-9A-F]{8}))?/
129
+
130
+ # access the underlying raw property hash
131
+ attr_reader :raw
132
+ # unused (non-property) objects after parsing an +Dirent+.
133
+ attr_reader :unused
134
+ attr_reader :nameid
135
+
136
# Set up an empty property store.
def initialize
  # FIXME
  # the cached bodies start out as false, which #body, #body_rtf and #body_html
  # read as "not worked out yet" (nil is a legitimate cached result)
  @body = false
  @body_html = false
  @body_rtf = false
  # objects that turned out not to be properties
  @unused = []
  # key => value storage
  @raw = {}
end
142
+
143
+ #--
144
+ # The parsing methods
145
+ #++
146
+
147
# Convenience constructor: build a new +Properties+ and fill it from the
# +Dirent+ obj. Returns the populated instance.
def self.load obj
  properties = Properties.new
  properties.load obj
  properties
end
152
+
153
+ # Parse properties from the +Dirent+ obj
154
# Fill this store from the children of the +Dirent+ obj.
#
# The +__nameid_version1.0+ child (if any) is parsed first, as it provides the map
# for the user defined properties met later. Every remaining file child must be
# either the +__properties_version1.0+ stream or match +SUBSTG_RX+, anything else
# raises. Non-file children are collected in +unused+.
def load obj
  entries = obj.children.dup
  nameid_obj = entries.find { |entry| entry.name == '__nameid_version1.0' }
  entries.delete nameid_obj if nameid_obj
  @nameid = nameid_obj && Properties.parse_nameid(nameid_obj)

  # i think dirs that match the substg should be decoded as properties too. 0x000d is
  # just another encoding, the dir encoding. it should match whether the object is
  # file / dir. currently only example is embedded msgs anyway
  entries.each do |entry|
    unless entry.file?
      @unused << entry
      next
    end

    name = entry.name
    if /__properties_version1\.0/ =~ name
      parse_properties entry
    elsif match = SUBSTG_RX.match(name)
      # key, encoding and the optional multivalue offset, all as hex. an absent
      # offset group stays nil
      key, encoding, offset = match.captures.map { |hex| hex && hex.hex }
      parse_substg key, encoding, offset, entry
    else
      raise "bad name for mapi property #{entry.name.inspect}"
    end
  end
end
182
+
183
+ # Read nameid from the +Dirent+ obj, which is used for mapping of named properties keys to
184
+ # proxy keys in the 0x8000 - 0xffff range.
185
+ # Returns a hash of integer -> Key.
186
# Read nameid from the +Dirent+ obj, which is used for mapping of named properties
# keys to proxy keys in the 0x8000 - 0xffff range.
#
# Three streams are read from obj: the guids (+__substg1.0_00020102+), the property
# entries (+__substg1.0_00030102+) and the string names (+__substg1.0_00040102+).
# Any other child is only counted in a warning.
#
# Returns a hash of integer -> Key.
def self.parse_nameid obj
  remaining = obj.children.dup
  guids_obj, props_obj, names_obj =
    %w[__substg1.0_00020102 __substg1.0_00030102 __substg1.0_00040102].map do |name|
      remaining.delete obj[name]
    end

  # parse guids
  # this is the guids for named properties (other than builtin ones)
  # i think PS_PUBLIC_STRINGS, and PS_MAPI are builtin.
  stream_guids = guids_obj.read.scan(/.{16}/m).map { |str| Ole::Types.load_guid str }
  guids = [PS_PUBLIC_STRINGS] + stream_guids

  # parse names.
  # the string ids for named properties
  # they are no longer parsed, as they're referred to by offset not
  # index. they are simply sequentially packed, as a long, giving
  # the string length, then padding to 4 byte multiple, and repeat.
  names_data = names_obj.read

  # parse actual props.
  # not sure about any of this stuff really.
  # should flip a few bits in the real msg, to get a better understanding of how this works.
  nameid = {}
  props_obj.read.scan(/.{8}/m).each do |entry|
    # each entry is a 4 byte id (or offset into the names stream), 2 bytes of flags and
    # a 2 byte index. the explicit little-endian directives keep this independent of
    # the host byte order, and unpacking into separate locals avoids relying on
    # `x = *array` collapsing to a scalar, which it no longer does on ruby >= 1.9.
    id_or_offset, flags, index = entry.unpack 'Vvv'
    # the property will be serialised as this pseudo property, mapping it to this named property
    pseudo_prop = 0x8000 + index
    named = (flags & 1) == 1
    prop = if named
      len = names_data[id_or_offset, 4].unpack('V').first
      Ole::Types::FROM_UTF16.iconv names_data[id_or_offset + 4, len]
    else
      # numeric id lives in the low word, the high word is expected to be zero
      Log.debug "b not 0" if (id_or_offset >> 16) != 0
      id_or_offset & 0xffff
    end
    # a bit sus
    guid_off = flags >> 1
    # missing a few builtin PS_*
    # NOTE(review): for guid_off < 2 the negative index wraps around to the end of the
    # guids list. left as is - confirm the intended builtin mapping before changing.
    Log.debug "guid off < 2 (#{guid_off})" if guid_off < 2
    guid = guids[guid_off - 2]
    nameid[pseudo_prop] = Key.new(prop, guid)
  end

  Log.warn "* ignoring #{remaining.length} objects in nameid" unless remaining.empty?
  # this leaves a bunch of other unknown chunks of data with completely unknown meaning.
  # pp [:unknown, child.name, child.data.unpack('H*')[0].scan(/.{16}/m)]
  nameid
end
237
+
238
+ # Parse an +Dirent+, as per <tt>msgconvert.pl</tt>. This is how larger properties, such
239
+ # as strings, binary blobs, and other ole sub-directories (eg nested Msg) are stored.
240
# Parse an +Dirent+, as per <tt>msgconvert.pl</tt>. This is how larger properties, such
# as strings, binary blobs, and other ole sub-directories (eg nested Msg) are stored.
#
# +key+ and +encoding+ are the integers taken from the object name, +offset+ is the
# optional multivalue index (nil when the name had none), and +obj+ is the object
# itself. The decoded value is handed to +add_property+.
def parse_substg key, encoding, offset, obj
  if (encoding & 0x1000).zero?
    # offset is only meaningful for multivalue encodings
    Log.warn "offset specified for non-multivalue encoding #{obj.name}" if offset
    offset = nil
  elsif offset
    # remove multivalue flag for individual pieces
    encoding &= ~0x1000
  else
    # there is typically one with no offset first, whose data is a series of numbers
    # equal to the lengths of all the sub parts. gives an implied array size i suppose.
    # the sizes are the same as all the ole object sizes anyway. ignore this one
    #p obj.data.unpack('L*')
    return
  end

  encoder = ENCODINGS[encoding]
  unless encoder
    Log.warn "unknown encoding #{encoding}"
    #encoder = proc { |obj| obj.io } #.read }. maybe not a good idea
    encoder = ENCODINGS[:default]
  end
  add_property key, encoder.call(obj), offset
end
266
+
267
+ # For parsing the +properties+ file. Smaller properties are serialized in one chunk,
268
+ # such as longs, bools, times etc. The parsing has problems.
269
# For parsing the +properties+ file. Smaller properties are serialized in one chunk,
# such as longs, bools, times etc. The parsing has problems.
#
# The stream is read as 16 byte records, after a leading pad of 0 or 8 NUL bytes (any
# other padding is only warned about). The first 4 bytes of a record hold the property
# id in the high word and the type code in the low word, and the value is read from
# byte 8 onwards. Longs, booleans and systimes are passed to +add_property+. The types
# that live in +substg+ objects are skipped on purpose, and any other type is logged
# and ignored.
def parse_properties obj
  data = obj.read
  # don't really understand this that well...
  pad = data.length % 16
  unless (pad == 0 || pad == 8) && data[0...pad] == "\000" * pad
    Log.warn "padding was not as expected #{pad} (#{data.length}) -> #{data[0...pad].inspect}"
  end
  # the block parameter used to be called data too, shadowing (and on ruby 1.8,
  # overwriting) the buffer above
  data[pad..-1].scan(/.{16}/m).each do |record|
    # explicit little-endian read, so the tag is split the same way on any host
    tag = record.unpack('V').first
    key = tag >> 16
    encoding = '%04x' % (tag & 0xffff)
    # doesn't make any sense to me. probably because its a serialization of some internal
    # outlook structure...
    next if key == 0
    case encoding
    when '0102', '001e', '001f', '101e', '101f', '000d'
      # ignore on purpose. not sure what its for
      # multivalue versions ignored also
    when '0003' # long
      # don't know what all the other data is for
      add_property key, record[8, 4].unpack('V').first
    when '000b' # boolean
      # again, heaps more data than needed. and its not always 0 or 1.
      # they are in fact quite big numbers. this is wrong.
      # p [property, data[4..-1].unpack('H*')[0]]
      add_property key, record[8, 4].unpack('V').first != 0
    when '0040' # systime
      # seems to work:
      add_property key, Ole::Types.load_time(record[8..-1])
    else
      Log.warn "ignoring data in __properties section, encoding: #{encoding}"
      Log << record.unpack('H*').inspect + "\n"
    end
  end
end
303
+
304
# Store +value+ under +key+ in the raw property hash.
#
# Integer keys of 0x8000 and above are looked up in the nameid map, and the mapped
# +Key+ is used when there is one. In every other case the key is wrapped in a new
# +Key+. With +pos+ given, the value becomes element +pos+ of an array stored under
# the key (multivalue properties). Without it the value replaces whatever was there.
def add_property key, value, pos=nil
  # map keys in the named property range through nameid
  mapped = nil
  if Integer === key && key >= 0x8000
    if @nameid
      mapped = @nameid[key]
      Log.warn "property in named range not in nameid #{key.inspect}" unless mapped
    else
      Log.warn "no nameid section yet named properties used"
    end
  end
  key = mapped || Key.new(key)

  unless pos
    # take the last.
    Log.warn "duplicate property #{key.inspect}" if @raw[key]
    return @raw[key] = value
  end

  pieces = (@raw[key] ||= [])
  # this is actually a trickier problem. the issue is more that they must all be of
  # the same type.
  Log.warn "duplicate property" unless Array === pieces
  pieces[pos] = value
end
331
+
332
+ # resolve an arg (could be key, code, string, or symbol), and possible guid to a key
333
# Turn +arg+ (a key, code, string, or symbol) and an optional +guid+ into a +Key+.
#
# With a guid, a new +Key+ is built from the pair. Without one, a +Key+ is returned
# untouched, an integer is wrapped in a +Key+, and anything else is converted with
# +to_sym+ and looked up in +sym_to_key+. Raises if nothing could be found.
def resolve arg, guid=nil
  key = if guid
    Key.new arg, guid
  elsif Key === arg
    arg
  elsif Integer === arg
    Key.new arg
  else
    sym_to_key[arg.to_sym]
  end
  raise "unable to resolve key from #{[arg, guid].inspect}" unless key
  key
end
343
+
344
+ # just so i can get an easy unique list of missing ones
345
+ @@quiet_property = {}
346
+
347
# The map used for converting symbols to keys. It is built from the raw property
# hash on first use and cached from then on.
#
# When two keys share a symbolic name, the first one wins unless the later key is
# in the +PS_MAPI+ namespace.
def sym_to_key
  return @sym_to_key if @sym_to_key

  map = @sym_to_key = {}
  @raw.each_key do |key|
    name = key.to_sym
    # used to use @@quiet_property to only ignore once
    Log.info "couldn't find symbolic name for key #{key.inspect}" unless Symbol === name
    taken = map[name]
    Log.warn "duplicate key #{key.inspect}" if taken
    # we give preference to PS_MAPI keys
    map[name] = key if !taken || key.guid == PS_MAPI
  end
  map
end
367
+
368
+ # accessors
369
+
370
# Look up a property value. +arg+ and +guid+ are resolved to a key as per #resolve.
# Returns nil when the key cannot be resolved or the lookup raises.
def [] arg, guid=nil
  @raw[resolve(arg, guid)]
rescue StandardError
  nil
end
373
+
374
+ #--
375
+ # for completeness, but it's a moot point until i can write to the ole
376
+ # objects.
377
+ #def []= arg, guid=nil, value
378
+ # @raw[resolve(arg, guid)] = value
379
+ #end
380
+ #++
381
+
382
# Gives the "just works" accessors, ie <tt>properties.subject</tt>.
#
# A call with no arguments whose name does not end in <tt>=</tt> is treated as a
# property read, and returns <tt>self[name]</tt>.
#
# A single argument call to a name ending in <tt>=</tt> is a property write. That is
# only dispatched when this object really has a <tt>[]=</tt> method. Otherwise it
# falls through to +super+, so the NoMethodError names the method that was actually
# called instead of an internal <tt>[]=</tt>.
#
# Everything else goes to +super+.
def method_missing name, *args
  method_name = name.to_s
  if method_name !~ /\=$/ && args.empty?
    self[name]
  elsif method_name =~ /(.*)\=$/ && args.length == 1 && respond_to?(:[]=)
    self[$1] = args[0]
  else
    super
  end
end
391
+
392
# A plain hash of symbolic name => value, for every property whose key has a
# symbolic name. Keys that only resolve to a number are left out.
def to_h
  sym_to_key.inject({}) do |result, (name, key)|
    result[name] = self[key] if Symbol === name
    result
  end
end
397
+
398
# Short one line summary, listing each symbolic name with its inspected value.
# Inspected values longer than 32 characters are cut to 30 and marked with <tt>..."</tt>
def inspect
  pairs = to_h.map do |name, value|
    text = value.inspect
    text = text[0..29] + '..."' if text.length > 32
    "#{name}=#{text}"
  end
  '#<Properties ' + pairs.join(' ') + '>'
end
404
+
405
+ # -----
406
+
407
+ # temporary pseudo tags
408
+
409
+ # for providing rtf to plain text conversion. later, html to text too.
410
# The plain text body. Taken from the +body+ property, and when that is missing or
# blank, from an rtf to plain text conversion of #body_rtf. later, html to text too.
# Any failure along the way gives nil. The result is cached.
def body
  return @body unless @body == false

  @body = begin
    self[:body]
  rescue
    nil
  end
  if !@body || @body.strip.empty?
    @body = begin
      ::RTF::Converter.rtf2text body_rtf
    rescue
      nil
    end
  end
  @body
end
416
+
417
+ # for providing rtf decompression
418
# The rtf body, obtained by decompressing what the +rtf_compressed+ property reads
# as. nil if that fails for any reason. The result is cached.
def body_rtf
  return @body_rtf unless @body_rtf == false

  @body_rtf = begin
    RTF.rtfdecompr rtf_compressed.read
  rescue
    nil
  end
end
422
+
423
+ # for providing rtf to html conversion
424
# The html body. Three sources are tried in turn, moving on whenever the result so
# far is missing or blank: reading the +body_html+ property, an rtf to html
# conversion of #body_rtf, and as a last resort the rtf to text converter in html
# mode. Failures in any step count as nil. The result is cached.
def body_html
  return @body_html unless @body_html == false

  @body_html = begin
    self[:body_html].read
  rescue
    nil
  end
  if !@body_html || @body_html.strip.empty?
    @body_html = begin
      Msg::RTF.rtf2html body_rtf
    rescue
      nil
    end
  end
  # last resort
  if !@body_html || @body_html.strip.empty?
    @body_html = begin
      ::RTF::Converter.rtf2text body_rtf, :html
    rescue
      nil
    end
  end
  @body_html
end
432
+
433
+ # +Properties+ are accessed by <tt>Key</tt>s, which are coerced to this class.
434
+ # Includes a bunch of methods (hash, ==, eql?) to allow it to work as a key in
435
+ # a +Hash+.
436
+ #
437
+ # Also contains the code that maps keys to symbolic names.
438
# +Properties+ are accessed by <tt>Key</tt>s, which are coerced to this class.
# Includes a bunch of methods (hash, ==, eql?) to allow it to work as a key in
# a +Hash+.
#
# A key is a +code+ (an integer, or a string for named properties) together with a
# +guid+ naming the property set, which defaults to +PS_MAPI+.
#
# Also contains the code that maps keys to symbolic names.
class Key
  attr_reader :code, :guid

  def initialize code, guid=PS_MAPI
    @code, @guid = code, guid
  end

  # The symbolic name for this key, or the plain integer code when no name is known.
  def to_sym
    # hmmm, for some stuff, like, eg, the message class specific range, sym-ification
    # of the key depends on knowing our message class. i don't want to store anything else
    # here though, so if that kind of thing is needed, it can be passed to this function.
    # worry about that when some examples arise.
    case code
    when Integer
      if guid == PS_MAPI # and < 0x8000 ?
        # the hash should be updated now that i've changed the process
        # any failure in the lookup (eg unknown tag) falls back to the code itself
        MAPITAGS['%04x' % code].first[/_(.*)/, 1].downcase.to_sym rescue code
      else
        # handle other guids here, like mapping names to outlook properties, based on the
        # outlook object model.
        NAMED_MAP[self].to_sym rescue code
      end
    when String
      # return something like
      # note that named properties don't go through the map at the moment. so #categories
      # doesn't work yet
      code.downcase.to_sym
    end
  end

  def to_s
    to_sym.to_s
  end

  # FIXME implement these
  def transmittable?
    # etc, can go here too
  end

  # this stuff is to allow it to be a useful key
  def hash
    [code, guid].hash
  end

  # Two keys are equal when both code and guid are equal. Comparing the fields rather
  # than the hash values means a hash collision, or some unrelated object that merely
  # hashes the same (such as the array [code, guid]), is never taken for an equal key.
  def == other
    Key === other && code == other.code && guid == other.guid
  end

  alias eql? :==

  def inspect
    if Integer === code
      hex = '0x%04x' % code
      if guid == PS_MAPI
        # just display as plain hex number
        hex
      else
        "#<Key #{guid}/#{hex}>"
      end
    else
      # display full guid and code
      "#<Key #{guid}/#{code.inspect}>"
    end
  end
end
502
+
503
+ #--
504
+ # YUCK moved here because we need Key
505
+ #++
506
+
507
+ # data files that provide for the code to symbolic name mapping
508
+ # guids in named_map are really constant references to the above
509
# MAPITAGS is the parsed content of data/mapitags.yaml. Key#to_sym looks it up by the
# 4 digit lowercase hex code.
# File.open is used rather than Kernel#open, so the path can only ever be treated as
# a file name.
MAPITAGS = File.open("#{SUPPORT_DIR}/data/mapitags.yaml") { |file| YAML.load file }

# NAMED_MAP maps a Key to the value listed for it in data/named_map.yaml. Each entry's
# key is a pair: key[0] is the code, and key[1] is the name of one of the guid
# constants above, resolved here with const_get.
# The hash is filled pair by pair instead of via Hash[*pairs.flatten], which would
# have mis-paired everything had any value been an array.
NAMED_MAP = File.open("#{SUPPORT_DIR}/data/named_map.yaml") { |file| YAML.load file }.inject({}) do |map, (key, value)|
  map[Key.new(key[0], const_get(key[1]))] = value
  map
end
513
+ end
514
+ end
515
+