ruby-msg 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,532 +0,0 @@
1
-
2
- class Msg
3
- #
4
- # = Introduction
5
- #
6
- # A big component of +Msg+ files is the property store, which holds
7
- # all the key/value pairs of properties. The message itself, and all
8
- # its <tt>Attachment</tt>s and <tt>Recipient</tt>s have an instance of
9
- # this class.
10
- #
11
- # = Storage model
12
- #
13
- # Property keys (tags?) can be either simple hex numbers, in the
14
- # range 0x0000 - 0xffff, or they can be named properties. In fact,
15
- # properties in the range 0x0000 to 0x7fff are supposed to be the non-
16
- # named properties, and can be considered to be in the +PS_MAPI+
17
- # namespace. (correct?)
18
- #
19
- # Named properties are serialized in the 0x8000 to 0xffff range,
20
- # and are referenced as a guid and long/string pair.
21
- #
22
- # There are key ranges, which can be used to imply things generally
23
- # about keys.
24
- #
25
- # Further, we can give symbolic names to most keys, coming from
26
- # constants in various places. Eg:
27
- #
28
- # 0x0037 => subject
29
- # {00062002-0000-0000-C000-000000000046}/0x8218 => response_status
30
- # # displayed as categories in outlook
31
- # {00020329-0000-0000-C000-000000000046}/"Keywords" => categories
32
- #
33
- # Further, there are completely different names, coming from other
34
- # object models that get mapped to these things (CDO's model,
35
- # Outlook's model etc). Eg "urn:schemas:httpmail:subject"
36
- # I think these can be ignored though, as they aren't defined clearly
37
- # in terms of mapi properties, and i'm really just trying to make
38
- # a mapi property store. (It should also be relatively easy to
39
- # support them later.)
40
- #
41
- # = Usage
42
- #
43
- # The api is driven by a desire to have the simple stuff "just work", ie
44
- #
45
- # properties.subject
46
- # properties.display_name
47
- #
48
- # There also needs to be a way to look up properties more specifically:
49
- #
50
- # properties[0x0037] # => gets the subject
51
- # properties[0x0037, PS_MAPI] # => still gets the subject
52
- # properties['Keywords', PS_PUBLIC_STRINGS] # => gets outlook's categories array
53
- #
54
- # The abbreviated versions work by "resolving" the symbols to full keys:
55
- #
56
- # # the guid here is just PS_PUBLIC_STRINGS
57
- # properties.resolve :keywords # => #<Key {00020329-0000-0000-c000-000000000046}/"Keywords">
58
- # # the result here is actually also a key
59
- # k = properties.resolve :subject # => 0x0037
60
- # # it has a guid
61
- # k.guid == Msg::Properties::PS_MAPI # => true
62
- #
63
- # = Parsing
64
- #
65
- # There are three objects that need to be parsed to load a +Msg+ property store:
66
- #
67
- # 1. The +nameid+ directory (<tt>Properties.parse_nameid</tt>)
68
- # 2. The many +substg+ objects, whose names should match <tt>Properties::SUBSTG_RX</tt>
69
- # (<tt>Properties#parse_substg</tt>)
70
- # 3. The +properties+ file (<tt>Properties#parse_properties</tt>)
71
- #
72
- # Understanding of the formats is by no means perfect.
73
- #
74
- # = TODO
75
- #
76
- # * Test cases.
77
- # * While the key objects are sufficient, the value objects are just plain
78
- # ruby types. It currently isn't possible to write to the values, or to know
79
- # which encoding the value had.
80
- # * Consider other MAPI property stores, such as tnef/pst. Similar model?
81
- # Generalise this one?
82
- # * Have added IO support to Ole::Storage. now need to fix Properties. can't use
83
- # current greedy-loading approach. still want strings to work nicely:
84
- # props.subject
85
- # but don't want to be loading up large binary blobs, typically attachments, eg
86
- # props.attach_data
87
- # probably the easiest solution is that the binary "encoding", be to return an io
88
- # object instead. and you must read it if you want it as a string
89
- # maybe i can avoid the greedy model anyway? rather than parsing the properties completely,
90
- # have it be load based? you request subject, that translates into, please load the right
91
- # substg, et voila. maybe redo @raw as a lazy loading hash for substg objects, but do the
92
- # others straight away. maybe just parse keys so i know what i've got??
93
- class Properties
94
# Base directory used to locate the support data files (duplicated here for now).
SUPPORT_DIR = "#{File.dirname(__FILE__)}/../.."

# Decoders for substg objects, keyed by encoding (property type) code. Each proc
# is called with the object and returns the value to store for the property.
# Note that the binary and default decoders use obj.open, not the block form,
# which means the result should be #close'd later, which isn't done. As it is
# only read from it probably doesn't matter, but it isn't really good. FIXME
ENCODINGS = {
  # seems to be used when its going to be a directory instead of a file, eg
  # nested ole. 3701 usually. in which case we shouldn't get here right?
  0x000d => proc { |dirent| dirent },
  # unicode
  0x001f => proc { |dirent| Ole::Types::FROM_UTF16.iconv(dirent.read) },
  # ascii
  # FIXME hack. did a[0..-2] before, which seemed right sometimes, but chopped
  # the text for some others, so chomp the trailing null instead.
  0x001e => proc { |dirent| dirent.read.chomp(0.chr) },
  # binary?
  0x0102 => proc { |dirent| dirent.open },
  :default => proc { |dirent| dirent.open }
}

# The property set guids. These won't be strings for much longer.
# Maybe later, Key#inspect could automatically show symbolic guid names if they
# are part of this builtin list.
# FIXME. the PS_MAPI value is a fake placeholder (of the same length as a guid).
PS_MAPI = '{not-really-sure-what-this-should-say}'
PS_PUBLIC_STRINGS = '{00020329-0000-0000-c000-000000000046}'
# string properties in this namespace automatically get added to the internet headers
PS_INTERNET_HEADERS = '{00020386-0000-0000-c000-000000000046}'
# there are a bunch of outlook ones i think
# http://blogs.msdn.com/stephen_griffin/archive/2006/05/10/outlook-2007-beta-documentation-notification-based-indexing-support.aspx
# IPM.Appointment
PSETID_Appointment = '{00062002-0000-0000-c000-000000000046}'
# IPM.Task
PSETID_Task = '{00062003-0000-0000-c000-000000000046}'
# used for IPM.Contact
PSETID_Address = '{00062004-0000-0000-c000-000000000046}'
PSETID_Common = '{00062008-0000-0000-c000-000000000046}'
# didn't find a source for this name. it is for IPM.StickyNote
PSETID_Note = '{0006200e-0000-0000-c000-000000000046}'
# for IPM.Activity. also called the journal?
PSETID_Log = '{0006200a-0000-0000-c000-000000000046}'

# Matches substg object names. Captures the property code, the encoding, and
# the optional trailing 8 digit offset, all as hex strings.
SUBSTG_RX = /__substg1\.0_([0-9A-F]{4})([0-9A-F]{4})(?:-([0-9A-F]{8}))?/
133
-
134
# raw:: the underlying raw property hash.
# unused:: the unused (non-property) objects left over after parsing a +Dirent+.
# nameid:: the named property map in use, if any.
attr_reader :raw, :unused, :nameid
139
-
140
# Creates an empty property store: no raw properties, no unused objects and
# no +nameid+ map. The +nameid+ exists to provide a way to inherit from a parent
# (needed for the property sets of attachments and recipients, which inherit
# from the msg itself. what about nested msg??)
def initialize
  @raw, @unused, @nameid = {}, [], nil
  # FIXME. +false+ marks the derived bodies as "not computed yet"
  @body = @body_html = @body_rtf = false
end
150
-
151
- #--
152
- # The parsing methods
153
- #++
154
-
155
# Convenience constructor: creates a new +Properties+, loads it from the
# +Dirent+ +obj+ and returns it. The +ignore+ argument is accepted but not used.
def self.load obj, ignore=nil
  Properties.new.tap { |store| store.load obj }
end
160
-
161
# Parse properties from the children of the +Dirent+ +obj+.
#
# A child named <tt>__nameid_version1.0</tt> is parsed first, as it provides the
# map for the user defined (named) properties found later. File children are
# then handed to +parse_properties+ or +parse_substg+ according to their name,
# and anything that isn't a file is collected in +unused+.
# Raises if a file child has a name that matches neither form.
def load obj
  entries = obj.children.dup
  nameid_entry = entries.find { |entry| entry.name == '__nameid_version1.0' }
  if nameid_entry
    entries.delete nameid_entry
    @nameid = Properties.parse_nameid nameid_entry
    # hack to make it available to all msg files from the same ole storage object
    class << obj.ole
      attr_accessor :msg_nameid
    end
    obj.ole.msg_nameid = @nameid
  elsif obj.ole
    # the accessor only exists if a nameid was parsed for this storage earlier
    @nameid = obj.ole.msg_nameid rescue nil
  end

  # now parse the actual properties. i think dirs that match the substg should be
  # decoded as properties too. 0x000d is just another encoding, the dir encoding.
  # it should match whether the object is file / dir. currently the only example
  # is embedded msgs anyway
  entries.each do |entry|
    unless entry.file?
      @unused << entry
      next
    end
    case entry.name
    when /__properties_version1\.0/
      parse_properties entry
    when SUBSTG_RX
      # captures are hex strings; the optional offset capture may be nil
      numbers = $~[1..-1].map { |num| num.hex rescue nil }
      parse_substg(*(numbers + [entry]))
    else
      raise "bad name for mapi property #{entry.name.inspect}"
    end
  end
end
197
-
198
# Read nameid from the +Dirent+ obj, which is used for mapping of named properties keys to
# proxy keys in the 0x8000 - 0xffff range.
#
# +obj+ must contain the guid (<tt>__substg1.0_00020102</tt>), entry
# (<tt>__substg1.0_00030102</tt>) and string (<tt>__substg1.0_00040102</tt>)
# streams. Any other children are ignored, with a warning.
#
# Returns a hash of integer -> Key.
def self.parse_nameid obj
  remaining = obj.children.dup
  guids_obj, props_obj, names_obj =
    %w[__substg1.0_00020102 __substg1.0_00030102 __substg1.0_00040102].map do |name|
      remaining.delete obj[name]
    end

  # parse guids
  # this is the guids for named properties (other than builtin ones).
  # the builtin ones aren't stored in the stream: guid index 1 is PS_MAPI and
  # 2 is PS_PUBLIC_STRINGS, so the first stream guid has index 3. thus the
  # array below is indexed by (guid index - 2).
  guids = [PS_PUBLIC_STRINGS] + guids_obj.read.scan(/.{16}/m).map { |str| Ole::Types.load_guid str }

  # parse names.
  # the string ids for named properties
  # they are not parsed up front, as they're referred to by offset not
  # index. they are simply sequentially packed, as a long, giving
  # the string length, then padding to 4 byte multiple, and repeat.
  names_data = names_obj.read

  # parse actual props. each entry is 8 bytes.
  # all the numbers are unpacked explicitly as little-endian ('v'/'V'), rather
  # than in the byte order of whatever machine happens to be running this.
  nameid = {}
  props_obj.read.scan(/.{8}/m).each do |str|
    flags, offset = str[4..-1].unpack 'v2'
    # the property will be serialised as this pseudo property, mapping it to this named property
    pseudo_prop = 0x8000 + offset
    named = (flags & 1) == 1
    prop = if named
      # unpack returns an array, and "x = *array" only yields the bare element
      # on ruby 1.8, so take the first element explicitly
      str_off = str.unpack('V').first
      len = names_data[str_off, 4].unpack('V').first
      Ole::Types::FROM_UTF16.iconv names_data[str_off + 4, len]
    else
      a, b = str.unpack 'v2'
      Log.debug "b not 0" if b != 0
      a
    end
    guid_off = flags >> 1
    guid = if guid_off == 1
      PS_MAPI
    elsif guid_off >= 2
      guids[guid_off - 2]
    else
      # not a known guid index. don't let a negative array index silently
      # pick some unrelated guid from the end of the list.
      Log.debug "guid off < 1 (#{guid_off})"
      nil
    end
    nameid[pseudo_prop] = Key.new(prop, guid)
  end

  Log.warn "* ignoring #{remaining.length} objects in nameid" unless remaining.empty?
  # this leaves a bunch of other unknown chunks of data with completely unknown meaning.
  nameid
end
252
-
253
- # Parse an +Dirent+, as per <tt>msgconvert.pl</tt>. This is how larger properties, such
254
- # as strings, binary blobs, and other ole sub-directories (eg nested Msg) are stored.
255
- def parse_substg key, encoding, offset, obj
256
- if (encoding & 0x1000) != 0
257
- if !offset
258
- # there is typically one with no offset first, whose data is a series of numbers
259
- # equal to the lengths of all the sub parts. gives an implied array size i suppose.
260
- # maybe you can initialize the array at this time. the sizes are the same as all the
261
- # ole object sizes anyway, its to pre-allocate i suppose.
262
- #p obj.data.unpack('L*')
263
- # ignore this one
264
- return
265
- else
266
- # remove multivalue flag for individual pieces
267
- encoding &= ~0x1000
268
- end
269
- else
270
- Log.warn "offset specified for non-multivalue encoding #{obj.name}" if offset
271
- offset = nil
272
- end
273
- # offset is for multivalue encodings.
274
- unless encoder = ENCODINGS[encoding]
275
- Log.warn "unknown encoding #{encoding}"
276
- #encoder = proc { |obj| obj.io } #.read }. maybe not a good idea
277
- encoder = ENCODINGS[:default]
278
- end
279
- add_property key, encoder[obj], offset
280
- end
281
-
282
# For parsing the +properties+ file. Smaller properties are serialized in one chunk,
# such as longs, bools, times etc. The parsing has problems.
#
# +obj+ only needs to respond to +read+. After the leading padding the data is
# handled as 16 byte entries, each starting with a 32 bit tag that holds the
# property code in the upper and the encoding in the lower 16 bits. Decoded
# values are passed to +add_property+.
def parse_properties obj
  data = obj.read
  # don't really understand this that well...
  pad = data.length % 16
  unless (pad == 0 || pad == 8) and data[0...pad] == "\000" * pad
    Log.warn "padding was not as expected #{pad} (#{data.length}) -> #{data[0...pad].inspect}"
  end
  data[pad..-1].scan(/.{16}/m).each do |entry|
    # numbers are unpacked explicitly as little-endian ('V'), rather than in the
    # byte order of whatever machine happens to be running this.
    property, encoding = ('%08x' % entry.unpack('V')).scan(/.{4}/)
    key = property.hex
    # doesn't make any sense to me. probably because its a serialization of some internal
    # outlook structure...
    next if property == '0000'
    case encoding
    when '0102', '001e', '001f', '101e', '101f', '000d'
      # ignore on purpose. not sure what its for
      # multivalue versions ignored also
    when '0003' # long
      # don't know what all the other data is for
      add_property key, entry[8, 4].unpack('V').first
    when '000b' # boolean
      # again, heaps more data than needed. and its not always 0 or 1.
      # they are in fact quite big numbers. this is wrong.
      add_property key, entry[8, 4].unpack('V').first != 0
    when '0040' # systime
      # seems to work:
      add_property key, Ole::Types.load_time(entry[8..-1])
    else
      Log.warn "ignoring data in __properties section, encoding: #{encoding}"
      Log << entry.unpack('H*').inspect + "\n"
    end
  end
end
318
-
319
# Store +value+ in the raw property hash under +key+.
#
# Integer keys in the named property range (0x8000 and up) are mapped through
# the +nameid+ hash when possible. Every other key is wrapped in a +Key+.
# When +pos+ is given, +value+ is stored at that index of an array (this is
# used for the pieces of multivalue properties), otherwise it simply replaces
# whatever value was stored before. Returns +value+.
def add_property key, value, pos=nil
  key =
    if Integer === key and key >= 0x8000
      # map keys in the named property range through nameid
      if !@nameid
        Log.warn "no nameid section yet named properties used"
        Key.new key
      elsif real_key = @nameid[key]
        real_key
      else
        # i think i hit these when i have a named property, in the PS_MAPI
        # guid
        Log.warn "property in named range not in nameid #{key.inspect}"
        Key.new key
      end
    else
      Key.new key
    end
  if pos
    unless Array === @raw[key]
      # the trickier problem here is that all the values must be of the same
      # type. an existing non-array value can't be indexed into though (it
      # would get mangled, or raise), so as in the single value case the last
      # one wins, and the array is started afresh.
      Log.warn "duplicate property" if @raw[key]
      @raw[key] = []
    end
    @raw[key][pos] = value
  else
    # take the last.
    Log.warn "duplicate property #{key.inspect}" if @raw[key]
    @raw[key] = value
  end
end
348
-
349
# Resolve +arg+ (could be a key, code, string, or symbol), and a possible +guid+,
# to a +Key+. With a +guid+, a new key is built directly from the pair. Without
# one, keys are returned as is, integers are wrapped, and anything else is
# looked up by symbolic name. Raises if nothing could be found.
def resolve arg, guid=nil
  key =
    if guid then Key.new arg, guid
    elsif Key === arg then arg
    elsif Integer === arg then Key.new arg
    else sym_to_key[arg.to_sym]
    end
  raise "unable to resolve key from #{[arg, guid].inspect}" unless key
  key
end
360
-
361
- # just so i can get an easy unique list of missing ones
362
- @@quiet_property = {}
363
-
364
# Returns the map used for converting symbols to keys. It is built from the
# keys of the raw property hash on first use, and cached from then on.
# Keys without a symbolic name are entered under whatever their +to_sym+
# returned instead.
def sym_to_key
  return @sym_to_key if @sym_to_key
  @sym_to_key = {}
  @raw.each_key do |key|
    sym = key.to_sym
    # used to use @@quiet_property to only ignore once
    Log.info "couldn't find symbolic name for key #{key.inspect}" unless Symbol === sym
    if !@sym_to_key[sym]
      # first key for this name, just assign
      @sym_to_key[sym] = key
    else
      Log.warn "duplicate key #{key.inspect}"
      # we give preference to PS_MAPI keys
      @sym_to_key[sym] = key if key.guid == PS_MAPI
    end
  end
  @sym_to_key
end
384
-
385
- # accessors
386
-
387
# Returns the raw value of the property that +arg+ (and the optional +guid+)
# resolves to. Returns nil if the key can't be resolved, or on any other
# error while looking it up.
def [] arg, guid=nil
  key = resolve(arg, guid)
  @raw[key]
rescue StandardError
  nil
end
390
-
391
- #--
392
- # for completeness, but it's a moot point until i can write to the ole
393
- # objects.
394
- #def []= arg, guid=nil, value
395
- # @raw[resolve(arg, guid)] = value
396
- #end
397
- #++
398
-
399
# Provides the property accessors, such as <tt>properties.subject</tt>.
#
# Any unknown method called without arguments is treated as a property lookup
# (see #[]), so the result is nil for a property which isn't there.
# Setters (<tt>properties.subject = value</tt>) are only forwarded if a
# <tt>[]=</tt> method has been provided. As values can't be written yet, none
# is defined here, and a setter then fails with a NoMethodError for the setter
# that was actually called, rather than one for <tt>[]=</tt>.
def method_missing name, *args
  if name.to_s !~ /\=$/ and args.empty?
    self[name]
  elsif name.to_s =~ /(.*)\=$/ and args.length == 1 and respond_to?(:[]=)
    self[$1] = args[0]
  else
    super
  end
end
408
-
409
# Returns a hash of symbolic name => value, for all the properties that have a
# symbolic name. The values are looked up through #[].
def to_h
  sym_to_key.inject({}) do |result, (sym, key)|
    result[sym] = self[key] if Symbol === sym
    result
  end
end
414
-
415
# Returns a one line summary, listing name=value for each entry of #to_h.
# The inspected values are cut short if they are longer than 32 characters.
def inspect
  pairs = to_h.map do |name, value|
    shown = value.inspect
    shown = shown[0..29] + '..."' if shown.length > 32
    "#{name}=#{shown}"
  end
  '#<Properties ' + pairs.join(' ') + '>'
end
421
-
422
- # -----
423
-
424
- # temporary pseudo tags
425
-
426
# Temporary pseudo tag, for providing rtf to plain text conversion. later, html
# to text too.
#
# Returns the +body+ property, unless it is missing or blank, in which case
# the text is converted from +body_rtf+ instead. nil if neither works out.
# The result is cached.
def body
  # false means the body hasn't been worked out yet
  return @body unless @body == false
  @body = begin
    self[:body]
  rescue StandardError
    nil
  end
  unless @body and !@body.strip.empty?
    @body = begin
      ::RTF::Converter.rtf2text body_rtf
    rescue StandardError
      nil
    end
  end
  @body
end
433
-
434
# Temporary pseudo tag, for providing rtf decompression.
#
# Returns the result of decompressing the data read from the +rtf_compressed+
# property, or nil if that fails for any reason. The result is cached.
def body_rtf
  # false means the rtf hasn't been worked out yet
  return @body_rtf unless @body_rtf == false
  @body_rtf = begin
    RTF.rtfdecompr rtf_compressed.read
  rescue StandardError
    nil
  end
end
439
-
440
# Temporary pseudo tag, for providing rtf to html conversion.
#
# Returns the data read from the +body_html+ property, unless it is missing or
# blank. In that case html is extracted from +body_rtf+, or as a last resort,
# converted from it. nil if none of that works out. The result is cached.
def body_html
  # false means the html hasn't been worked out yet
  return @body_html unless @body_html == false
  @body_html = begin
    self[:body_html].read
  rescue StandardError
    nil
  end
  unless @body_html and !@body_html.strip.empty?
    @body_html = begin
      Msg::RTF.rtf2html body_rtf
    rescue StandardError
      nil
    end
  end
  # last resort
  unless @body_html and !@body_html.strip.empty?
    @body_html = begin
      ::RTF::Converter.rtf2text body_rtf, :html
    rescue StandardError
      nil
    end
  end
  @body_html
end
449
-
450
# +Properties+ are accessed by <tt>Key</tt>s, which are coerced to this class.
# Includes a bunch of methods (hash, ==, eql?) to allow it to work as a key in
# a +Hash+.
#
# Also contains the code that maps keys to symbolic names.
class Key
  # +code+ is either an integer property code or the string name of a named
  # property, and +guid+ is the guid of the property set it belongs to.
  attr_reader :code, :guid

  def initialize code, guid=PS_MAPI
    @code, @guid = code, guid
  end

  # Returns the symbolic name for the key. For an integer code without a known
  # name the code itself is returned, so the result isn't always a Symbol.
  def to_sym
    # hmmm, for some stuff, like, eg, the message class specific range, sym-ification
    # of the key depends on knowing our message class. i don't want to store anything else
    # here though, so if that kind of thing is needed, it can be passed to this function.
    # worry about that when some examples arise.
    case code
    when Integer
      if guid == PS_MAPI # and < 0x8000 ?
        # the hash should be updated now that i've changed the process
        MAPITAGS['%04x' % code].first[/_(.*)/, 1].downcase.to_sym rescue code
      else
        # handle other guids here, like mapping names to outlook properties, based on the
        # outlook object model.
        NAMED_MAP[self].to_sym rescue code
      end
    when String
      # note that named properties don't go through the map at the moment. so #categories
      # doesn't work yet
      code.downcase.to_sym
    end
  end

  def to_s
    to_sym.to_s
  end

  # FIXME implement these
  def transmittable?
    # etc, can go here too
  end

  # this stuff is to allow it to be a useful key
  def hash
    [code, guid].hash
  end

  # Two keys are equal when both their codes and their guids are. The actual
  # values are compared, not just the hashes, as unequal objects are allowed to
  # have the same hash.
  def == other
    Key === other and code == other.code and guid == other.guid
  end

  alias eql? :==

  def inspect
    if Integer === code
      hex = '0x%04x' % code
      if guid == PS_MAPI
        # just display as plain hex number
        hex
      else
        "#<Key #{guid}/#{hex}>"
      end
    else
      # display full guid and code
      "#<Key #{guid}/#{code.inspect}>"
    end
  end
end
519
-
520
- #--
521
- # YUCK moved here because we need Key
522
- #++
523
-
524
# data files that provide for the code to symbolic name mapping.
# mapitags is indexed by the property code, as 4 lowercase hex digits.
# named_map is turned into a hash of Key => name here. the guids in its keys
# are really constant references to the guid constants of this class, hence
# the const_get.
MAPITAGS = File.open("#{SUPPORT_DIR}/data/mapitags.yaml") { |io| YAML.load io }
NAMED_MAP = Hash[*File.open("#{SUPPORT_DIR}/data/named_map.yaml") { |io| YAML.load io }.map { |key, name|
  [Key.new(key[0], const_get(key[1])), name]
}.flatten]
530
- end
531
- end
532
-