ruby-msg-nx 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/mapi/pst.rb ADDED
@@ -0,0 +1,1995 @@
1
+ #
2
+ # = Introduction
3
+ #
4
+ # This file is mostly an attempt to port libpst to ruby, and simplify it in the process. It
5
+ # will leverage much of the existing MAPI => MIME conversion developed for Msg files, and as
6
+ # such is purely concerned with the file structure details.
7
+ #
8
+ # = TODO
9
+ #
10
+ # 1. solve recipient table problem (test4).
11
+ # this is done. turns out it was due to id2 clashes. find better solution
12
+ # 2. check parse consistency. an initial conversion of a 30M file to pst, shows
13
+ # a number of messages converting badly. compare with libpst too.
14
+ # 3. xattribs
15
+ # 4. generalise the Mapi stuff better
16
+ # 5. refactor index load
17
+ # 6. msg serialization?
18
+ #
19
+
20
+ =begin
21
+
22
+ quick plan for cleanup.
23
+
24
+ have working tests for 97 and 03 file formats, so safe.
25
+
26
+ want to fix up:
27
+
28
+ 64 bit unpacks scattered around. its ugly. not sure how best to handle it, but am slightly tempted
29
+ to override String#unpack to support a 64 bit little endian unpack (like L vs N/V, for Q). one way or
30
+ another need to fix it. Could really slow everything else down if its parsing the unpack strings twice,
31
+ once in ruby, for every single unpack i do :/
32
+
33
+ the index loading process, and the lack of shared code between normal vs 64 bit variants, and Index vs Desc.
34
+ should be able to reduce code by factor of 4. also think I should move load code into the class too. then
35
+ maybe have something like:
36
+
37
+ class Header
38
+ def index_class
39
+ version_2003 ? Index64 : Index
40
+ end
41
+ end
42
+
43
+ def load_idx
44
+ header.index_class.load_index
45
+ end
46
+
47
+ OR
48
+
49
+ def initialize
50
+ @header = ...
51
+ extend @header.index_class::Load
52
+ load_idx
53
+ end
54
+
55
+ need to think about the role of the mapi code, and Pst::Item etc, but that layer can come later.
56
+
57
+ =end
58
+
59
+ require 'mapi'
60
+ require 'enumerator'
61
+ require 'ostruct'
62
+ require 'ole/ranges_io'
63
+ require 'mapi/helper'
64
+
65
+ module Mapi
66
+ # Read Outlook's pst file
67
+ class Pst
68
+ class FormatError < StandardError
69
+ end
70
+
71
# String#unpack with one extra directive: `T` is an unsigned 64-bit integer in
# little endian byte order.
#
# `T` is translated to the native `Q<` directive, so repeat counts and `*` behave
# exactly like they do for every other directive (the previous hand-rolled emulation
# turned `T*` into an empty result). A value for which fewer than 8 bytes remain is
# returned as `nil`.
#
# didn't want to override String#unpack, cause its too hacky, and incomplete.
#
# @param str [String]
# @param unpack_spec [String]
# @return [Array]
# @private
def self.unpack str, unpack_spec
  # fast path: nothing to translate
  return str.unpack(unpack_spec) unless unpack_spec.include?('T')

  # the same few specs are used for every record, so memoise the translated form
  @unpack_cache ||= {}
  native_spec = (@unpack_cache[unpack_spec] ||= unpack_spec.gsub('T', 'Q<'))
  str.unpack(native_spec)
end
108
+
109
# Cut +str+ into consecutive records of +size+ bytes.
#
# @param str [String]
# @param size [Integer] length of one record
# @param count [Integer] number of records to return; a negative value means
#   "as many whole records as fit in str"
# @return [Array<String>] +count+ slices; a slice starting at or past the end of
#   +str+ is whatever String#[] returns there ("" or nil)
# @private
def self.split_per str, size, count
  count = str.length / size if count < 0
  Array.new(count) { |i| str[size * i, size] }
end
120
+
121
+ #
122
+ # this is the header and encryption encapsulation code
123
+ # ----------------------------------------------------------------------------
124
+ #
125
+
126
# class which encapsulates the pst header
#
# Parses the first {SIZE} bytes of a pst file and exposes the fields needed to
# locate the node and block b-trees.
#
# @private
class Header
  SIZE = 512
  MAGIC = 0x2142444e

  # these are the constants defined in libpst.c, that
  # are referenced in pst_open()
  INDEX_TYPE_OFFSET = 0x0A
  FILE_SIZE_POINTER = 0xA8
  FILE_SIZE_POINTER_64 = 0xB8
  SECOND_POINTER = 0xBC
  INDEX_POINTER = 0xC4
  SECOND_POINTER_64 = 0xE0
  INDEX_POINTER_64 = 0xF0
  ENC_OFFSET = 0x1CD

  # index type byte => file format generation
  VERSIONS = {0x0e => 1997, 0x17 => 2003, 0x24 => 2003}.freeze

  # @return [Integer]
  attr_reader :magic
  # @return [Integer]
  attr_reader :index_type
  # @return [Integer]
  attr_reader :encrypt_type
  # @return [Integer]
  attr_reader :size
  # @return [Integer]
  attr_reader :block_btree_count
  # @return [Integer]
  attr_reader :block_btree
  # @return [Integer]
  attr_reader :node_btree_count
  # @return [Integer]
  attr_reader :node_btree
  # @return [Integer]
  attr_reader :version

  # @param data [String] the first {SIZE} bytes of the file
  # @raise [FormatError] if data is shorter than {SIZE} bytes or fails {#validate!}
  def initialize data
    # a short (or nil) read would otherwise surface as NoMethodError on nil below
    unless data && data.bytesize >= SIZE
      raise FormatError, "pst header is truncated (#{data ? data.bytesize : 0} of #{SIZE} bytes)"
    end

    @magic = data.unpack('N')[0]
    @index_type = data[INDEX_TYPE_OFFSET].ord
    @version = VERSIONS[@index_type]

    if version_2003?
      # don't know?
      # >> data1.unpack('V*').zip(data2.unpack('V*')).enum_with_index.select { |(c, d), i| c != d and not [46, 56, 60].include?(i) }.select { |(a, b), i| b == 0 }.map { |(a, b), i| [a / 256, i] }
      # [8, 76], [32768, 84], [128, 89]
      # >> data1.unpack('C*').zip(data2.unpack('C*')).enum_with_index.select { |(c, d), i| c != d and not [184..187, 224..227, 240..243].any? { |r| r === i } }.select { |(a, b), i| b == 0 and ((Math.log(a) / Math.log(2)) % 1) < 0.0001 }
      # [[[2, 0], 61], [[2, 0], 76], [[2, 0], 195], [[2, 0], 257], [[8, 0], 305], [[128, 0], 338], [[128, 0], 357]]
      # i have only 2 psts to base this guess on, so i can't really come up with anything that looks reasonable yet. not sure what the offset is. unfortunately there is so much in the header
      # that isn't understood...
      # NOTE(review): MS-PST appears to place bCryptMethod at offset 0x201 for this
      # format, i.e. beyond the SIZE bytes available here - confirm before changing.
      @encrypt_type = 1

      # each pointer is a pair of little endian 64-bit values
      @node_btree_count, @node_btree = data[SECOND_POINTER_64 - 8, 16].unpack('Q<2')
      @block_btree_count, @block_btree = data[INDEX_POINTER_64 - 8, 16].unpack('Q<2')

      # the size is a full 64-bit field; reading only its low dword made the
      # size check fail for files of 4GiB and more
      @size = data[FILE_SIZE_POINTER_64, 8].unpack('Q<')[0]
    else
      @encrypt_type = data[ENC_OFFSET].ord

      @node_btree_count, @node_btree = data[SECOND_POINTER - 4, 8].unpack('V2')
      @block_btree_count, @block_btree = data[INDEX_POINTER - 4, 8].unpack('V2')

      @size = data[FILE_SIZE_POINTER, 4].unpack('V')[0]
    end

    validate!
  end

  # return `true` if pst is an Unicode version. Unicode version also uses 64-bit file pointer.
  # otherwise return `false` where pst is an ANSI version. ANSI version uses 32-bit file pointer.
  #
  # @return [Boolean]
  def version_2003?
    version == 2003
  end

  # @return [Boolean]
  def encrypted?
    encrypt_type != 0
  end

  # @raise [FormatError] on a bad signature, unsupported index type or
  #   unsupported encryption type
  def validate!
    raise FormatError, "bad signature on pst file (#{'0x%x' % magic})" unless magic == MAGIC
    raise FormatError, "only index types 0x0e, 0x17 and 0x24 are handled (#{'0x%x' % index_type})" unless VERSIONS.key?(index_type)
    raise FormatError, "only encryption types 0 and 1 are handled (#{encrypt_type.inspect})" unless [0, 1].include?(encrypt_type)
  end
end
212
+
213
# compressible encryption! :D
#
# simple substitution. see libpst.c
# Every byte is replaced through a fixed 256 entry permutation table; there is no key.
#
# @private
class CompressibleEncryption
  # maps an encrypted byte value (the index) to its plain value
  DECRYPT_TABLE = [
    0x47, 0xf1, 0xb4, 0xe6, 0x0b, 0x6a, 0x72, 0x48,
    0x85, 0x4e, 0x9e, 0xeb, 0xe2, 0xf8, 0x94, 0x53, # 0x0f
    0xe0, 0xbb, 0xa0, 0x02, 0xe8, 0x5a, 0x09, 0xab,
    0xdb, 0xe3, 0xba, 0xc6, 0x7c, 0xc3, 0x10, 0xdd, # 0x1f
    0x39, 0x05, 0x96, 0x30, 0xf5, 0x37, 0x60, 0x82,
    0x8c, 0xc9, 0x13, 0x4a, 0x6b, 0x1d, 0xf3, 0xfb, # 0x2f
    0x8f, 0x26, 0x97, 0xca, 0x91, 0x17, 0x01, 0xc4,
    0x32, 0x2d, 0x6e, 0x31, 0x95, 0xff, 0xd9, 0x23, # 0x3f
    0xd1, 0x00, 0x5e, 0x79, 0xdc, 0x44, 0x3b, 0x1a,
    0x28, 0xc5, 0x61, 0x57, 0x20, 0x90, 0x3d, 0x83, # 0x4f
    0xb9, 0x43, 0xbe, 0x67, 0xd2, 0x46, 0x42, 0x76,
    0xc0, 0x6d, 0x5b, 0x7e, 0xb2, 0x0f, 0x16, 0x29, # 0x5f
    0x3c, 0xa9, 0x03, 0x54, 0x0d, 0xda, 0x5d, 0xdf,
    0xf6, 0xb7, 0xc7, 0x62, 0xcd, 0x8d, 0x06, 0xd3, # 0x6f
    0x69, 0x5c, 0x86, 0xd6, 0x14, 0xf7, 0xa5, 0x66,
    0x75, 0xac, 0xb1, 0xe9, 0x45, 0x21, 0x70, 0x0c, # 0x7f
    0x87, 0x9f, 0x74, 0xa4, 0x22, 0x4c, 0x6f, 0xbf,
    0x1f, 0x56, 0xaa, 0x2e, 0xb3, 0x78, 0x33, 0x50, # 0x8f
    0xb0, 0xa3, 0x92, 0xbc, 0xcf, 0x19, 0x1c, 0xa7,
    0x63, 0xcb, 0x1e, 0x4d, 0x3e, 0x4b, 0x1b, 0x9b, # 0x9f
    0x4f, 0xe7, 0xf0, 0xee, 0xad, 0x3a, 0xb5, 0x59,
    0x04, 0xea, 0x40, 0x55, 0x25, 0x51, 0xe5, 0x7a, # 0xaf
    0x89, 0x38, 0x68, 0x52, 0x7b, 0xfc, 0x27, 0xae,
    0xd7, 0xbd, 0xfa, 0x07, 0xf4, 0xcc, 0x8e, 0x5f, # 0xbf
    0xef, 0x35, 0x9c, 0x84, 0x2b, 0x15, 0xd5, 0x77,
    0x34, 0x49, 0xb6, 0x12, 0x0a, 0x7f, 0x71, 0x88, # 0xcf
    0xfd, 0x9d, 0x18, 0x41, 0x7d, 0x93, 0xd8, 0x58,
    0x2c, 0xce, 0xfe, 0x24, 0xaf, 0xde, 0xb8, 0x36, # 0xdf
    0xc8, 0xa1, 0x80, 0xa6, 0x99, 0x98, 0xa8, 0x2f,
    0x0e, 0x81, 0x65, 0x73, 0xe4, 0xc2, 0xa2, 0x8a, # 0xef
    0xd4, 0xe1, 0x11, 0xd0, 0x08, 0x8b, 0x2a, 0xf2,
    0xed, 0x9a, 0x64, 0x3f, 0xc1, 0x6c, 0xf9, 0xec  # 0xff
  ].freeze

  # the inverse permutation: plain byte value => encrypted byte value
  ENCRYPT_TABLE = [nil] * 256
  DECRYPT_TABLE.each_with_index { |plain, cipher| ENCRYPT_TABLE[plain] = cipher }
  ENCRYPT_TABLE.freeze

  # Table driven decryption, working on byte values so that the result is the
  # same for any source encoding.
  #
  # @param encrypted [String]
  # @return [String] binary string of the same byte length
  def self.decrypt_alt encrypted
    encrypted.bytes.map { |byte| DECRYPT_TABLE[byte] }.pack('C*')
  end

  # Table driven encryption, the inverse of {decrypt_alt}.
  #
  # @param decrypted [String]
  # @return [String] binary string of the same byte length
  def self.encrypt_alt decrypted
    decrypted.bytes.map { |byte| ENCRYPT_TABLE[byte] }.pack('C*')
  end

  # an alternate implementation that is possibly faster....
  # TODO - bench
  #
  # Both tables as String#tr arguments; the characters tr treats specially
  # (^, - and \) are backslash escaped.
  DECRYPT_STR, ENCRYPT_STR = [DECRYPT_TABLE, (0...256)].map do |values|
    values.to_a.pack('C*').gsub(/([\^\-\\])/, "\\\\\\1").freeze
  end

  # @param encrypted [String]
  # @return [String]
  def self.decrypt encrypted
    encrypted.tr ENCRYPT_STR, DECRYPT_STR
  end

  # @param decrypted [String]
  # @return [String]
  def self.encrypt decrypted
    decrypted.tr DECRYPT_STR, ENCRYPT_STR
  end
end
284
+
285
+ # @return [IO]
286
+ # @private
287
+ attr_reader :io
288
+
289
+ # @return [Header]
290
+ # @private
291
+ attr_reader :header
292
+
293
+ # @return [Array<BlockPtr>]
294
+ # @private
295
+ attr_reader :blocks
296
+
297
+ # @return [Array<NodePtr>]
298
+ # @private
299
+ attr_reader :nodes
300
+
301
+ # @return [Hash<Integer, Symbol>]
302
+ # @private
303
+ attr_reader :special_folder_ids
304
+
305
+ # @return [Helper]
306
+ # @private
307
+ attr_reader :helper
308
+
309
# Open a pst: parse the header, then load the block and node b-trees.
#
# corresponds to
# * pst_open
# * pst_load_index
#
# @param io [IO] must support #pos=, #read, #seek and #size
# @param helper [Helper,nil] a fresh Helper is created when omitted
# @raise [FormatError] if the header is invalid, or its size field differs from io.size
def initialize io, helper=nil
  @io = io
  io.pos = 0
  @helper = helper || Helper.new
  @header = Header.new io.read(Header::SIZE)

  # would prefer this to be in Header#validate, but it doesn't have the io size.
  # should perhaps downgrade this to just be a warning...
  unless header.size == io.size
    raise FormatError, "header size field invalid (#{header.size} != #{io.size})"
  end

  load_block_btree
  load_node_btree
  load_xattrib

  @special_folder_ids = {}
end
331
+
332
# Whether the header reports this file's data blocks as encrypted.
#
# @return [Boolean]
# @private
def encrypted?
  header.encrypted?
end
337
+
338
# Forward a warning message to Mapi::Log.
# until i properly fix logging...
#
# @param s [String] the message
# @private
def warn(s)
  Mapi::Log.warn(s)
end
344
+
345
+ #
346
+ # this is the index and desc record loading code
347
+ # ----------------------------------------------------------------------------
348
+ #
349
+
350
+ # @private
351
+ ToTree = Module.new
352
+
353
+ # more constants from libpst.c
354
+ # these relate to the index block
355
+ # @private
356
+ ITEM_COUNT_OFFSET = 0x1f0 # count byte
357
+
358
+ # @private
359
+ LEVEL_INDICATOR_OFFSET = 0x1f3 # node or leaf
360
+
361
+ # @private
362
+ BACKLINK_OFFSET = 0x1f8 # backlink u1 value
363
+
364
+ # these 3 classes are used to hold various file records
365
+
366
# pst_index
#
# One leaf entry of the block b-tree: the id of a block together with the file
# offset and size needed to read it.
#
# @private
class BlockPtr < Struct.new(:id, :offset, :size, :u1)
  UNPACK_STR32 = 'VVvv'
  UNPACK_STR64 = 'TTvv'
  SIZE32 = 12
  SIZE64 = 24
  BLOCK_SIZE = 512 # index blocks was 516 but bogus
  COUNT_MAX32 = 41 # max active items (ITEM_COUNT_OFFSET / SIZE32 = 41)
  COUNT_MAX64 = 20 # bit of a guess really. 512 / 24 = 21, but doesn't leave enough header room

  # @return [Pst]
  attr_accessor :pst

  # @param data [String, Array] a packed record, or its already unpacked fields
  # @param is64 [Boolean] selects the record layout used to unpack a String
  def initialize data, is64
    data = Pst.unpack data, (is64 ? UNPACK_STR64 : UNPACK_STR32) if String === data
    super(*data)
  end

  # Kind of block this entry points at. For anything but a plain data block the
  # answer is taken from the first two bytes of the block itself, so the block is read.
  #
  # @return [Symbol] :data, :data_chain_header or :id2_assoc
  # @raise [FormatError] if the leading bytes are not a known combination
  def type
    @type ||= data? ? :data : internal_type
  end

  # @return [Boolean]
  def data?
    (id & 0x2) == 0
  end

  # @param decrypt [Boolean] ignored (treated as false) for non-data blocks
  # @return [String]
  def read decrypt=true
    # only data blocks are ever encrypted
    decrypt = false unless data?
    pst.pst_read_block_size offset, size, decrypt
  end

  # show all numbers in hex
  def inspect
    super.gsub(/=(\d+)/) { '=0x%x' % $1.to_i }.sub(/BlockPtr /, "BlockPtr type=#{type.inspect}, ")
  end

  private

  # Classify a non-data block by its leading type and level bytes.
  def internal_type
    first_byte, second_byte = read.unpack('CC')
    case first_byte
    when 1
      raise FormatError, 'unexpected second byte for data chain header - %p' % second_byte unless second_byte == 1
      :data_chain_header
    when 2
      raise FormatError, 'unexpected second byte for id2 assoc - %p' % second_byte unless second_byte == 0
      :id2_assoc
    else
      raise FormatError, 'unknown first byte for block - %p' % first_byte
    end
  end
end
425
+
426
+ # mostly guesses.
427
+
428
+ # @private
429
+ ITEM_COUNT_OFFSET_64 = 0x1e8
430
+
431
+ # @private
432
+ LEVEL_INDICATOR_OFFSET_64 = 0x1eb # diff of 3 between these 2 as above...
433
+
434
# _pst_table_ptr_struct
#
# An entry of a non-leaf b-tree page, pointing at a page one level down.
#
# @private
class TablePtr < Struct.new(:start, :u1, :offset)
  UNPACK_STR32 = 'V3'
  UNPACK_STR64 = 'T3'
  SIZE32 = 12
  SIZE64 = 24

  # @param data [String, Array] a packed record, or its already unpacked fields
  # @param is64 [Boolean] selects the record layout used to unpack a String
  def initialize data, is64
    if data.is_a?(String)
      spec = is64 ? UNPACK_STR64 : UNPACK_STR32
      data = Pst.unpack(data, spec)
    end
    super(*data)
  end
end
450
+
451
# pst_desc
# idx_id is a pointer to an idx record which gets the primary data stream for the Desc record.
# idx2_id gets you an idx record, that when read gives you an ID2 association list, which just maps
# another set of ids to index values
#
# @private
class NodePtr < Struct.new(:node_id, :block_id, :sub_block_id, :parent_node_id)
  UNPACK_STR32 = 'V4'
  UNPACK_STR64 = 'T3V'
  SIZE32 = 16
  SIZE64 = 32
  BLOCK_SIZE = 512 # descriptor blocks was 520 but bogus
  COUNT_MAX64 = 15
  COUNT_MAX32 = 31 # max active desc records (ITEM_COUNT_OFFSET / SIZE32 = 31)

  include ToTree

  # @return [Pst]
  attr_accessor :pst

  # @return [Array]
  attr_reader :children

  # Accepts either the packed record or its already unpacked fields, like
  # BlockPtr and TablePtr do.
  #
  # @param data [String, Array]
  # @param is64 [Boolean] selects the record layout used to unpack a String
  def initialize data, is64
    data = Pst.unpack(data, is64 ? UNPACK_STR64 : UNPACK_STR32) if String === data
    super(*data)
    @children = []
  end

  # Disabled accessor; look the block up through pst.block_from_id(block_id) instead.
  #
  # @raise [RuntimeError] always
  def block
    raise "DO NOT USE"
  end

  # Disabled accessor; look the block up through pst.block_from_id(sub_block_id) instead.
  #
  # @raise [RuntimeError] always
  def sub_block
    raise "DO NOT USE"
  end

  # Read node data. The result is memoised.
  #
  # @return [Array<String>]
  def read_main_array
    @read_main ||= begin
      list = []
      pst.load_node_main_data_to node_id, list
      list
    end
  end

  # Locate and read node sub data by its local id
  #
  # @param local_node_id [Integer]
  # @return [Array<String>]
  def read_sub_array local_node_id
    list = []
    pst.load_node_sub_data_to node_id, local_node_id, list
    list
  end

  # @return [Array] whatever pst.get_local_node_list_to collects for this node
  def get_local_node_list
    list = []
    pst.get_local_node_list_to node_id, list
    list
  end

  # Check if there is a sub data exists, where it is identified by its local id
  #
  # @param local_node_id [Integer]
  # @return [Boolean]
  def has_sub local_node_id
    #TODO fixme
    read_sub_array(local_node_id).length != 0
  end

  # show all numbers in hex
  def inspect
    super.gsub(/=(\d+)/) { '=0x%x' % $1.to_i }
  end
end
535
+
536
# Load every leaf of the block b-tree into @blocks and index them by id.
#
# corresponds to
# * _pst_build_id_ptr
#
# @private
def load_block_btree
  @blocks = []
  @block_offsets = []
  load_block_tree header.block_btree, header.block_btree_count, 0

  # lookups are by id with the lowest bit cleared, so key the cache the same way
  @block_from_id = {}
  @blocks.each do |block|
    key = block.id & ~1
    warn "there are duplicate idx records with id #{key}" if @block_from_id[key]
    @block_from_id[key] = block
  end
end
553
+
554
# load the flat idx table, which maps ids to file ranges. this is the recursive helper
#
# Reads the b-tree page at +offset+. Leaf pages contribute BlockPtr records to
# @blocks; other pages are followed through their TablePtr entries.
#
# corresponds to
# * _pst_build_id_ptr
#
# @param offset [Integer] file offset of the page
# @param linku1 [Integer] value the page's backlink field must carry
# @param start_val [Integer] expected key of the first entry; 0 disables the check
# @private
def load_block_tree offset, linku1, start_val
  @block_offsets << offset

  #_pst_read_block_size(pf, offset, BLOCK_SIZE, &buf, 0, 0) < BLOCK_SIZE)
  page = pst_read_block_size offset, BlockPtr::BLOCK_SIZE, false
  wide = is64

  item_count = page[wide ? ITEM_COUNT_OFFSET_64 : ITEM_COUNT_OFFSET].ord
  level = page[wide ? LEVEL_INDICATOR_OFFSET_64 : LEVEL_INDICATOR_OFFSET].ord
  count_max = wide ? BlockPtr::COUNT_MAX64 : BlockPtr::COUNT_MAX32
  raise "have too many active items in index (#{item_count})" if item_count > count_max

  this_node_id =
    if wide
      Pst.unpack(page[BACKLINK_OFFSET, 8], "T").first
    else
      page[BACKLINK_OFFSET, 4].unpack("V").first
    end
  raise 'blah 1' unless this_node_id == linku1

  if level == 0
    # leaf page: item_count index records
    record_size = wide ? BlockPtr::SIZE64 : BlockPtr::SIZE32
    Pst.split_per(page, record_size, item_count).each_with_index do |data, i|
      idx = BlockPtr.new data, wide
      # first entry
      raise 'blah 3' if i == 0 && start_val != 0 && idx.id != start_val
      idx.pst = self
      # this shouldn't really happen i'd imagine
      raise "OHNO" if idx.id == 0
      @blocks << idx
    end
  else
    # inner page: item_count pointers to lower pages
    record_size = wide ? TablePtr::SIZE64 : TablePtr::SIZE32
    Pst.split_per(page, record_size, item_count).each_with_index do |data, i|
      table = TablePtr.new data, wide
      # for the first value, we expect the start to be equal
      raise 'blah 3' if i == 0 && start_val != 0 && table.start != start_val
      # this shouldn't really happen i'd imagine
      raise "OHNO" if table.start == 0
      load_block_tree table.offset, table.u1, table.start
    end
  end
end
602
+
603
# most access to idx objects will use this function
#
# The lowest bit of +id+ is ignored when looking the block up.
#
# corresponds to
# * _pst_getID
#
# @param id [Integer]
# @return [BlockPtr, nil] nil when no block with that id was loaded
# @private
def block_from_id id
  key = id & ~1
  @block_from_id[key]
end
614
+
615
# Load every leaf of the node b-tree into @nodes, index them by node id and
# link each node into its parent's children.
#
# corresponds to
# * _pst_build_desc_ptr
# * record_descriptor
#
# @private
def load_node_btree
  @nodes = []
  @node_offsets = []
  load_node_tree header.node_btree, header.node_btree_count, 0x21

  # first create a lookup cache
  @node_from_id = {}
  @nodes.each do |node|
    node.pst = self
    warn "there are duplicate desc records with id #{node.node_id}" if @node_from_id[node.node_id]
    @node_from_id[node.node_id] = node
  end

  # now turn the flat list of loaded desc records into a tree.
  # orphans have no (other) parent, so they're more like the toplevel descs.
  # note, besides a node being its own parent (which usually happens for the
  # root_item it appears), its possible to create other circular structures.
  # maybe add some more checks in here for those
  @orphans = []
  @nodes.each do |node|
    parent = @node_from_id[node.parent_node_id]
    if parent && parent != node
      parent.children << node
    else
      @orphans << node
    end
  end

  # maybe change this to some sort of sane-ness check. orphans are expected
  # warn "have #{@orphans.length} orphan desc record(s)." unless @orphans.empty?
end
655
+
656
# Whether the header identifies this file as the 2003 format.
#
# @return [Boolean]
# @private
def is64
  header.version_2003?
end
661
+
662
# load the flat list of desc records recursively
#
# Reads the b-tree page at +offset+. Leaf pages contribute NodePtr records to
# @nodes; other pages are followed through their TablePtr entries.
#
# corresponds to
# * _pst_build_desc_ptr
# * record_descriptor
#
# @param offset [Integer] file offset of the page
# @param linku1 [Integer] value the page's backlink field must carry
# @param start_val [Integer] expected key of the first entry
# @private
def load_node_tree offset, linku1, start_val
  @node_offsets << offset

  page = pst_read_block_size offset, NodePtr::BLOCK_SIZE, false
  wide = is64
  item_count = page[wide ? ITEM_COUNT_OFFSET_64 : ITEM_COUNT_OFFSET].ord
  level = page[wide ? LEVEL_INDICATOR_OFFSET_64 : LEVEL_INDICATOR_OFFSET].ord

  # not real desc
  this_node_id =
    if wide
      Pst.unpack(page[BACKLINK_OFFSET, 8], "T").first
    else
      page[BACKLINK_OFFSET, 4].unpack("V").first
    end
  raise 'blah 1' unless this_node_id == linku1

  if level == 0
    # leaf page: item_count desc records
    record_size = wide ? NodePtr::SIZE64 : NodePtr::SIZE32
    count_max = wide ? NodePtr::COUNT_MAX64 : NodePtr::COUNT_MAX32

    raise "have too many active items in index (#{item_count})" if item_count > count_max
    Pst.split_per(page, record_size, item_count).each_with_index do |data, i|
      node = NodePtr.new data, wide
      # first entry
      raise 'blah 3' if i == 0 && start_val != 0 && node.node_id != start_val
      # this shouldn't really happen i'd imagine
      break if node.node_id == 0
      @nodes << node
    end
  else
    # inner page: item_count pointers to lower pages
    record_size = wide ? TablePtr::SIZE64 : TablePtr::SIZE32
    count_max = wide ? BlockPtr::COUNT_MAX64 : BlockPtr::COUNT_MAX32

    raise "have too many active items in index (#{item_count})" if item_count > count_max
    Pst.split_per(page, record_size, item_count).each_with_index do |data, i|
      table = TablePtr.new data, wide
      # for the first value, we expect the start to be equal note that ids -1, so even for the
      # first we expect it to be equal. thats the 0x21 (dec 33) desc record. this means we assert
      # that the first desc record is always 33...
      raise 'blah 3' if i == 0 && start_val != -1 && table.start != start_val
      # this shouldn't really happen i'd imagine
      break if table.start == 0
      load_node_tree table.offset, table.u1, table.start
    end
  end
end
714
+
715
# as for idx: look a node up by its id in the cache built by load_node_btree
#
# corresponds to:
# * _pst_getDptr
#
# @param id [Integer]
# @return [NodePtr, nil] nil when no node with that id was loaded
#
# @private
def node_from_id id
  @node_from_id.fetch(id, nil)
end
727
+
728
# Placeholder: extended attributes are not loaded yet, so this does nothing.
#
# corresponds to
# * pst_load_extended_attributes
#
# @return [nil]
# @private
def load_xattrib
  nil
end
734
+
735
# Read +size+ bytes at +offset+, decrypting them when asked to and the file is encrypted.
#
# A short read only produces a warning; the bytes that could be read are returned.
#
# corresponds to:
# * _pst_read_block_size
# * _pst_read_block ??
# * _pst_ff_getIDblock_dec ??
# * _pst_ff_getIDblock ??
#
# @param offset [Integer]
# @param size [Integer]
# @param decrypt [Boolean]
# @return [String]
# @private
def pst_read_block_size offset, size, decrypt=true
  io.seek offset
  # IO#read(length) returns nil rather than "" at end of file; treat that as
  # the shortest possible short read instead of crashing on nil
  buf = io.read(size) || String.new
  warn "tried to read #{size} bytes but only got #{buf.length}" if buf.length != size
  encrypted? && decrypt ? CompressibleEncryption.decrypt(buf) : buf
end
752
+
753
# Append the main data of the node with id +node_id+ to +list+.
#
# @param node_id [Integer]
# @param list [Array<String>] receives the data
# @raise [RuntimeError] if node_id is not an Integer
# @private
def load_node_main_data_to node_id, list
  raise 'node_id must be Integer' unless Integer === node_id
  node = node_from_id node_id
  load_main_block_to node.block_id, list
end
761
+
762
# Append the sub data stored under +local_node_id+ for the node with id +node_id+ to +list+.
#
# @param node_id [Integer]
# @param local_node_id [Integer]
# @param list [Array<String>] receives the data
# @raise [RuntimeError] if either id is not an Integer
# @private
def load_node_sub_data_to node_id, local_node_id, list
  raise 'node_id must be Integer' unless Integer === node_id
  raise 'local_node_id must be Integer' unless Integer === local_node_id
  node = node_from_id node_id
  load_sub_block_to node.sub_block_id, local_node_id, list
end
772
+
773
# for debug
#
# Collect into +list+ the local node ids reachable from the sub block of the
# node with id +node_id+.
#
# @param node_id [Integer] presumably an Integer like the other node ids - confirm
# @param list [Array] receives the local node ids
# @private
def get_local_node_list_to node_id, list
  owner = node_from_id(node_id)
  get_local_node_list_of_sub_block_to(owner.sub_block_id, list)
end
782
+
783
# for debug
#
# Walk the sub block +sub_block_id+ and append the low 32 bits of every node id
# found in it to +list+. For level 0 blocks the walk continues into each entry's
# own sub block. Prints a "WALK" trace line for every block visited.
#
# @param sub_block_id [Integer] 0 means "no sub block" and is ignored
# @param list [Array<Integer>] receives the local node ids
# @private
def get_local_node_list_of_sub_block_to sub_block_id, list
  return if sub_block_id == 0

  sub_block = block_from_id sub_block_id
  p ["WALK",sub_block_id,sub_block]
  raise 'must not be data' if sub_block.data?

  # SLBLOCK or SIBLOCK
  data = sub_block.read
  raise 'btype != 2' if data[0].ord != 2

  wide = is64
  # entries follow a header of 8 bytes (64 bit layout) or 4 bytes (32 bit layout)
  entries_at = wide ? 8 : 4

  case data[1].ord
  when 0 # SLBLOCK
    entry_size = wide ? 24 : 12
    data[2, 2].unpack("v").first.times do |i|
      raw = data[entries_at + entry_size * i, entry_size]
      sl_node_id, _sl_block_id, sl_sub_block_id = wide ? Pst.unpack(raw, "T3") : raw.unpack("V3")

      list << (sl_node_id & 0xffffffff)

      get_local_node_list_of_sub_block_to sl_sub_block_id, list
    end
  when 1 # SIBLOCK
    entry_size = wide ? 16 : 8
    data[2, 2].unpack("v").first.times do |i|
      raw = data[entries_at + entry_size * i, entry_size]
      si_node_id, _si_block_id = wide ? Pst.unpack(raw, "T2") : raw.unpack("V2")

      list << (si_node_id & 0xffffffff)
    end
  else
    raise 'level unk'
  end
end
827
+
828
# Search the sub block +sub_block_id+ for entries whose node id (low 32 bits)
# equals +local_node_id+ and append their data to +list+. For level 0 blocks the
# search continues into each entry's own sub block.
#
# @param sub_block_id [Integer] 0 means "no sub block" and is ignored
# @param local_node_id [Integer]
# @param list [Array<String>] receives the data
# @private
def load_sub_block_to sub_block_id, local_node_id, list
  raise 'sub_block_id must be Integer' unless Integer === sub_block_id
  return if sub_block_id == 0

  sub_block = block_from_id sub_block_id
  raise 'must not be data' if sub_block.data?

  # SLBLOCK or SIBLOCK
  data = sub_block.read
  raise 'btype != 2' if data[0].ord != 2

  wide = is64
  # entries follow a header of 8 bytes (64 bit layout) or 4 bytes (32 bit layout)
  entries_at = wide ? 8 : 4

  case data[1].ord
  when 0 # SLBLOCK
    entry_size = wide ? 24 : 12
    data[2, 2].unpack("v").first.times do |i|
      raw = data[entries_at + entry_size * i, entry_size]
      sl_node_id, sl_block_id, sl_sub_block_id = wide ? Pst.unpack(raw, "T3") : raw.unpack("V3")

      load_main_block_to sl_block_id, list if (sl_node_id & 0xffffffff) == local_node_id

      load_sub_block_to sl_sub_block_id, local_node_id, list
    end
  when 1 # SIBLOCK
    entry_size = wide ? 16 : 8
    data[2, 2].unpack("v").first.times do |i|
      raw = data[entries_at + entry_size * i, entry_size]
      si_node_id, si_block_id = wide ? Pst.unpack(raw, "T2") : raw.unpack("V2")

      next unless (si_node_id & 0xffffffff) == local_node_id

      si_block = block_from_id si_block_id
      raise 'must be data' unless si_block.data?
      list << si_block.read.force_encoding("BINARY")
    end
  else
    raise 'level unk'
  end
end
881
+
882
# Append the data reachable from block +block_id+ to +list+, in order.
#
# A data block contributes its content directly. Any other block must start with
# type byte 1 and level 1 or 2 (XBLOCK or XXBLOCK); the block ids it lists are
# loaded recursively.
#
# @param block_id [Integer] 0 means "no block" and is ignored
# @param list [Array<String>] receives the data
# @raise [FormatError] if block_id does not refer to a loaded block
# @private
def load_main_block_to block_id, list
  return if block_id == 0

  block = block_from_id block_id
  # a dangling reference would otherwise crash with NoMethodError on nil
  raise FormatError, 'no block with id 0x%x' % block_id unless block

  if block.data?
    # this is real data we want
    list << block.read.force_encoding("BINARY")
    return
  end

  # XBLOCK or XXBLOCK
  data = block.read

  btype = data[0].ord
  raise 'btype must be 1' if btype != 1

  level = data[1].ord
  case level
  when 1, 2
    count = data[2, 2].unpack("v").first

    child_ids = (
      is64 ? Pst.unpack(data[8, 8 * count], "T#{count}") : data[8, 4 * count].unpack("V#{count}")
    )
    child_ids.each { |child_id| load_main_block_to child_id, list }
  else
    raise 'level unk'
  end
end
917
+
918
+ #
919
+ # id2
920
+ # ----------------------------------------------------------------------------
921
+ #
922
+
923
+ #
924
+ # main block parsing code. gets raw properties
925
+ # ----------------------------------------------------------------------------
926
+ #
927
+
928
+ # the job of this class, is to take a desc record, and be able to enumerate through the
929
+ # mapi properties of the associated thing.
930
+ #
931
+ # corresponds to
932
+ # * _pst_parse_block
933
+ # * _pst_process (in some ways. although perhaps thats more the Item::Properties#add_property)
934
+ #
935
+ # @private
936
+ class BlockParser
937
+ include Mapi::Types::Constants
938
+
939
+ # @private
940
+ TYPES = {
941
+ 0xbc => 1,
942
+ 0x7c => 2,
943
+ # type 3 is removed. an artifact of not handling the indirect blocks properly in libpst.
944
+ }
945
+
946
+ # @private
947
+ PR_SUBJECT = PropertySet::TAGS.find { |num, (name, type)| name == 'PR_SUBJECT' }.first.hex
948
+ # @private
949
+ PR_BODY_HTML = PropertySet::TAGS.find { |num, (name, type)| name == 'PR_BODY_HTML' }.first.hex
950
+
951
+ # this stuff could maybe be moved to Ole::Types? or leverage it somehow?
952
+ # whether or not a type is immediate is more a property of the pst encoding though i expect.
953
+ # what i probably can add is a generic concept of whether a type is of variadic length or not.
954
+
955
+ # these lists are very incomplete. think they are largely copied from libpst
956
+
957
+ # @private
958
+ IMMEDIATE_TYPES = [
959
+ PT_SHORT, PT_LONG, PT_BOOLEAN
960
+ ]
961
+
962
+ # @private
963
+ INDIRECT_TYPES = [
964
+ PT_DOUBLE, PT_OBJECT,
965
+ 0x0014, # whats this? probably something like PT_LONGLONG, given the correspondence with the
966
+ # ole variant types. (= VT_I8)
967
+ PT_STRING8, PT_UNICODE, # unicode isn't in libpst, but added here for outlook 2003 down the track
968
+ PT_SYSTIME,
969
+ 0x0048, # another unknown
970
+ 0x0102, # this is PT_BINARY vs PT_CLSID
971
+ #0x1003, # these are vector types, but they're commented out for now because i'd expect that
972
+ #0x1014, # there's extra decoding needed that i'm not doing. (probably just need a simple
973
+ # # PT_* => unpack string mapping for the immediate types, and just do unpack('V*') etc
974
+ #0x101e,
975
+ #0x1102
976
+ ]
977
+
978
+ # the attachment and recipient arrays appear to be always stored with these fixed
979
+ # id2 values. seems strange. are there other extra streams? can find out by making higher
980
+ # level IO wrapper, which has the id2 value, and doing the diff of available id2 values versus
981
+ # used id2 values in properties of an item.
982
+
983
+ # @private
984
+ ID2_ATTACHMENTS = 0x671
985
+
986
+ # @private
987
+ ID2_RECIPIENTS = 0x692
988
+
989
+ # Targeting main data, not sub
990
+ USE_MAIN_DATA = -1
991
+
992
+ # @return [NodePtr]
993
+ # @private
994
+ attr_reader :node
995
+
996
+ # @return [Hash<Integer, String>] HID to data block
997
+ # @private
998
+ attr_reader :data_chunks
999
+
1000
# Load every heap page of the node and index its allocations by HID in #data_chunks.
#
# @param node [NodePtr]
# @param local_node_id [Integer] sub node to read, or USE_MAIN_DATA for the node's main data
def initialize(node, local_node_id = USE_MAIN_DATA)
  #raise FormatError, "unable to get associated index record for #{node.inspect}" unless node.block
  @node = node
  @data_chunks = {}

  pages =
    if local_node_id == USE_MAIN_DATA
      node.read_main_array
    else
      node.read_sub_array(local_node_id)
    end

  # see https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-pst/a3fa280c-eba3-434f-86e4-b95141b3c7b1
  # only the first page carries the root header; the data may span multiple pages
  pages.each_with_index do |page, page_index|
    if page_index == 0
      load_root_header(page)
    else
      load_page_header(page, page_index)
    end
  end
end
1020
+
1021
# Parse HNPAGEHDR / HNBITMAPHDR (any heap page after the first one).
#
# Only the leading 16-bit page map offset is read from the header; the allocations
# listed in the page map are then stored in @data_chunks.
#
# @see https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-pst/9c34ecf8-36bc-45a1-a2df-ee35c6dc840a
#
# @param data [String] raw bytes of the page
# @param page_index [Integer] index of the page; becomes the upper 16 bits of each HID
# @raise [FormatError] if the page map is missing or truncated
# @private
def load_page_header data, page_index
  page_map = data.unpack('v').first
  load_page_map data, page_map, page_index
end

# Parse HNHDR (the first heap page).
#
# Sets @heap_type, @offset1 and @type, then stores the page's allocations in @data_chunks.
#
# @see https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-pst/8e4ae05c-3c24-4103-b7e5-ffef6f244834
# @param data [String] raw bytes of the page
# @raise [FormatError] on a truncated header, a bad signature, an unknown heap type or a
#   missing / truncated page map
# @private
def load_root_header data
  page_map, sig, @heap_type, @offset1 = data.unpack 'vCCVV'
  # unpack yields nil for fields that are not there; catch that before formatting them
  raise FormatError, 'heap header is truncated' if @offset1.nil?
  raise FormatError, 'invalid signature 0x%02x' % sig unless sig == 0xec
  raise FormatError, 'unknown block type signature 0x%02x' % @heap_type unless TYPES[@heap_type]
  @type = TYPES[@heap_type]

  load_page_map data, page_map, 0
end

# Read the HNPAGEMAP found at +page_map+ and register every allocation of the page.
#
# The map is a 16-bit allocation count, a 16-bit field that is skipped, and count + 1
# 16-bit offsets; consecutive offsets delimit one allocation each.
#
# @param data [String] raw bytes of the page
# @param page_map [Integer, nil] byte offset of the page map inside +data+
# @param page_index [Integer]
# @raise [FormatError] if the page map is missing or truncated
# @private
def load_page_map data, page_map, page_index
  count_field = page_map && data[page_map, 2]
  unless count_field && count_field.length == 2
    raise FormatError, 'heap page map offset %p is out of range' % [page_map]
  end

  offsets_count = count_field.unpack('v').first + 1
  offsets = data[page_map + 4, 2 * offsets_count].to_s.unpack("v#{offsets_count}")
  raise FormatError, 'heap page map is truncated' if offsets.include?(nil)

  offsets.each_cons(2).with_index(1) do |(from, to), hid_index|
    # convert to HID: allocation index (1-based) in steps of 0x20, page index in the upper 16 bits
    @data_chunks[0x20 * hid_index + 65536 * page_index] = data[from, to - from]
  end
end
1060
+
1061
# based on the value of offset, return either some data from buf, or some data from the
# id2 chain id2, where offset is some key into a lookup table that is stored as the id2
# chain. i think i may need to create a BlockParser class that wraps up all this mess.
#
# This is get_data_indirect_io with the result read into a String. An offset of 0
# yields nil, exactly as get_data_indirect_io and get_data_array do.
#
# corresponds to:
# * _pst_getBlockOffsetPointer
# * _pst_getBlockOffset
#
# @param offset [Integer]
# @return [String, nil]
# @raise [RuntimeError] if offset is not an Integer
# @private
def get_data_indirect offset
  raise "offset must be Integer" unless Integer === offset

  io = get_data_indirect_io(offset)
  io && io.read
end
1077
+
1078
# Resolve data pointed by HNID
#
# An HNID with any of its low five bits set is treated as a NID and read from the
# node's sub nodes; otherwise it is an HID and looked up in #data_chunks.
#
# @see https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-pst/7ac490ce-31af-4a75-97df-eb9d07a003fd
# @param offset [Integer]
# @return [StringIO, nil] nil when offset is 0
# @raise [RuntimeError] if offset is not an Integer, or a NID resolves to more than one page
# @raise [FormatError] if an HID is not present in the heap
# @private
def get_data_indirect_io offset
  raise "offset must be Integer" unless Integer === offset

  return nil if offset == 0

  if (offset & 0x1f) != 0
    # this is NID (node)
    pages = node.read_sub_array(offset)
    raise "local node id #{offset} points multi page count #{pages.count}, use get_data_array() instead" if pages.count >= 2
    pages.empty? ? StringIO.new("") : StringIO.new(pages.first)
  else
    # this is HID (heap)
    chunk = data_chunks[offset]
    # StringIO.new(nil) would only produce an opaque TypeError
    raise FormatError, 'heap id 0x%x does not exist' % offset unless chunk
    StringIO.new chunk
  end
end
1103
+
1104
# Resolve an HNID to the list of data pages it refers to.
#
# @param offset [Integer]
# @return [Array<String>, nil] nil when offset is 0
# @private
def get_data_array(offset)
  raise "offset must be Integer" unless Integer === offset

  return nil if offset == 0
  # no low bits set: this is HID (heap), always a single chunk
  return [data_chunks[offset]] if (offset & 0x1f) == 0

  # otherwise this is NID (node)
  node.read_sub_array(offset)
end
1120
+
1121
# Turn one raw property tuple into its decoded value.
#
# +value+ is normally the 32-bit integer from the property's value slot: the value
# itself for IMMEDIATE_TYPES, a pointer (HNID) for INDIRECT_TYPES and the multi-value
# types. Callers that already hold the raw bytes (ie, value size > 4) pass a String.
#
# For the multi-value types a pointer of 0, or data too short to hold a count, decodes
# to an empty array.
#
# @param key [Integer] property tag
# @param type [Integer] property type
# @param value [Integer, String]
# @return [Array(Integer, Integer, Object)] key, type and the decoded value
# @raise [NotImplementedError] if the property type is not supported
# @private
def handle_indirect_values key, type, value
  case type
  when PT_BOOLEAN
    value = value != 0
  when *IMMEDIATE_TYPES # not including PT_BOOLEAN which we just did above
    # no processing current applied (needed?).
  when *INDIRECT_TYPES
    value = decode_indirect_value key, type, value
  # this is PT_MV_STRING8, i guess.
  # should probably have the 0x1000 flag, and do the or-ring.
  # example of 0x1102 is PR_OUTLOOK_2003_ENTRYIDS. less sure about that one.
  when 0x101e, 0x1102
    # example data:
    # 0x802b "\003\000\000\000\020\000\000\000\030\000\000\000#\000\000\000BusinessCompetitionFavorites"
    # this 0x802b would be an extended attribute for categories / keywords.
    value = split_multi_value read_multi_value_data(value)
    value.map! { |str| StringIO.new str } if type == 0x1102
  when 0x101f
    value = split_multi_value read_multi_value_data(value)
    value.map! { |str| Ole::Types::FROM_UTF16.iconv str }
  when 0x1003 # uint32 array
    data = read_multi_value_data(value)
    # there is no count field
    value = data.unpack("V#{(data.length / 4)}")
  else
    name = begin
      Mapi::Types::DATA[type].first
    rescue StandardError
      nil
    end
    # the dump is only diagnostic; it must never mask the NotImplementedError below
    dump = begin
      String === value ? value : get_data_indirect_io(value).read
    rescue StandardError
      value
    end
    warn '0x%04x %p' % [key, dump]
    raise NotImplementedError, 'unsupported mapi property type - 0x%04x (%p)' % [type, name]
  end
  [key, type, value]
end

# Decode the value of a property whose type is in INDIRECT_TYPES.
#
# @param key [Integer]
# @param type [Integer]
# @param value [Integer, String] pointer, or the raw bytes themselves
# @return [StringIO, String, nil] nil when the pointer is 0
# @private
def decode_indirect_value key, type, value
  # the value is a pointer
  if String === value # ie, value size > 4 above
    value = StringIO.new value
  else
    pages = get_data_array(value)
    value = pages && StringIO.new(pages.join(""))
  end

  # keep strings as immediate values for now, for compatability with how i set up
  # Msg::Properties::ENCODINGS
  if value
    if type == PT_STRING8
      value = node.pst.helper.convert_ansi_str value.read
    elsif type == PT_UNICODE
      value = Ole::Types::FROM_UTF16.iconv value.read
    end
  end

  if key == PR_BODY_HTML and value
    # to keep the msg code happy, which thinks body_html will be an io
    # although, in 2003 version, they are 0102 already
    value = StringIO.new value unless value.respond_to?(:read)
  end

  # special subject handling.
  # a subject may start with a 2 char header, \001 followed by a length, eg:
  #   subject="\001\005RE: blah blah"
  #   subject="\001\001blah blah"
  #   subject="\001\032Out of Office AutoReply: blah blah"
  #   subject="\001\020Undeliverable: blah blah"
  # it looks like it has to do with the placement of the ':' or the ' ', perhaps an
  # optimization to do with thread topic, which is equal to the subject with the prefixes
  # added by mailers (`Yes: `, `Re: `, etc.) removed. the prefix text itself is kept;
  # only the 2 char header is dropped here.
  # possibly value[offset..-1] should be yielded as a second property as well, probably
  # PR_CONVERSATION_TOPIC.
  if key == PR_SUBJECT and String === value and value.length >= 2
    value = value[2..-1] if value[0].ord == 1
  end

  # special handling for embedded objects
  # used for attach_data for attached messages. in which case attach_method should == 5,
  # for embedded object.
  if type == PT_OBJECT and value
    value = value.read if value.respond_to?(:read)
    id2, _unknown = value.unpack 'V2'
    # building a nested Item here (Item.new desc2, RawPropertyStore.new(desc2).to_a) is
    # disabled until properly fixed, so nested messages are temporarily broken: it attached
    # the attachment to itself ad infinitum, and it is only right for an embedded msg. for
    # an embedded ole (eg display_name "Picture (Metafile)") the data is a regular
    # serialized ole object (docfile signature d0cf11e0a1b11ae1...) that needs an
    # Ole::Storage opened on the io instead.
    value = get_data_indirect_io id2
  end

  value
end

# Fetch the raw bytes of a multi-value property.
#
# @param value [Integer, String] pointer, or the raw bytes themselves
# @return [String] empty when the pointer is 0
# @private
def read_multi_value_data value
  return value if String === value

  io = get_data_indirect_io(value)
  io ? io.read : ''
end

# Split count-prefixed multi-value data (32-bit count, then that many 32-bit start
# offsets) into its elements.
#
# @param data [String]
# @return [Array<String>] empty when the count is missing or 0
# @private
def split_multi_value data
  num = data.unpack('V')[0]
  # nil: fewer than four bytes, so there is not even a count
  return [] if num.nil? || num == 0

  offsets = data[4, 4 * num].unpack("V#{num}")
  (offsets + [data.length]).each_cons(2).map { |from, to| data[from...to] }
end
1262
+ end
1263
+
1264
+ =begin
1265
+ * recipients:
1266
+
1267
+ affects: ["0x200764", "0x2011c4", "0x201b24", "0x201b44", "0x201ba4", "0x201c24", "0x201cc4", "0x202504"]
1268
+
1269
+ after adding the rawpropertystoretable fix, all except the second parse properly, and satisfy:
1270
+
1271
+ item.props.display_to == item.recipients.map { |r| r.props.display_name if r.props.recipient_type == 1 }.compact * '; '
1272
+
1273
+ only the second still has a problem
1274
+
1275
+ #[#<struct Pst::Desc desc_id=0x2011c4, idx_id=0x397c, idx2_id=0x398a, parent_desc_id=0x8082>]
1276
+
1277
+ think this is related to a multi block #data3. ie, when you use @x * rec_size, and it
1278
+ goes > 8190, or there abouts, then it stuffs up. probably there is header gunk, or something,
1279
+ similar to when #data is multi block.
1280
+
1281
+ same problem affects the attachment table in test4.
1282
+
1283
+ fixed that issue. round data3 ranges to rec_size.
1284
+
1285
+ fix other issue with attached objects.
1286
+
1287
+ all recipients and attachments in test2 are fine.
1288
+
1289
+ only remaining issue is test4 recipients of 200044. strange.
1290
+
1291
+ =end
1292
+
1293
# RawPropertyStore is used to iterate through the properties of an item, or the auxiliary
# data for an attachment. its just a parser for the way the properties are serialized, when the
# properties don't have to conform to a column structure.
#
# structure of this chunk of data is often
# header, property keys, data values, and then indexes.
# the property keys has value in it. value can be the actual value if its a short type,
# otherwise you lookup the value in the indicies, where you get the offsets to use in the
# main data body. due to the indirect thing though, any of these parts could actually come
# from a separate stream.
#
# @private
class RawPropertyStore < BlockParser
  include Enumerable

  # @return [Integer] number of property tuples
  attr_reader :length

  # Will read Property Context (PC)
  #
  # @see https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-pst/294c83c6-ff92-42f5-b6b6-876c29fa9737
  # @param node [NodePtr]
  # @param local_node_id [Integer]
  # @raise [FormatError] if the heap is not a property context, or its header is missing,
  #   too short or carries the wrong signature
  def initialize node, local_node_id = USE_MAIN_DATA
    super
    bTypePC = 0xbc
    raise FormatError, "expected type 188 - got #{@heap_type}" unless @heap_type == bTypePC

    # the way that offset works, data1 may be a subset of buf, or something from id2. if its from buf,
    # it will be offset based on index_offset and offset. so it could be some random chunk of data anywhere
    # in the thing.
    header_data = get_data_indirect(@offset1) unless @offset1 == 0
    raise FormatError if header_data.nil? || header_data.length < 8
    signature, offset2 = header_data.unpack 'V2'
    raise FormatError, 'invalid Property Context signature 0x%08x' % signature if signature != 0x000602b5
    # this is actually a big chunk of tag tuples. a pointer of 0 means there are none.
    @index_data = offset2 == 0 ? '' : get_data_indirect(offset2)
    @length = @index_data.length / 8
  end

  # iterate through the property tuples
  #
  # @yield [key, type, value]
  # @yieldparam key [Integer]
  # @yieldparam type [Integer]
  # @yieldparam value [Object]
  # @return [Enumerator] when no block is given
  def each
    return to_enum(:each) { length } unless block_given?

    length.times do |i|
      # each tuple is 8 bytes: 16 bit key, 16 bit type, 32 bit value / pointer
      key, type, value = handle_indirect_values(*@index_data[8 * i, 8].unpack('vvV'))
      yield key, type, value
    end
  end
end
1346
+
1347
# RawPropertyStoreTable is kind of like a database table.
# it has a fixed set of columns.
# #[] is kind of like getting a row from the table.
# those rows are currently encapsulated by Row, which has #each like
# RawPropertyStore.
# only used for the recipients array, and the attachments array. completely lazy, doesn't
# load any of the properties upon creation.
#
# @private
class RawPropertyStoreTable < BlockParser
  # TCOLDESC
  # @private
  class Column < Struct.new(:ref_type, :type, :ind2_off, :size, :slot)
    # @param data [String] 8 bytes, unpacked as 'v3CC'
    def initialize data
      super(*data.unpack('v3CC'))
    end

    # @return [String] lower-cased type name, or the type as hex when it cannot be looked up
    def nice_type_name
      Mapi::Types::DATA[ref_type].first[/_(.*)/, 1].downcase
    rescue StandardError
      '0x%04x' % ref_type
    end

    # @return [String] lower-cased property name, or the tag as hex when it cannot be looked up
    def nice_prop_name
      Mapi::PropertyStore::TAGS['%04x' % type].first[/_(.*)/, 1].downcase
    rescue StandardError
      '0x%04x' % type
    end

    def inspect
      "#<#{self.class} name=#{nice_prop_name.inspect}, type=#{nice_type_name.inspect}>"
    end
  end

  include Enumerable

  # @return [Integer] record count
  attr_reader :length
  # @return [String] Array of TCOLDESC
  attr_reader :index_data
  # @return [String] 2.3.2 BTree-on-Heap (BTH)
  attr_reader :data2
  # @return [Array<String>] 2.3.4.4 Row Matrix
  attr_reader :rows_pages
  # @return [Integer] TCI_bm
  attr_reader :rec_size
  # @return [Integer]
  attr_reader :rows_per_page

  # @param node [NodePtr]
  # @param local_node_id [Integer]
  # @raise [FormatError] if the heap is not a table context, or its headers are missing,
  #   truncated or inconsistent
  def initialize node, local_node_id
    super
    bTypeTC = 0x7c
    raise FormatError, "expected type 124 - got #{@heap_type}" unless @heap_type == bTypeTC

    header_data = get_data_indirect(@offset1) unless @offset1 == 0
    raise FormatError, 'table header is missing or truncated' if header_data.nil? || header_data.length < 22
    # seven_c_blk
    # often: u1 == u2 and u3 == u2 + 2, then rec_size == u3 + 4. wtf
    # TCINFO
    seven_c, @num_list, _u1, _u2, _u3, @rec_size, b_five_offset,
      rows_offset, _u7, _u8 = header_data[0, 22].unpack('CCv4V2v2')
    @index_data = header_data[22..-1]

    raise FormatError if @num_list != schema.length or seven_c != 0x7c
    # the row arithmetic below divides by the record size
    raise FormatError, 'record size is zero' if @rec_size == 0
    # another check
    min_size = schema.inject(0) { |total, col| total + col.size }
    # seem to have at max, 8 padding bytes on the end of the record. not sure if it means
    # anything. maybe its just space that hasn't been reclaimed due to columns being
    # removed or something. probably should just check lower bound.
    range = (min_size..min_size + 8)
    warn "rec_size seems wrong (#{range} !=== #{rec_size})" unless range === rec_size

    header_data2 = get_data_indirect(b_five_offset) unless b_five_offset == 0
    raise FormatError if header_data2.nil? || header_data2.length < 8
    signature, offset2 = header_data2.unpack 'V2'
    # ??? seems a bit iffy
    # there's probably more to the differences than this, and the data2 difference below
    expect = node.pst.header.version_2003? ? 0x000404b5 : 0x000204b5
    raise FormatError, 'unhandled block signature 0x%08x' % signature if signature != expect

    # this holds all the row data, possibly spread over multiple blocks.
    # a pointer of 0 means the table rows are empty, no data to be read.
    pages = rows_offset != 0 ? get_data_array(rows_offset) : nil
    # drop pages that could not be resolved, and always keep at least one (empty) page
    pages = Array(pages).compact
    @rows_pages = pages.empty? ? [""] : pages

    # there must be something to the data in data2. i think data2 is the array of objects essentially.
    # currently it is not used for anything but kept for inspection.
    # at size 6, its just some auxiliary data. i'm thinking either Vv/vV, for 97, and something
    # wider for 03. the second value is just the index (0...length), and the first value
    # turned out to be identical to the PR_ATTACHMENT_ID2 values:
    #   id2_values = ie, data2.unpack('v*').to_enum(:each_slice, 3).transpose[0]
    #   table[i].assoc(PR_ATTACHMENT_ID2).last == id2_values[i], for all i.
    # (data2.length / 6.0).ceil may have been ok as the length for 97 files, but the new
    # 0x0004 style block must have different size records, so the length is taken from the
    # row pages instead.
    @data2 = begin
      get_data_indirect(offset2)
    rescue StandardError
      nil
    end

    @rows_per_page = @rows_pages.first.length / @rec_size
    @length = @rows_pages.map { |data| data.length / @rec_size }.sum
  end

  # for debug
  #
  # @return [Array<Column>]
  # @private
  def schema
    @schema ||= Pst.split_per(index_data, 8, -1).map { |data| Column.new data }
  end

  # return grid row
  #
  # @param idx [Integer]
  # @return [Row]
  def [] idx
    Row.new self, idx
  end

  # @yield [row]
  # @yieldparam row [Row]
  # @return [Enumerator] when no block is given
  def each
    return to_enum(:each) { length } unless block_given?

    length.times { |i| yield self[i] }
  end

  # get record data
  #
  # @param record_index [Integer]
  # @return [String]
  # @private
  def get_record record_index
    page_index = record_index / @rows_per_page
    heap_index = record_index % @rows_per_page
    (@rows_pages[page_index])[@rec_size * heap_index, @rec_size]
  end

  # One record of the table; yields its cells as property tuples.
  class Row
    include Enumerable

    # @param array_parser [RawPropertyStoreTable]
    # @param index [Integer]
    def initialize array_parser, index
      @array_parser = array_parser
      @index = index
      @data = @array_parser.get_record(index)
    end

    # iterate through the property tuples
    #
    # @yield [key, type, value]
    # @yieldparam key [Integer]
    # @yieldparam type [Integer]
    # @yieldparam value [Object]
    # @return [Enumerator] when no block is given
    def each
      return to_enum(:each) unless block_given?

      (@array_parser.index_data.length / 8).times do |i|
        ref_type, type, ind2_off, size, _slot = @array_parser.index_data[8 * i, 8].unpack 'v3CC'
        value = @data[ind2_off, size]
        if size <= 4
          # 'V' yields nil unless it gets four bytes, so zero-extend 1, 2 and 3 byte cells
          # first; otherwise every narrow cell decodes to nil (and every boolean to true).
          value = value.ljust(4, "\0") if size > 0
          value = value.unpack('V')[0]
        end
        key, type, value = @array_parser.handle_indirect_values type, ref_type, value
        yield key, type, value
      end
    end
  end
end
1536
+
1537
+ # @private
1538
class AttachmentTable < BlockParser
  # a "fake" MAPI property name for this constant. if you get a mapi property with
  # this value, it is the id2 value to use to get attachment data.
  #
  # @private
  PR_ATTACHMENT_ID2 = 0x67f2

  # @return [NodePtr]
  # @private
  attr_reader :node
  # @return [RawPropertyStoreTable, Array] an empty Array when the node has no attachment table
  # @private
  attr_reader :table

  # Deliberately does not call super: the heap of the node itself is not parsed here,
  # only the attachment table stored under ID2_ATTACHMENTS.
  #
  # @param node [NodePtr]
  def initialize(node)
    @node = node
    @table =
      if @node.has_sub(ID2_ATTACHMENTS)
        RawPropertyStoreTable.new(@node, ID2_ATTACHMENTS)
      else
        []
      end
  end

  # One property list per attachment: the table row, overlaid with the properties of the
  # attachment's own record when the row names one through PR_ATTACHMENT_ID2.
  #
  # @return [Array<Array<Array(Integer, Integer, Object)>>]
  def to_a
    return [] unless table

    table.map do |row|
      props = row.to_a
      # potentially merge with yet more properties
      # this still seems pretty broken - especially the property overlap
      id2_record = props.assoc(PR_ATTACHMENT_ID2)
      if id2_record
        # verify existence of this record
        if @node.has_sub(id2_record.last)
          RawPropertyStore.new(@node, id2_record.last).each do |key, type, value|
            existing = props.assoc(key)
            if existing
              existing.replace [key, type, value]
            else
              props << [key, type, value]
            end
          end
        else
          warn "attachment record is missing"
        end
      end
      props
    end
  end
end
1591
+
1592
+ # there is no equivalent to this in libpst. ID2_RECIPIENTS was just guessed given the above
1593
+ # AttachmentTable.
1594
+ #
1595
+ # @private
1596
class RecipientTable < BlockParser
  # @return [NodePtr]
  # @private
  attr_reader :node
  # @return [RawPropertyStoreTable, Array] an empty Array when the node has no recipient table
  # @private
  attr_reader :table

  # Deliberately does not call super: the heap of the node itself is not parsed here,
  # only the recipient table stored under ID2_RECIPIENTS.
  #
  # @param node [NodePtr]
  def initialize(node)
    @node = node
    @table =
      if @node.has_sub(ID2_RECIPIENTS)
        RawPropertyStoreTable.new(@node, ID2_RECIPIENTS)
      else
        []
      end
  end

  # One property list per recipient row.
  #
  # @return [Array<Array<Array(Integer, Integer, Object)>>]
  def to_a
    return [] unless table

    table.map(&:to_a)
  end
end
1627
+
1628
+ #
1629
+ # higher level item code. wraps up the raw properties above, and gives nice
1630
+ # objects to work with. handles item relationships too.
1631
+ # ----------------------------------------------------------------------------
1632
+ #
1633
+
1634
# Build a PropertySet from raw property tuples. The type of each tuple is ignored; when
# a key occurs more than once the last value wins.
#
# @param property_list [Array<Array(Integer, Integer, Object)>]
# @return [PropertySet]
# @private
def self.make_property_set property_list
  values = property_list.each_with_object({}) do |(key, _type, value), by_key|
    by_key[PropertySet::Key.new(key)] = value
  end
  PropertySet.new values
end
1643
+
1644
class Attachment < Mapi::Attachment
  # @param list [Array<Array(Integer, Integer, Object)>] raw property tuples of the attachment
  def initialize(list)
    super(Pst.make_property_set(list))

    # an attachment whose data is itself an Item is an embedded message
    if Item === props.attach_data
      @embedded_msg = props.attach_data
    end
  end
end
1651
+
1652
class Recipient < Mapi::Recipient
  # @param list [Array<Array(Integer, Integer, Object)>] raw property tuples of the recipient
  def initialize(list)
    property_set = Pst.make_property_set(list)
    super(property_set)
  end
end
1657
+
1658
class Item < Mapi::Message
  # @private
  class EntryID < Struct.new(:u1, :entry_id, :id)
    # NOTE(review): 'A16' strips trailing NUL and space bytes, so entry_id can come out
    # shorter than 16 bytes; 'a16' would keep it intact. left as is because only #id is
    # read in this class - confirm no other user before changing.
    UNPACK_STR = 'VA16V'

    # @param data [String, Array] packed entry id, or its already unpacked fields
    def initialize data
      data = data.unpack(UNPACK_STR) if String === data
      super(*data)
    end
  end

  include RecursivelyEnumerable

  # Obtain item type
  #
  # - `:folder`
  # - `:message`
  # - `:wastebasket`
  #
  # @return [Symbol]
  attr_accessor :type

  # @return [Item]
  attr_accessor :parent

  # @param node [NodePtr]
  # @param list [Array]
  # @param type [Object, nil] when nil the type is guessed from the properties
  def initialize node, list, type=nil
    @node = node
    super Pst.make_property_set(list)

    # this is kind of weird, but the ids of the special folders are stored in a hash
    # when the root item is loaded
    if ipm_wastebasket_entryid
      node.pst.special_folder_ids[ipm_wastebasket_entryid] = :wastebasket
    end

    if finder_entryid
      node.pst.special_folder_ids[finder_entryid] = :finder
    end

    # and then here, those are used, along with a crappy heuristic to determine if we are an
    # item.
    # i think the low bits of the desc_id can give some info on the type.
    # it seems that 0x4 is for regular messages (and maybe contacts etc)
    # 0x2 is for folders, and 0x8 is for special things like rules etc, that aren't visible.
    unless type
      looks_like_folder = props.valid_folder_mask || ipm_subtree_entryid || props.content_count || props.subfolders
      type = looks_like_folder ? :folder : :message
      if type == :folder
        type = node.pst.special_folder_ids[node.node_id] || type
      end
    end

    @type = type
  end

  # Yield every direct child as an Item whose parent is set to this item.
  #
  # For the item that names the top of the personal folders, the children are those of
  # that folder, plus the wastebasket and search root folders when not already among them.
  #
  # @yield [item]
  # @yieldparam item [Item]
  # @return [void, Enumerator] an Enumerator when no block is given
  def each_child
    return to_enum(:each_child) unless block_given?

    subtree_id = ipm_subtree_entryid
    child_nodes =
      if subtree_id
        root = @node.pst.node_from_id subtree_id
        raise "couldn't find root" unless root
        raise 'both kinds of children' unless @node.children.empty?
        # lets look up the other ids we have.
        # typically the wastebasket one "deleted items" is in the children already, but
        # the search folder isn't.
        extras = [ipm_wastebasket_entryid, finder_entryid].compact.map do |extra_id|
          extra_root = @node.pst.node_from_id extra_id
          warn "couldn't find root for id #{extra_id}" unless extra_root
          extra_root
        end.compact
        # i do this instead of union, so as not to mess with the order of the
        # existing children.
        root.children + (extras - root.children)
      else
        @node.children
      end

    child_nodes.each do |child_node|
      item = @node.pst.pst_parse_item(child_node)
      item.parent = self
      yield item
    end
  end

  # Display names of the ancestors (without the root), joined by '/'.
  #
  # @return [String]
  # @raise [RuntimeError] if an ancestor has no display name
  def path
    parents, item = [], self
    parents.unshift item while item = item.parent
    # remove root
    parents.shift
    parents.map { |ancestor| ancestor.props.display_name or raise 'unable to construct path' } * '/'
  end

  # Enumerate direct children
  #
  # @return [Array<Item>]
  def children
    to_enum(:each_child).to_a
  end

  # these are still around because they do different stuff

  # Top of Personal Folder Record
  #
  # @return [Integer, nil] nil when the property is absent or cannot be decoded
  # @private
  def ipm_subtree_entryid
    @ipm_subtree_entryid ||= EntryID.new(props.ipm_subtree_entryid.read).id rescue nil
  end

  # Deleted Items Folder Record
  #
  # @return [Integer, nil] nil when the property is absent or cannot be decoded
  # @private
  def ipm_wastebasket_entryid
    @ipm_wastebasket_entryid ||= EntryID.new(props.ipm_wastebasket_entryid.read).id rescue nil
  end

  # Search Root Record
  #
  # @return [Integer, nil] nil when the property is absent or cannot be decoded
  # @private
  def finder_entryid
    @finder_entryid ||= EntryID.new(props.finder_entryid.read).id rescue nil
  end

  # valid_folder_mask (0x35df, states which folders are valid for this message store),
  # content_count (0x3602, number of emails stored in a folder) and subfolders (0x360a,
  # has children) used to be methods here; they are now reached through props.

  # i think i will change these, so they can inherit the lazyness from RawPropertyStoreTable.
  # so if you want the last attachment, you can get it without creating the others perhaps.
  # it just has to handle the no table at all case a bit more gracefully.

  # @return [Array<Attachment>]
  def attachments
    @attachments ||= AttachmentTable.new(@node).to_a.map { |list| Attachment.new list }
  end

  # @return [Array<Recipient>]
  def recipients
    @recipients ||= RecipientTable.new(@node).to_a.map { |list| Recipient.new list }
  end

  # Iterate children (except on this instance) recursively stored in this MessageStore.
  #
  # @yield [item]
  # @yieldparam item [Item]
  # @return [void, Enumerator] an Enumerator when no block is given
  def each_recursive(&block)
    return to_enum(:each_recursive) unless block

    children.each do |child|
      block[child]
      child.each_recursive(&block)
    end
  end

  def inspect
    attrs = %w[display_name subject sender_name subfolders]
    # attrs = %w[display_name valid_folder_mask ipm_wastebasket_entryid finder_entryid content_count subfolders]
    str = attrs.map { |a| b = props.send a; " #{a}=#{b.inspect}" if b }.compact * ','

    type_s =
      case type
      when :message then 'Message'
      when :folder then 'Folder'
      else type.to_s.capitalize + 'Folder'
      end
    str2 = 'node_id=0x%x' % @node.node_id

    !str.empty? ? "#<Pst::#{type_s} #{str2}#{str}>" : "#<Pst::#{type_s} #{str2} props=#{props.inspect}>" #\n" + props.transport_message_headers + ">"
  end
end
1846
+
1847
# Build the Item for a node from the node's property context.
#
# corresponds to
# * _pst_parse_item
#
# @param node [NodePtr]
# @return [Item]
# @private
def pst_parse_item(node)
  property_list = RawPropertyStore.new(node).to_a
  Item.new(node, property_list)
end
1856
+
1857
+ #
1858
+ # other random code
1859
+ # ----------------------------------------------------------------------------
1860
+ #
1861
+
1862
# Print a human-readable dump of the pst internals to stdout, in order:
# the header, how the known byte ranges of the file are laid out (with
# gaps, overlaps and padding checks), the idx entries, the node tree and
# the item tree.
#
# Moves the position of @io while checking padding bytes.
#
# @return [void] the return value is not meaningful
# @private
def dump_debug_info
  puts "* pst header"
  p header

  # Looking at the output of this, for blank-o1997.pst, i see this part:
  #   ...
  #   - (26624,516) desc block data (overlap of 4 bytes)
  #   - (27136,516) desc block data (gap of 508 bytes)
  #   - (28160,516) desc block data (gap of 2620 bytes)
  #   ...
  #
  # which confirms my belief that the block size for idx and desc is more likely 512
  puts '* file range usage'
  file_ranges =
    # these 3 things, should account for most of the data in the file.
    [[0, Header::SIZE, 'pst file header']] +
    @block_offsets.map { |offset| [offset, BlockPtr::BLOCK_SIZE, 'block data'] } +
    @node_offsets.map { |offset| [offset, NodePtr::BLOCK_SIZE, 'node data'] } +
    @blocks.map { |idx| [idx.offset, idx.size, 'idx id=0x%x (%s)' % [idx.id, idx.type]] }

  # i think there is a padding of the size out to 64 bytes
  # which is equivalent to padding out the final offset, because i think the offset is
  # similarly oriented
  pad_amount = 64

  # a trailing nil is appended so that the last range is also visited, paired
  # with "no next record"
  (file_ranges.sort_by(&:first) + [nil]).each_cons(2) do |(offset, size, name), next_record|
    warn 'i am wrong about the offset padding' if offset % pad_amount != 0
    # so, assuming i'm not wrong about that, then we can calculate how much padding is needed.
    pad = pad_amount - (size % pad_amount)
    pad = 0 if pad == pad_amount
    gap = next_record ? next_record.first - (offset + size + pad) : 0
    extra =
      if gap < 0
        ["overlap of #{gap.abs} bytes"]
      elsif gap > 0
        ["gap of #{gap} bytes"]
      else
        []
      end
    # how about we check that padding
    @io.pos = offset + size
    pad_bytes = @io.read(pad)
    extra += ["padding not all zero"] unless pad_bytes == 0.chr * pad
    puts "- #{offset}:#{size}+#{pad} #{name.inspect}" + (extra.empty? ? '' : ' [' + extra * ', ' + ']')
  end

  # i think the idea of the idx, and indeed the idx2, is just to be able to
  # refer to data indirectly, which means it can get moved around, and you just update
  # the idx table. it is simply a list of file offsets and sizes.
  # not sure i get how id2 plays into it though....
  # the sizes seem to be all even. is that a coincidence? and the ids are all even. that
  # seems to be related to something else (see the (id & 2) == 1 stuff)
  puts '* idx entries'
  @blocks.each { |idx| puts "- #{idx.inspect}" }

  # if you look at the desc tree, you notice a few things:
  # 1. there is a desc that seems to be the parent of all the folders, messages etc.
  #    it is the one whose parent is itself.
  #    one of its children is referenced as the subtree_entryid of the first desc item,
  #    the root.
  # 2. typically only 2 types of desc records have idx2_id != 0. messages themselves,
  #    and the desc with id = 0x61 - the xattrib container. everything else uses the
  #    regular ids to find its data. i think it should be reframed as small blocks and
  #    big blocks, but i'll look into it more.
  #
  # idx_id and idx2_id are for getting to the data. desc_id and parent_desc_id just define
  # the parent <-> child relationship, and the desc_ids are how the items are referred to in
  # entryids.
  # note that these aren't unique! eg for 0, 4 etc. i expect these'd never change, as the ids
  # are stored in entryids. whereas the idx and idx2 could be a bit more volatile.
  puts '* node tree'
  # make a dummy root hold everything just for convenience
  root = NodePtr.new ''
  def root.inspect; "#<Pst::Root>"; end
  root.children.replace @orphans
  # this still loads the whole thing as a string for gsub. should use direct output io
  # version.
  puts root.to_tree.gsub(/, (parent_node_id|idx2_id)=0x0(?!\d)/, '')

  # this is fairly easy to understand, its just an attempt to display the pst items in a tree form
  # which resembles what you'd see in outlook.
  puts '* item tree'
  # now streams directly
  root_item.to_tree STDOUT
end
1947
+
1948
# The node used as the root of the store: the first entry of @nodes
# (nil when @nodes is empty).
#
# @return [NodePtr]
# @private
def root_desc
  nodes = @nodes
  nodes.first
end
1953
+
1954
# Parse the root node into an Item and mark its type as :root.
#
# A new Item is parsed on every call; nothing is cached here.
#
# @return [Item]
# @private
def root_item
  pst_parse_item(root_desc).tap { |parsed| parsed.type = :root }
end
1961
+
1962
# Public entry point for the root item of this store; delegates to
# #root_item.
#
# @return [Item]
def root
  root_item()
end
1968
+
1969
+ # depth first search of all items
1970
+ include Enumerable
1971
+
1972
# Iterate all kinds of items stored in this MessageStore.
#
# The root item is yielded first, followed by everything
# Item#each_recursive yields for it.
#
# When called without a block an Enumerator is returned, as is conventional
# for classes that include Enumerable.
#
# @yield [message]
# @yieldparam message [Item]
# @return [Enumerator] when no block is given; otherwise the return value
#   is not meaningful
def each(&block)
  return to_enum(:each) unless block

  top = root
  block.call(top)
  top.each_recursive(&block)
end
1982
+
1983
# Display name of this MessageStore, read from the root item's properties
# on first use and cached afterwards.
#
# @return [String]
def name
  return @name if @name

  @name = root_item.props.display_name
end
1989
+
1990
# Short debugging representation showing the store's display name and the
# io it reads from.
#
# @return [String]
def inspect
  shown_name = name.inspect
  shown_io = io.inspect
  "#<Pst name=#{shown_name} io=#{shown_io}>"
end
1993
+ end
1994
+ end
1995
+