libis-mapi 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/mapi/pst.rb ADDED
@@ -0,0 +1,1995 @@
1
+ #
2
+ # = Introduction
3
+ #
4
+ # This file is mostly an attempt to port libpst to ruby, and simplify it in the process. It
5
+ # will leverage much of the existing MAPI => MIME conversion developed for Msg files, and as
6
+ # such is purely concerned with the file structure details.
7
+ #
8
+ # = TODO
9
+ #
10
+ # 1. solve recipient table problem (test4).
11
+ # this is done. turns out it was due to id2 clashes. find better solution
12
+ # 2. check parse consistency. an initial conversion of a 30M file to pst, shows
13
+ # a number of messages converting badly. compare with libpst too.
14
+ # 3. xattribs
15
+ # 4. generalise the Mapi stuff better
16
+ # 5. refactor index load
17
+ # 6. msg serialization?
18
+ #
19
+
20
+ =begin
21
+
22
+ quick plan for cleanup.
23
+
24
+ have working tests for 97 and 03 file formats, so safe.
25
+
26
+ want to fix up:
27
+
28
+ 64 bit unpacks scattered around. it's ugly. not sure how best to handle it, but am slightly tempted
29
+ to override String#unpack to support a 64 bit little endian unpack (like L vs N/V, for Q). one way or
30
+ another need to fix it. Could really slow everything else down if its parsing the unpack strings twice,
31
+ once in ruby, for every single unpack i do :/
32
+
33
+ the index loading process, and the lack of shared code between normal vs 64 bit variants, and Index vs Desc.
34
+ should be able to reduce code by factor of 4. also think I should move load code into the class too. then
35
+ maybe have something like:
36
+
37
+ class Header
38
+ def index_class
39
+ version_2003 ? Index64 : Index
40
+ end
41
+ end
42
+
43
+ def load_idx
44
+ header.index_class.load_index
45
+ end
46
+
47
+ OR
48
+
49
+ def initialize
50
+ @header = ...
51
+ extend @header.index_class::Load
52
+ load_idx
53
+ end
54
+
55
+ need to think about the role of the mapi code, and Pst::Item etc, but that layer can come later.
56
+
57
+ =end
58
+
59
+ require 'mapi'
60
+ require 'enumerator'
61
+ require 'ostruct'
62
+ require 'ole/ranges_io'
63
+ require 'mapi/helper'
64
+
65
+ module Mapi
66
+ # Read Outlook's pst file
67
+ class Pst
68
+ class FormatError < StandardError
69
+ end
70
+
71
+ # unfortunately there is no Q analogue which is little endian only.
72
+ # this translates T as an unsigned quad word, little endian byte order, to
73
+ # not pollute the rest of the code.
74
+ #
75
+ # didn't want to override String#unpack, cause its too hacky, and incomplete.
76
+ #
77
+ # @param str [String]
78
+ # @param unpack_spec [String]
79
+ # @return [Array]
80
+ # @private
81
def self.unpack str, unpack_spec
  # Specs without the custom 'T' directive go straight to String#unpack.
  return str.unpack(unpack_spec) unless unpack_spec['T']

  # Translating a spec is done once per distinct spec string and cached as
  # [positions of the T values in the result, spec with every T turned into 2 x V].
  @unpack_cache ||= {}
  quad_positions, native_spec = @unpack_cache[unpack_spec] ||= begin
    positions = []
    translated = +''
    cursor = 0
    unpack_spec.scan(/([^\d])_?(\*|\d+)?/o) do |directive, repeat|
      # 'a'/'A' yield a single value whatever their count is.
      produced = directive.downcase == 'a' ? 1 : (repeat || 1).to_i
      if directive == 'T'
        positions.concat((cursor...(cursor + produced)).to_a)
        translated << "V#{produced * 2}"
      else
        translated << Regexp.last_match(0)
      end
      cursor += produced
    end
    [positions, translated]
  end

  values = str.unpack(native_spec)
  # Fold each low/high pair of 32 bit words into one value. Positions are
  # expressed in terms of the already folded array, so walk them in order.
  quad_positions.each do |position|
    low_word, high_word = values[position, 2]
    values[position, 2] = (low_word && high_word) ? low_word + (high_word << 32) : nil
  end
  values
end
108
+
109
+ # @param str [String]
110
+ # @param size [Integer]
111
+ # @param count [Integer]
112
+ # @return [Array<String>]
113
+ # @private
114
def self.split_per str, size, count
  # A negative count means "as many whole records as fit in str".
  count = str.length / size if count < 0
  # Slices that start beyond the end of str come back as nil.
  (0...count).map { |record| str[size * record, size] }
end
120
+
121
+ #
122
+ # this is the header and encryption encapsulation code
123
+ # ----------------------------------------------------------------------------
124
+ #
125
+
126
+ # class which encapsulates the pst header
127
+ #
128
+ # @private
129
class Header
  # Parses the fixed-size header at the start of a pst file: signature,
  # format version, encryption type, file size and the roots of the node
  # and block b-trees.

  SIZE = 512
  MAGIC = 0x2142444e

  # these are the constants defined in libpst.c, that
  # are referenced in pst_open()
  INDEX_TYPE_OFFSET = 0x0A
  FILE_SIZE_POINTER = 0xA8
  FILE_SIZE_POINTER_64 = 0xB8
  SECOND_POINTER = 0xBC
  INDEX_POINTER = 0xC4
  SECOND_POINTER_64 = 0xE0
  INDEX_POINTER_64 = 0xF0
  ENC_OFFSET = 0x1CD

  # @return [Integer]
  attr_reader :magic
  # @return [Integer]
  attr_reader :index_type
  # @return [Integer]
  attr_reader :encrypt_type
  # @return [Integer]
  attr_reader :size
  # @return [Integer]
  attr_reader :block_btree_count
  # @return [Integer]
  attr_reader :block_btree
  # @return [Integer]
  attr_reader :node_btree_count
  # @return [Integer]
  attr_reader :node_btree
  # @return [Integer, nil] 1997, 2003, or nil for an unknown index type
  attr_reader :version

  # @param data [String] the first SIZE bytes of the file
  # @raise [FormatError] if the signature, index type or encryption type is not supported
  def initialize data
    @magic = data.unpack('N')[0]
    @index_type = data[INDEX_TYPE_OFFSET].ord
    @version = {0x0e => 1997, 0x17 => 2003, 0x24 => 2003}[@index_type]

    if version_2003?
      # The location of the encryption byte for this format is not known to
      # this code, so compressible encryption (type 1) is assumed.
      @encrypt_type = 1

      @node_btree_count, @node_btree = Pst.unpack(data[SECOND_POINTER_64 - 8, 16], "T2")
      @block_btree_count, @block_btree = Pst.unpack(data[INDEX_POINTER_64 - 8, 16], "T2")

      # The file size field of the 64 bit format is 8 bytes wide (see MS-PST,
      # ROOT.ibFileEof). Reading only the low 4 bytes truncates the size of
      # files of 4GiB and more, which then fail the size check done by Pst.
      size_low, size_high = data[FILE_SIZE_POINTER_64, 8].unpack('V2')
      @size = size_low + (size_high << 32)
    else
      @encrypt_type = data[ENC_OFFSET].ord

      @node_btree_count, @node_btree = data[SECOND_POINTER - 4, 8].unpack('V2')
      @block_btree_count, @block_btree = data[INDEX_POINTER - 4, 8].unpack('V2')

      @size = data[FILE_SIZE_POINTER, 4].unpack('V')[0]
    end

    validate!
  end

  # return `true` if pst is an Unicode version. Unicode version also uses 64-bit file pointer.
  # otherwise return `false` where pst is an ANSI version. ANSI version uses 32-bit file pointer.
  #
  # @return [Boolean]
  def version_2003?
    version == 2003
  end

  # @return [Boolean] true for any encryption type other than 0
  def encrypted?
    encrypt_type != 0
  end

  # Checks the parsed fields against what this code can handle.
  #
  # @raise [FormatError]
  def validate!
    raise FormatError, "bad signature on pst file (#{'0x%x' % magic})" unless magic == MAGIC
    raise FormatError, "only index types 0x0e, 0x17 and 0x24 are handled (#{'0x%x' % index_type})" unless [0x0e, 0x17, 0x24].include?(index_type)
    raise FormatError, "only encrytion types 0 and 1 are handled (#{encrypt_type.inspect})" unless [0, 1].include?(encrypt_type)
  end
end
212
+
213
+ # compressible encryption! :D
214
+ #
215
+ # simple substitution. see libpst.c
216
+ # maybe test switch to using a String#tr!
217
+ #
218
+ # @private
219
class CompressibleEncryption
  # Byte-for-byte substitution cipher ("compressible encryption", see
  # libpst.c). DECRYPT_TABLE maps an encrypted byte value to its plain value,
  # ENCRYPT_TABLE is the inverse mapping.

  DECRYPT_TABLE = [
    0x47, 0xf1, 0xb4, 0xe6, 0x0b, 0x6a, 0x72, 0x48,
    0x85, 0x4e, 0x9e, 0xeb, 0xe2, 0xf8, 0x94, 0x53, # 0x0f
    0xe0, 0xbb, 0xa0, 0x02, 0xe8, 0x5a, 0x09, 0xab,
    0xdb, 0xe3, 0xba, 0xc6, 0x7c, 0xc3, 0x10, 0xdd, # 0x1f
    0x39, 0x05, 0x96, 0x30, 0xf5, 0x37, 0x60, 0x82,
    0x8c, 0xc9, 0x13, 0x4a, 0x6b, 0x1d, 0xf3, 0xfb, # 0x2f
    0x8f, 0x26, 0x97, 0xca, 0x91, 0x17, 0x01, 0xc4,
    0x32, 0x2d, 0x6e, 0x31, 0x95, 0xff, 0xd9, 0x23, # 0x3f
    0xd1, 0x00, 0x5e, 0x79, 0xdc, 0x44, 0x3b, 0x1a,
    0x28, 0xc5, 0x61, 0x57, 0x20, 0x90, 0x3d, 0x83, # 0x4f
    0xb9, 0x43, 0xbe, 0x67, 0xd2, 0x46, 0x42, 0x76,
    0xc0, 0x6d, 0x5b, 0x7e, 0xb2, 0x0f, 0x16, 0x29, # 0x5f
    0x3c, 0xa9, 0x03, 0x54, 0x0d, 0xda, 0x5d, 0xdf,
    0xf6, 0xb7, 0xc7, 0x62, 0xcd, 0x8d, 0x06, 0xd3, # 0x6f
    0x69, 0x5c, 0x86, 0xd6, 0x14, 0xf7, 0xa5, 0x66,
    0x75, 0xac, 0xb1, 0xe9, 0x45, 0x21, 0x70, 0x0c, # 0x7f
    0x87, 0x9f, 0x74, 0xa4, 0x22, 0x4c, 0x6f, 0xbf,
    0x1f, 0x56, 0xaa, 0x2e, 0xb3, 0x78, 0x33, 0x50, # 0x8f
    0xb0, 0xa3, 0x92, 0xbc, 0xcf, 0x19, 0x1c, 0xa7,
    0x63, 0xcb, 0x1e, 0x4d, 0x3e, 0x4b, 0x1b, 0x9b, # 0x9f
    0x4f, 0xe7, 0xf0, 0xee, 0xad, 0x3a, 0xb5, 0x59,
    0x04, 0xea, 0x40, 0x55, 0x25, 0x51, 0xe5, 0x7a, # 0xaf
    0x89, 0x38, 0x68, 0x52, 0x7b, 0xfc, 0x27, 0xae,
    0xd7, 0xbd, 0xfa, 0x07, 0xf4, 0xcc, 0x8e, 0x5f, # 0xbf
    0xef, 0x35, 0x9c, 0x84, 0x2b, 0x15, 0xd5, 0x77,
    0x34, 0x49, 0xb6, 0x12, 0x0a, 0x7f, 0x71, 0x88, # 0xcf
    0xfd, 0x9d, 0x18, 0x41, 0x7d, 0x93, 0xd8, 0x58,
    0x2c, 0xce, 0xfe, 0x24, 0xaf, 0xde, 0xb8, 0x36, # 0xdf
    0xc8, 0xa1, 0x80, 0xa6, 0x99, 0x98, 0xa8, 0x2f,
    0x0e, 0x81, 0x65, 0x73, 0xe4, 0xc2, 0xa2, 0x8a, # 0xef
    0xd4, 0xe1, 0x11, 0xd0, 0x08, 0x8b, 0x2a, 0xf2,
    0xed, 0x9a, 0x64, 0x3f, 0xc1, 0x6c, 0xf9, 0xec  # 0xff
  ]

  ENCRYPT_TABLE = [nil] * 256
  DECRYPT_TABLE.each_with_index { |i, j| ENCRYPT_TABLE[i] = j }

  # Table driven decryption, working on byte values.
  #
  # Indexing a String yields a 1-character String on ruby >= 1.9, which can
  # not be used as an array index, and appending an Integer to a String
  # appends a code point rather than a byte. Going through unpack/pack keeps
  # this byte oriented on every ruby version.
  #
  # @param encrypted [String]
  # @return [String] binary string of the same byte length
  def self.decrypt_alt encrypted
    encrypted.unpack('C*').map { |byte| DECRYPT_TABLE[byte] }.pack('C*')
  end

  # Inverse of decrypt_alt.
  #
  # @param decrypted [String]
  # @return [String] binary string of the same byte length
  def self.encrypt_alt decrypted
    decrypted.unpack('C*').map { |byte| ENCRYPT_TABLE[byte] }.pack('C*')
  end

  # an alternate implementation that is possibly faster....
  # TODO - bench
  # The characters ^, - and \ are escaped because they are special to String#tr.
  DECRYPT_STR, ENCRYPT_STR = [DECRYPT_TABLE, (0...256)].map do |values|
    values.map { |i| i.chr }.join.gsub(/([\^\-\\])/, "\\\\\\1")
  end

  # @param encrypted [String]
  # @return [String]
  def self.decrypt encrypted
    encrypted.tr ENCRYPT_STR, DECRYPT_STR
  end

  # @param decrypted [String]
  # @return [String]
  def self.encrypt decrypted
    decrypted.tr DECRYPT_STR, ENCRYPT_STR
  end
end
284
+
285
+ # @return [IO]
286
+ # @private
287
+ attr_reader :io
288
+
289
+ # @return [Header]
290
+ # @private
291
+ attr_reader :header
292
+
293
+ # @return [Array<BlockPtr>]
294
+ # @private
295
+ attr_reader :blocks
296
+
297
+ # @return [Array<NodePtr>]
298
+ # @private
299
+ attr_reader :nodes
300
+
301
+ # @return [Hash<Integer, Symbol>]
302
+ # @private
303
+ attr_reader :special_folder_ids
304
+
305
+ # @return [Helper]
306
+ # @private
307
+ attr_reader :helper
308
+
309
+ # @param io [IO]
310
+ # @param helper [Helper,nil]
311
def initialize io, helper=nil
  # corresponds to
  # * pst_open
  # * pst_load_index
  #
  # Reads and validates the header, then loads the block and node b-trees.
  # Raises FormatError when the header is truncated, invalid, or disagrees
  # with the size of the io.

  @io = io
  io.pos = 0
  @helper = helper || Helper.new

  # IO#read returns nil (empty file) or a short string (truncated file) when
  # fewer than Header::SIZE bytes are available; report that as a format
  # problem instead of failing later on a nil field.
  header_data = io.read(Header::SIZE)
  available = header_data ? header_data.length : 0
  raise FormatError, "pst file too small to contain a header (#{available} of #{Header::SIZE} bytes)" unless available == Header::SIZE
  @header = Header.new header_data

  # would prefer this to be in Header#validate, but it doesn't have the io size.
  # should perhaps downgrade this to just be a warning...
  raise FormatError, "header size field invalid (#{header.size} != #{io.size})" unless header.size == io.size

  load_block_btree
  load_node_btree
  load_xattrib

  @special_folder_ids = {}
end
331
+
332
+ # @return [Boolean]
333
+ # @private
334
def encrypted?
  # Simply forwards the question to the parsed header.
  header.encrypted?
end
337
+
338
+ # until i properly fix logging...
339
+ #
340
+ # @private
341
def warn s
  # Route warnings of this object to the Mapi logger.
  Mapi::Log.warn(s)
end
344
+
345
+ #
346
+ # this is the index and desc record loading code
347
+ # ----------------------------------------------------------------------------
348
+ #
349
+
350
+ # @private
351
+ ToTree = Module.new
352
+
353
+ # more constants from libpst.c
354
+ # these relate to the index block
355
+ # @private
356
+ ITEM_COUNT_OFFSET = 0x1f0 # count byte
357
+
358
+ # @private
359
+ LEVEL_INDICATOR_OFFSET = 0x1f3 # node or leaf
360
+
361
+ # @private
362
+ BACKLINK_OFFSET = 0x1f8 # backlink u1 value
363
+
364
+ # these 3 classes are used to hold various file records
365
+
366
+ # pst_index
367
+ #
368
+ # @private
369
class BlockPtr < Struct.new(:id, :offset, :size, :u1)
  # One leaf record of the block b-tree: the id of a block together with
  # the file offset and size of its contents.

  UNPACK_STR32 = 'VVvv'
  UNPACK_STR64 = 'TTvv'
  SIZE32 = 12
  SIZE64 = 24
  BLOCK_SIZE = 512 # index blocks was 516 but bogus
  COUNT_MAX32 = 41 # max active items (ITEM_COUNT_OFFSET / SIZE32 = 41)
  COUNT_MAX64 = 20 # bit of a guess really. 512 / 24 = 21, but doesn't leave enough header room

  # @return [Pst]
  attr_accessor :pst

  # @param data [String, Array] raw record, or the already unpacked field values
  # @param is64 [Boolean] selects the 64 bit record layout when data is a String
  def initialize data, is64
    data = Pst.unpack data, (is64 ? UNPACK_STR64 : UNPACK_STR32) if String === data
    super(*data)
  end

  # Classifies the block. Data blocks are recognised from the id alone, for
  # the others the first two bytes of the block contents are inspected.
  #
  # @return [Symbol] :data, :data_chain_header or :id2_assoc
  # @raise [FormatError] if the leading bytes are not a known combination
  def type
    @type ||= begin
      if data?
        :data
      else
        first_byte, second_byte = read.unpack('CC')
        case first_byte
        when 1
          # raising a bare Integer is itself a TypeError, so name the problem
          raise FormatError, 'unexpected second byte for data chain header block - %p' % second_byte unless second_byte == 1
          :data_chain_header
        when 2
          raise FormatError, 'unexpected second byte for id2 assoc block - %p' % second_byte unless second_byte == 0
          :id2_assoc
        else
          raise FormatError, 'unknown first byte for block - %p' % first_byte
        end
      end
    end
  end

  # @return [Boolean] true if bit 0x2 of the id is clear
  def data?
    (id & 0x2) == 0
  end

  # Reads the contents of the block through the owning pst.
  #
  # @param decrypt [Boolean] ignored for non data blocks
  # @return [String]
  def read decrypt=true
    # only data blocks are ever encrypted
    decrypt = false unless data?
    pst.pst_read_block_size offset, size, decrypt
  end

  # show all numbers in hex, and include the block type.
  # (the substitution used to look for the former struct name "Index", so
  # the type never made it into the output)
  def inspect
    super.gsub(/=(\d+)/) { '=0x%x' % $1.to_i }.sub(/BlockPtr /, "BlockPtr type=#{type.inspect}, ")
  end
end
425
+
426
+ # mostly guesses.
427
+
428
+ # @private
429
+ ITEM_COUNT_OFFSET_64 = 0x1e8
430
+
431
+ # @private
432
+ LEVEL_INDICATOR_OFFSET_64 = 0x1eb # diff of 3 between these 2 as above...
433
+
434
+ # _pst_table_ptr_struct
435
+ #
436
+ # @private
437
class TablePtr < Struct.new(:start, :u1, :offset)
  # Record of a non-leaf b-tree page, pointing at a page one level down.

  UNPACK_STR32 = 'V3'
  UNPACK_STR64 = 'T3'
  SIZE32 = 12
  SIZE64 = 24

  # @param data [String, Array] raw record, or the already unpacked field values
  # @param is64 [Boolean] selects the 64 bit record layout when data is a String
  def initialize data, is64
    fields = data
    if data.is_a?(String)
      layout = is64 ? UNPACK_STR64 : UNPACK_STR32
      fields = Pst.unpack(data, layout)
    end
    super(*fields)
  end
end
450
+
451
+ # pst_desc
452
+ # idx_id is a pointer to an idx record which gets the primary data stream for the Desc record.
453
+ # idx2_id gets you an idx record, that when read gives you an ID2 association list, which just maps
454
+ # another set of ids to index values
455
+ #
456
+ # @private
457
class NodePtr < Struct.new(:node_id, :block_id, :sub_block_id, :parent_node_id)
  # One leaf record of the node b-tree. block_id refers to the main data of
  # the node, sub_block_id to the block describing its sub data.

  UNPACK_STR32 = 'V4'
  UNPACK_STR64 = 'T3V'
  SIZE32 = 16
  SIZE64 = 32
  BLOCK_SIZE = 512 # descriptor blocks was 520 but bogus
  COUNT_MAX32 = 31 # max active desc records (ITEM_COUNT_OFFSET / SIZE32 = 31)
  COUNT_MAX64 = 15

  include ToTree

  # @return [Pst]
  attr_accessor :pst

  # @return [Array]
  attr_reader :children

  # @param data [String] raw record
  # @param is64 [Boolean] selects the 64 bit record layout
  def initialize data, is64
    layout = is64 ? UNPACK_STR64 : UNPACK_STR32
    super(*Pst.unpack(data, layout))
    @children = []
  end

  # Disabled accessor, always raises.
  #
  # @return [BlockPtr]
  def block
    raise "DO NOT USE"
  end

  # Disabled accessor, always raises.
  #
  # @return [BlockPtr]
  def sub_block
    raise "DO NOT USE"
  end

  # Read node data. The result is kept, so the data is loaded only once.
  #
  # @return [Array<String>]
  def read_main_array
    @read_main ||= [].tap { |chunks| pst.load_node_main_data_to node_id, chunks }
  end

  # Locate and read node sub data by its local id
  #
  # @param local_node_id [Integer]
  # @return [Array<String>]
  def read_sub_array local_node_id
    [].tap { |chunks| pst.load_node_sub_data_to node_id, local_node_id, chunks }
  end

  # @return [Array<String>]
  def get_local_node_list
    [].tap { |ids| pst.get_local_node_list_to node_id, ids }
  end

  # Check if there is a sub data exists, where it is identified by its local id.
  # Note that this loads the sub data in order to find out.
  #
  # @param local_node_id [Integer]
  # @return [Boolean]
  def has_sub local_node_id
    #TODO fixme
    !read_sub_array(local_node_id).empty?
  end

  # show all numbers in hex
  def inspect
    super.gsub(/=(\d+)/) { '=0x%x' % $1.to_i }
  end
end
535
+
536
+ # corresponds to
537
+ # * _pst_build_id_ptr
538
+ #
539
+ # @private
540
def load_block_btree
  # Collect every leaf record of the block b-tree, starting at the root
  # given by the header.
  @blocks = []
  @block_offsets = []
  load_block_tree header.block_btree, header.block_btree_count, 0

  # Blocks are normally looked up by id, so index them by id with the
  # lowest bit cleared. On duplicates the later record wins.
  lookup = @block_from_id = {}
  @blocks.each do |block|
    key = block.id & ~1
    warn "there are duplicate idx records with id #{key}" if lookup[key]
    lookup[key] = block
  end
end
553
+
554
+ # load the flat idx table, which maps ids to file ranges. this is the recursive helper
555
+ #
556
+ # corresponds to
557
+ # * _pst_build_id_ptr
558
+ #
559
+ # @private
560
def load_block_tree offset, linku1, start_val
  # Reads the b-tree page at offset. Leaf pages contribute BlockPtr records
  # to @blocks, other pages are followed recursively through their TablePtr
  # records. linku1 is the backlink value the page is expected to carry and
  # start_val the expected key of its first record (0 disables that check).
  @block_offsets << offset

  wide = is64
  #_pst_read_block_size(pf, offset, BLOCK_SIZE, &buf, 0, 0) < BLOCK_SIZE)
  page = pst_read_block_size offset, BlockPtr::BLOCK_SIZE, false

  entry_count = page[wide ? ITEM_COUNT_OFFSET_64 : ITEM_COUNT_OFFSET].ord
  depth = page[wide ? LEVEL_INDICATOR_OFFSET_64 : LEVEL_INDICATOR_OFFSET].ord
  limit = wide ? BlockPtr::COUNT_MAX64 : BlockPtr::COUNT_MAX32
  raise "have too many active items in index (#{entry_count})" if entry_count > limit

  backlink =
    if wide
      Pst.unpack(page[BACKLINK_OFFSET, 8], "T").first
    else
      page[BACKLINK_OFFSET, 4].unpack("V").first
    end
  raise 'blah 1' unless backlink == linku1

  leaf = depth == 0
  record_size =
    if leaf
      wide ? BlockPtr::SIZE64 : BlockPtr::SIZE32
    else
      wide ? TablePtr::SIZE64 : TablePtr::SIZE32
    end

  # split the page into entry_count records
  Pst.split_per(page, record_size, entry_count).each_with_index do |raw, position|
    # only the first record of a page is compared with the expected key
    check_start = position == 0 && start_val != 0
    if leaf
      ptr = BlockPtr.new raw, wide
      raise 'blah 3' if check_start && ptr.id != start_val
      ptr.pst = self
      # this shouldn't really happen i'd imagine
      raise "OHNO" if ptr.id == 0
      @blocks << ptr
    else
      child = TablePtr.new raw, wide
      raise 'blah 3' if check_start && child.start != start_val
      # this shouldn't really happen i'd imagine
      raise "OHNO" if child.start == 0
      load_block_tree child.offset, child.u1, child.start
    end
  end
end
602
+
603
+ # most access to idx objects will use this function
604
+ #
605
+ # corresponds to
606
+ # * _pst_getID
607
+ #
608
+ # @param id [Integer]
609
+ # @return [BlockPtr]
610
+ # @private
611
def block_from_id id
  # The lookup table is keyed by id with the lowest bit cleared.
  key = id & ~1
  @block_from_id[key]
end
614
+
615
+ # corresponds to
616
+ # * _pst_build_desc_ptr
617
+ # * record_descriptor
618
+ #
619
+ # @private
620
def load_node_btree
  # Collect every leaf record of the node b-tree, starting at the root
  # given by the header.
  @nodes = []
  @node_offsets = []
  load_node_tree header.node_btree, header.node_btree_count, 0x21

  # first create a lookup cache. On duplicates the later record wins.
  lookup = @node_from_id = {}
  @nodes.each do |node|
    node.pst = self
    warn "there are duplicate desc records with id #{node.node_id}" if lookup[node.node_id]
    lookup[node.node_id] = node
  end

  # now turn the flat list into a tree: every node is appended to the
  # children of its parent. Nodes whose parent is unknown, or that are their
  # own parent (usual for the root item it appears), are the toplevel ones
  # and end up in @orphans. Other circular structures are not detected.
  @orphans = []
  @nodes.each do |node|
    parent = lookup[node.parent_node_id]
    if parent && parent != node
      parent.children << node
    else
      @orphans << node
    end
  end

  # maybe change this to some sort of sane-ness check. orphans are expected
  # warn "have #{@orphans.length} orphan desc record(s)." unless @orphans.empty?
end
655
+
656
+ # @return [Boolean]
657
+ # @private
658
def is64
  # The 2003 format is the one with 64 bit wide records.
  header.version_2003?
end
661
+
662
+ # load the flat list of desc records recursively
663
+ #
664
+ # corresponds to
665
+ # * _pst_build_desc_ptr
666
+ # * record_descriptor
667
+ #
668
+ # @private
669
def load_node_tree offset, linku1, start_val
  # Reads the b-tree page at offset. Leaf pages contribute NodePtr records
  # to @nodes, other pages are followed recursively through their TablePtr
  # records. linku1 is the backlink value the page is expected to carry and
  # start_val the expected key of its first record.
  @node_offsets << offset

  wide = is64
  page = pst_read_block_size offset, NodePtr::BLOCK_SIZE, false
  entry_count = page[wide ? ITEM_COUNT_OFFSET_64 : ITEM_COUNT_OFFSET].ord
  depth = page[wide ? LEVEL_INDICATOR_OFFSET_64 : LEVEL_INDICATOR_OFFSET].ord

  # not real desc
  backlink =
    if wide
      Pst.unpack(page[BACKLINK_OFFSET, 8], "T").first
    else
      page[BACKLINK_OFFSET, 4].unpack("V").first
    end
  raise 'blah 1' unless backlink == linku1

  if depth == 0
    # leaf pointers
    record_size, limit = wide ? [NodePtr::SIZE64, NodePtr::COUNT_MAX64] : [NodePtr::SIZE32, NodePtr::COUNT_MAX32]
    raise "have too many active items in index (#{entry_count})" if entry_count > limit

    Pst.split_per(page, record_size, entry_count).each_with_index do |raw, position|
      node = NodePtr.new raw, wide
      # the first record is compared with the expected key, unless that is 0
      raise 'blah 3' if position == 0 && start_val != 0 && node.node_id != start_val
      # a zero id ends the page early. this shouldn't really happen i'd imagine
      break if node.node_id == 0
      @nodes << node
    end
  else
    # node pointers. note that the limits of the block b-tree are applied here.
    record_size, limit = wide ? [TablePtr::SIZE64, BlockPtr::COUNT_MAX64] : [TablePtr::SIZE32, BlockPtr::COUNT_MAX32]
    raise "have too many active items in index (#{entry_count})" if entry_count > limit

    Pst.split_per(page, record_size, entry_count).each_with_index do |raw, position|
      child = TablePtr.new raw, wide
      # here the check is only skipped for -1, so even the very first page
      # has to start with the expected key. thats the 0x21 (dec 33) desc
      # record, which is therefore asserted to always come first.
      raise 'blah 3' if position == 0 && start_val != -1 && child.start != start_val
      # a zero key ends the page early. this shouldn't really happen i'd imagine
      break if child.start == 0
      load_node_tree child.offset, child.u1, child.start
    end
  end
end
714
+
715
+ # as for idx
716
+ #
717
+ # corresponds to:
718
+ # * _pst_getDptr
719
+ #
720
+ # @param id [Integer]
721
+ # @return [NodePtr]
722
+ #
723
+ # @private
724
def node_from_id id
  # Plain lookup in the table built by load_node_btree; nil if unknown.
  known_nodes = @node_from_id
  known_nodes[id]
end
727
+
728
+ # corresponds to
729
+ # * pst_load_extended_attributes
730
+ #
731
+ # @private
732
def load_xattrib
  # Intentionally does nothing: loading of extended attributes is not implemented.
  nil
end
734
+
735
+ # corresponds to:
736
+ # * _pst_read_block_size
737
+ # * _pst_read_block ??
738
+ # * _pst_ff_getIDblock_dec ??
739
+ # * _pst_ff_getIDblock ??
740
+ #
741
+ # @param offset [Integer]
742
+ # @param size [Integer]
743
+ # @param decrypt [Boolean]
744
+ # @return [String]
745
+ # @private
746
def pst_read_block_size offset, size, decrypt=true
  # Reads size bytes at offset of the underlying io. The data is decrypted
  # only if the file is encrypted and decrypt is set. Short reads are
  # reported through warn and the bytes that could be read are returned.
  io.seek offset
  # IO#read(length) returns nil rather than an empty string at end of file;
  # treat that as a short read of zero bytes instead of failing on nil.
  buf = io.read(size) || String.new
  warn "tried to read #{size} bytes but only got #{buf.length}" if buf.length != size
  encrypted? && decrypt ? CompressibleEncryption.decrypt(buf) : buf
end
752
+
753
+ # @param node_id [Integer]
754
+ # @param list [Array<String>]
755
+ # @private
756
def load_node_main_data_to node_id, list
  # Appends the main data of the node with the given id to list.
  raise 'node_is must be Integer' unless node_id.is_a?(Integer)

  main_block_id = node_from_id(node_id).block_id
  load_main_block_to main_block_id, list
end
761
+
762
+ # @param node_id [Integer]
763
+ # @param local_node_id [Integer]
764
+ # @param list [Array<String>]
765
+ # @private
766
def load_node_sub_data_to node_id, local_node_id, list
  # Appends the sub data identified by local_node_id, of the node with the
  # given id, to list.
  raise 'node_is must be Integer' unless node_id.is_a?(Integer)
  raise 'local_node_id must be Integer' unless local_node_id.is_a?(Integer)

  sub_block_id = node_from_id(node_id).sub_block_id
  load_sub_block_to sub_block_id, local_node_id, list
end
772
+
773
+ # for debug
774
+ #
775
+ # @param node_id [String]
776
+ # @param list [Array<String>]
777
+ # @private
778
def get_local_node_list_to node_id, list
  # Debugging aid: appends the local node ids found below the sub block of
  # the given node to list.
  sub_block_id = node_from_id(node_id).sub_block_id
  get_local_node_list_of_sub_block_to sub_block_id, list
end
782
+
783
+ # for debug
784
+ #
785
+ # @param sub_block_id [String]
786
+ # @param list [Array<String>]
787
+ # @private
788
def get_local_node_list_of_sub_block_to sub_block_id, list
  # Debugging aid: walks the sub node structure rooted at sub_block_id and
  # appends every local node id found to list. Prints each visited block.
  return if sub_block_id == 0

  sub_block = block_from_id sub_block_id
  p ["WALK",sub_block_id,sub_block]
  raise 'must not be data' if sub_block.data?

  # SLBLOCK or SIBLOCK
  data = sub_block.read
  raise 'btype != 2' if data[0].ord != 2

  level = data[1].ord
  raise 'level unk' unless level == 0 || level == 1

  wide = is64
  # the entries follow the 4 byte header, plus 4 bytes of padding in 64 bit files
  entries_start = wide ? 8 : 4
  count = data[2, 2].unpack("v").first

  count.times do |i|
    if level == 0 # SLBLOCK
      width = wide ? 24 : 12
      raw = data[entries_start + width * i, width]
      sl_node_id, _sl_block_id, sl_sub_block_id = wide ? Pst.unpack(raw, "T3") : raw.unpack("V3")

      list << (sl_node_id & 0xffffffff)

      get_local_node_list_of_sub_block_to sl_sub_block_id, list
    else # SIBLOCK
      width = wide ? 16 : 8
      raw = data[entries_start + width * i, width]
      _si_node_id, si_block_id = wide ? Pst.unpack(raw, "T2") : raw.unpack("V2")

      # An entry of a SIBLOCK refers to a SLBLOCK, with the node id being the
      # first key of that SLBLOCK (see MS-PST, SIENTRY). Descend into it, so
      # that all of its node ids are listed and not just the first one.
      get_local_node_list_of_sub_block_to si_block_id, list
    end
  end
end
827
+
828
+ # @param sub_block_id [Integer]
829
+ # @param local_node_id [Integer]
830
+ # @param list [Array<String>]
831
+ # @private
832
def load_sub_block_to sub_block_id, local_node_id, list
  # Searches the sub node structure rooted at sub_block_id for the entry
  # whose node id equals local_node_id, and appends its data to list.
  raise 'sub_block_id must be Integer' unless Integer === sub_block_id
  return if sub_block_id == 0

  sub_block = block_from_id sub_block_id
  raise 'must not be data' if sub_block.data?

  # SLBLOCK or SIBLOCK
  data = sub_block.read
  raise 'btype != 2' if data[0].ord != 2

  level = data[1].ord
  raise 'level unk' unless level == 0 || level == 1

  wide = is64
  # the entries follow the 4 byte header, plus 4 bytes of padding in 64 bit files
  entries_start = wide ? 8 : 4
  count = data[2, 2].unpack("v").first

  count.times do |i|
    if level == 0 # SLBLOCK
      width = wide ? 24 : 12
      raw = data[entries_start + width * i, width]
      sl_node_id, sl_block_id, sl_sub_block_id = wide ? Pst.unpack(raw, "T3") : raw.unpack("V3")

      sl_node_id &= 0xffffffff

      load_main_block_to sl_block_id, list if sl_node_id == local_node_id

      # the sub nodes of this entry are searched as well
      load_sub_block_to sl_sub_block_id, local_node_id, list
    else # SIBLOCK
      width = wide ? 16 : 8
      raw = data[entries_start + width * i, width]
      _si_node_id, si_block_id = wide ? Pst.unpack(raw, "T2") : raw.unpack("V2")

      # An entry of a SIBLOCK does not refer to data, but to a SLBLOCK whose
      # first key is the node id of the entry (see MS-PST, SIENTRY). Reading
      # it as a data block either raised 'must be data' or skipped the wanted
      # entry, so search the referenced SLBLOCK instead. All of them are
      # searched, which does not rely on the entries being ordered.
      load_sub_block_to si_block_id, local_node_id, list
    end
  end
end
881
+
882
+ # @param block_id [Integer]
883
+ # @param list [Array<String>]
884
+ # @private
885
def load_main_block_to block_id, list
  # Appends the contents of the block with the given id to list. A data
  # block contributes its contents directly, any other block is expected to
  # hold a list of further block ids, which are followed in order.
  return if block_id == 0

  block = block_from_id block_id

  if block.data?
    # this is real data we want
    list << block.read.force_encoding("BINARY")
    return
  end

  # XBLOCK or XXBLOCK
  data = block.read
  raise 'btype must be 1' if data[0].ord != 1

  level = data[1].ord
  raise 'level unk' unless level == 1 || level == 2

  # the 32 bit value following the count is not needed here
  count, _num_bytes = data[2, 6].unpack("vV")
  child_ids =
    if is64
      Pst.unpack(data[8, 8 * count], "T#{count}")
    else
      data[8, 4 * count].unpack("V#{count}")
    end
  child_ids.each { |child_id| load_main_block_to child_id, list }
end
917
+
918
+ #
919
+ # id2
920
+ # ----------------------------------------------------------------------------
921
+ #
922
+
923
+ #
924
+ # main block parsing code. gets raw properties
925
+ # ----------------------------------------------------------------------------
926
+ #
927
+
928
+ # the job of this class, is to take a desc record, and be able to enumerate through the
929
+ # mapi properties of the associated thing.
930
+ #
931
+ # corresponds to
932
+ # * _pst_parse_block
933
+ # * _pst_process (in some ways. although perhaps thats more the Item::Properties#add_property)
934
+ #
935
+ # @private
936
+ class BlockParser
937
include Mapi::Types::Constants

# @private
TYPES = {
  0xbc => 1,
  0x7c => 2,
  # type 3 is removed. an artifact of not handling the indirect blocks properly in libpst.
}

# Looks up the numeric value of a property tag by its name in PropertySet::TAGS.
tag_number = lambda do |wanted|
  PropertySet::TAGS.find { |_num, (name, _type)| name == wanted }.first.hex
end

# @private
PR_SUBJECT = tag_number.call('PR_SUBJECT')
# @private
PR_BODY_HTML = tag_number.call('PR_BODY_HTML')

# this stuff could maybe be moved to Ole::Types? or leverage it somehow?
# whether or not a type is immediate is more a property of the pst encoding though i expect.
# what i probably can add is a generic concept of whether a type is of variadic length or not.

# these lists are very incomplete. think they are largely copied from libpst

# @private
IMMEDIATE_TYPES = [PT_SHORT, PT_LONG, PT_BOOLEAN]

# @private
INDIRECT_TYPES = [
  PT_DOUBLE,
  PT_OBJECT,
  0x0014, # whats this? probably something like PT_LONGLONG, given the correspondence with the
          # ole variant types. (= VT_I8)
  PT_STRING8,
  PT_UNICODE, # unicode isn't in libpst, but added here for outlook 2003 down the track
  PT_SYSTIME,
  0x0048, # another unknown
  0x0102, # this is PT_BINARY vs PT_CLSID
  # the vector types 0x1003, 0x1014, 0x101e and 0x1102 are left out for now, because i'd expect
  # that there's extra decoding needed that i'm not doing. (probably just need a simple
  # PT_* => unpack string mapping for the immediate types, and just do unpack('V*') etc
]

# the attachment and recipient arrays appear to be always stored with these fixed
# id2 values. seems strange. are there other extra streams? can find out by making higher
# level IO wrapper, which has the id2 value, and doing the diff of available id2 values versus
# used id2 values in properties of an item.

# @private
ID2_ATTACHMENTS = 0x671

# @private
ID2_RECIPIENTS = 0x692

# Targeting main data, not sub
USE_MAIN_DATA = -1
991
+
992
+ # @return [NodePtr]
993
+ # @private
994
+ attr_reader :node
995
+
996
+ # @return [Hash<Integer, String>] HID to data block
997
+ # @private
998
+ attr_reader :data_chunks
999
+
1000
+ # @param node [NodePtr]
1001
+ # @param local_node_id [Integer]
1002
def initialize(node, local_node_id = USE_MAIN_DATA)
  # NOTE: a check that the node has an associated index record used to live here
  # (raise FormatError unless node.block); it is intentionally disabled.
  @node = node
  @data_chunks = {}

  # Read either the node's main data, or the sub node selected by local_node_id.
  pages =
    if local_node_id == USE_MAIN_DATA
      node.read_main_array
    else
      node.read_sub_array(local_node_id)
    end

  # The first page starts with the root header, every later page with a page header.
  # see https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-pst/a3fa280c-eba3-434f-86e4-b95141b3c7b1
  pages.each_with_index do |page, page_index|
    if page_index.zero?
      load_root_header(page)
    else
      load_page_header(page, page_index)
    end
  end

  # at this point @data_chunks may hold chunks coming from multiple different blocks
end
1020
+
1021
+ # Parse HNPAGEHDR / HNBITMAPHDR
1022
+ #
1023
+ # @see https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-pst/9c34ecf8-36bc-45a1-a2df-ee35c6dc840a
1024
+ #
1025
+ # @param data [String]
1026
+ # @param page_index [Integer]
1027
+ # @private
1028
def load_page_header(data, page_index)
  # the first 16 bit word of the page is the offset of the page map
  map_offset = data.unpack('v').first

  # read HNPAGEMAP: an allocation count, followed (4 bytes in) by count + 1 offsets
  entry_count = data[map_offset, 2].unpack('v').first + 1
  boundaries = data[map_offset + 4, 2 * entry_count].unpack("v#{entry_count}")

  # every pair of neighbouring offsets delimits one chunk; key it by its HID
  hid_page_part = 65536 * page_index
  spans = boundaries.each_cons(2).to_a
  spans.each_with_index do |(start, stop), slot|
    @data_chunks[0x20 * (1 + slot) + hid_page_part] = data[start, stop - start]
  end
end
1040
+
1041
+ # Parse HNHDR
1042
+ #
1043
+ # @see https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-pst/8e4ae05c-3c24-4103-b7e5-ffef6f244834
1044
+ # @private
1045
def load_root_header(data)
  # header fields: page map offset, signature byte, block type signature, first offset
  # (a trailing 32 bit field is unpacked but not used)
  map_offset, sig, @heap_type, @offset1 = data.unpack('vCCVV')
  raise FormatError, 'invalid signature 0x%02x' % sig unless sig == 0xec

  known_type = TYPES[@heap_type]
  raise FormatError, 'unknown block type signature 0x%02x' % @heap_type unless known_type
  @type = known_type

  # read HNPAGEMAP: an allocation count, followed (4 bytes in) by count + 1 offsets
  entry_count = data[map_offset, 2].unpack('v').first + 1
  boundaries = data[map_offset + 4, 2 * entry_count].unpack("v#{entry_count}")

  # every pair of neighbouring offsets delimits one chunk; key it by its HID
  spans = boundaries.each_cons(2).to_a
  spans.each_with_index do |(start, stop), slot|
    @data_chunks[0x20 * (1 + slot)] = data[start, stop - start]
  end
end
1060
+
1061
+ # based on the value of offset, return either some data from buf, or some data from the
1062
+ # id2 chain id2, where offset is some key into a lookup table that is stored as the id2
1063
+ # chain. i think i may need to create a BlockParser class that wraps up all this mess.
1064
+ #
1065
+ # corresponds to:
1066
+ # * _pst_getBlockOffsetPointer
1067
+ # * _pst_getBlockOffset
1068
+ #
1069
+ # @param offset [Integer]
1070
+ # @return [String]
1071
+ # @private
1072
def get_data_indirect(offset)
  raise "offset must be Integer" unless offset.is_a?(Integer)

  # resolve the pointer to a stream and hand back its whole content
  stream = get_data_indirect_io(offset)
  stream.read
end
1077
+
1078
+ # Resolve data pointed by HNID
1079
+ #
1080
+ # @see https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-pst/7ac490ce-31af-4a75-97df-eb9d07a003fd
1081
+ # @param offset [Integer]
1082
+ # @return [StringIO]
1083
+ # @private
1084
def get_data_indirect_io(offset)
  raise "offset must be Integer" unless offset.is_a?(Integer)

  # zero points at nothing
  return nil if offset == 0

  # low five bits clear: this is HID (heap)
  return StringIO.new(data_chunks[offset]) if (offset & 0x1f) == 0

  # otherwise this is NID (node)
  pages = node.read_sub_array(offset)
  raise "local node id #{offset} points multi page count #{pages.count}, use get_data_array() instead" if pages.count >= 2

  StringIO.new(pages.empty? ? "" : pages.first)
end
1103
+
1104
+ # @param offset [Integer]
1105
+ # @return [Array<String>]
1106
+ # @private
1107
def get_data_array(offset)
  raise "offset must be Integer" unless offset.is_a?(Integer)

  # zero points at nothing
  return nil if offset.zero?

  if (offset & 0x1f).zero?
    # this is HID (heap): a single chunk, wrapped so both cases return an array
    [data_chunks[offset]]
  else
    # this is NID (node): all pages of the sub node
    node.read_sub_array(offset)
  end
end
1120
+
1121
# Turn a raw (key, type, value) tuple into its decoded form.
#
# Depending on +type+ the raw value is used as is, converted to a boolean, or treated
# as a pointer that is resolved through the heap / sub nodes.
#
# @param key [Integer] property tag
# @param type [Integer] property type
# @param value [Integer, String] the stored value; a String when the caller already holds
#   the raw bytes (cells wider than 4 bytes in a table row)
# @return [Array(Integer, Integer, Object)] key, type and the decoded value
# @raise [NotImplementedError] for property types that are not handled
def handle_indirect_values key, type, value
  case type
  when PT_BOOLEAN
    value = value != 0
  when *IMMEDIATE_TYPES # not including PT_BOOLEAN which we just did above
    # no processing currently applied (needed?).
  when *INDIRECT_TYPES
    # the value is a pointer
    if String === value # ie, value size > 4 in the table row
      value = StringIO.new value
    else
      value = get_data_array(value)
      value = StringIO.new value.join("") if value
    end

    # keep strings as immediate values for now, for compatibility with how
    # Msg::Properties::ENCODINGS is set up
    if value
      if type == PT_STRING8
        value = node.pst.helper.convert_ansi_str value.read
      elsif type == PT_UNICODE
        value = Ole::Types::FROM_UTF16.iconv value.read
      end
    end

    if key == PR_BODY_HTML && value
      # to keep the msg code happy, which thinks body_html will be an io
      # although, in 2003 version, they are 0102 already
      value = StringIO.new value unless value.respond_to?(:read)
    end

    # special subject handling
    if key == PR_SUBJECT && String === value && value.length >= 2
      # Subjects have been seen with a two character "\001\0NN" prefix, eg
      #   "\001\005RE: blah blah"
      #   "\001\001blah blah"
      #   "\001\032Out of Office AutoReply: blah blah"
      #   "\001\020Undeliverable: blah blah"
      # It appears to tell how to omit a subject prefix like `Re: ` (thread topic being the
      # subject without it). The prefix text itself is kept; only the 2 char header is dropped.
      # Possibly the remainder should be yielded as PR_CONVERSATION_TOPIC as well.
      value = value[2..-1] if value[0].ord == 1
    end

    # special handling for embedded objects
    # used for attach_data for attached messages. in which case attach_method should == 5,
    # for embedded object.
    if type == PT_OBJECT && value
      value = value.read if value.respond_to?(:read)
      id2, = value.unpack 'V2'
      # Building a nested Item from this is disabled until properly fixed (the attachment
      # ended up attached to itself ad infinitum), so nested messages are handed over as
      # the raw io only. For an embedded ole object (eg "Picture (Metafile)") that io holds
      # a regular serialized ole storage (docfile signature d0cf11e0...).
      value = get_data_indirect_io id2
    end
  # 0x101e is PT_MV_STRING8, i guess.
  # should probably have the 0x1000 flag, and do the or-ring.
  # example of 0x1102 is PR_OUTLOOK_2003_ENTRYIDS. less sure about that one.
  when 0x101e, 0x1102
    # example data:
    # 0x802b "\003\000\000\000\020\000\000\000\030\000\000\000#\000\000\000BusinessCompetitionFavorites"
    # this 0x802b would be an extended attribute for categories / keywords.
    value = split_multi_value_data(value)
    value.map! { |str| StringIO.new str } if type == 0x1102
  when 0x101f
    value = split_multi_value_data(value)
    value.map! { |str| Ole::Types::FROM_UTF16.iconv str }
  when 0x1003 # uint32 array
    value = get_data_indirect_io(value).read unless String === value
    # there is no count field
    value = value.unpack("V#{value.length / 4}")
  else
    name = begin
      Mapi::Types::DATA[type].first
    rescue StandardError
      nil
    end
    # Best-effort dump of the raw data for diagnosis. Resolving the pointer can itself
    # fail (String values, null pointers, missing chunks); that must never hide the
    # NotImplementedError raised below.
    raw = begin
      String === value ? value : get_data_indirect_io(value)&.read
    rescue StandardError
      nil
    end
    warn '0x%04x %p' % [key, raw]
    raise NotImplementedError, 'unsupported mapi property type - 0x%04x (%p)' % [type, name]
  end
  [key, type, value]
end

# Split the serialized form of a multi-valued variable-length property into its elements.
#
# The data starts with a 32 bit element count, followed by that many 32 bit start offsets;
# each element runs up to the next offset (the last one up to the end of the data).
#
# @param value [Integer, String] pointer to the data, or the data itself
# @return [Array<String>]
# @private
def split_multi_value_data value
  value = get_data_indirect_io(value).read unless String === value
  num = value.unpack('V')[0]
  offsets = value[4, 4 * num].unpack("V#{num}")
  (offsets + [value.length]).each_cons(2).map { |from, to| value[from...to] }
end
1262
+ end
1263
+
1264
+ =begin
1265
+ * recipients:
1266
+
1267
+ affects: ["0x200764", "0x2011c4", "0x201b24", "0x201b44", "0x201ba4", "0x201c24", "0x201cc4", "0x202504"]
1268
+
1269
+ after adding the rawpropertystoretable fix, all except the second parse properly, and satisfy:
1270
+
1271
+ item.props.display_to == item.recipients.map { |r| r.props.display_name if r.props.recipient_type == 1 }.compact * '; '
1272
+
1273
+ only the second still has a problem
1274
+
1275
+ #[#<struct Pst::Desc desc_id=0x2011c4, idx_id=0x397c, idx2_id=0x398a, parent_desc_id=0x8082>]
1276
+
1277
+ think this is related to a multi block #data3. ie, when you use @x * rec_size, and it
1278
+ goes > 8190, or there abouts, then it stuffs up. probably there is header gunk, or something,
1279
+ similar to when #data is multi block.
1280
+
1281
+ same problem affects the attachment table in test4.
1282
+
1283
+ fixed that issue. round data3 ranges to rec_size.
1284
+
1285
+ fix other issue with attached objects.
1286
+
1287
+ all recipients and attachments in test2 are fine.
1288
+
1289
+ only remaining issue is test4 recipients of 200044. strange.
1290
+
1291
+ =end
1292
+
1293
+ # RawPropertyStore is used to iterate through the properties of an item, or the auxiliary
1294
+ # data for an attachment. its just a parser for the way the properties are serialized, when the
1295
+ # properties don't have to conform to a column structure.
1296
+ #
1297
+ # structure of this chunk of data is often
1298
+ # header, property keys, data values, and then indexes.
1299
+ # the property keys has value in it. value can be the actual value if its a short type,
1300
+ # otherwise you look up the value in the indices, where you get the offsets to use in the
1301
+ # main data body. due to the indirect thing though, any of these parts could actually come
1302
+ # from a separate stream.
1303
+ #
1304
+ # @private
1305
class RawPropertyStore < BlockParser
  include Enumerable

  # @return [Integer] number of property tuples
  attr_reader :length

  # Will read Property Context (PC)
  #
  # @see https://docs.microsoft.com/en-us/openspecs/office_file_formats/ms-pst/294c83c6-ff92-42f5-b6b6-876c29fa9737
  # @param node [NodePtr]
  # @param local_node_id [Integer]
  # @raise [FormatError] if the block is not a Property Context, or its header is
  #   too short or carries an unexpected signature
  def initialize node, local_node_id = USE_MAIN_DATA
    super
    bTypePC = 0xbc
    raise FormatError, "expected type 188 - got #{@heap_type}" unless @heap_type == bTypePC

    # the way that offset works, the header may be a subset of the main data, or something
    # from id2. so it could be some random chunk of data anywhere in the thing.
    header_data = get_data_indirect @offset1
    if header_data.length < 8
      raise FormatError, "Property Context header too short (#{header_data.length} bytes)"
    end
    signature, offset2 = header_data.unpack 'V2'
    # report the value that actually failed the check
    raise FormatError, 'invalid Property Context signature 0x%08x' % signature if signature != 0x000602b5

    # this is actually a big chunk of tag tuples, 8 bytes each.
    @index_data = get_data_indirect offset2
    @length = @index_data.length / 8
  end

  # iterate through the property tuples
  #
  # @yield [key, type, value]
  # @yieldparam key [Integer]
  # @yieldparam type [Integer]
  # @yieldparam value [Object]
  # @return [Enumerator] when called without a block
  def each
    return to_enum(:each) unless block_given?

    length.times do |i|
      # each tuple is: 16 bit key, 16 bit type, 32 bit value (immediate or pointer)
      key, type, value = handle_indirect_values(*@index_data[8 * i, 8].unpack('vvV'))
      yield key, type, value
    end
  end
end
1346
+
1347
+ # RawPropertyStoreTable is kind of like a database table.
1348
+ # it has a fixed set of columns.
1349
+ # #[] is kind of like getting a row from the table.
1350
+ # those rows are currently encapsulated by Row, which has #each like
1351
+ # RawPropertyStore.
1352
+ # only used for the recipients array, and the attachments array. completely lazy, doesn't
1353
+ # load any of the properties upon creation.
1354
+ #
1355
+ # @private
1356
class RawPropertyStoreTable < BlockParser
  # TCOLDESC
  # @private
  class Column < Struct.new(:ref_type, :type, :ind2_off, :size, :slot)
    # @param data [String] one 8 byte column descriptor
    def initialize data
      super(*data.unpack('v3CC'))
    end

    # @return [String] lower-cased type name, or the type in hex if it is unknown
    def nice_type_name
      Mapi::Types::DATA[ref_type].first[/_(.*)/, 1].downcase rescue '0x%04x' % ref_type
    end

    # @return [String] lower-cased property name, or the tag in hex if it is unknown
    def nice_prop_name
      Mapi::PropertyStore::TAGS['%04x' % type].first[/_(.*)/, 1].downcase rescue '0x%04x' % type
    end

    def inspect
      "#<#{self.class} name=#{nice_prop_name.inspect}, type=#{nice_type_name.inspect}>"
    end
  end

  include Enumerable

  # @return [Integer] record count
  attr_reader :length
  # @return [String] Array of TCOLDESC
  attr_reader :index_data
  # @return [String] 2.3.2 BTree-on-Heap (BTH)
  attr_reader :data2
  # @return [Array<String>] 2.3.4.4 Row Matrix
  attr_reader :rows_pages
  # @return [Integer] TCI_bm
  attr_reader :rec_size
  # @return [Integer]
  attr_reader :rows_per_page

  # @param node [NodePtr]
  # @param local_node_id [Integer]
  # @raise [FormatError] if the block is not a table, or its headers are inconsistent
  def initialize node, local_node_id
    super
    bTypeTC = 0x7c
    raise FormatError, "expected type 124 - got #{@heap_type}" unless @heap_type == bTypeTC

    header_data = get_data_indirect @offset1
    # TCINFO (seven_c_blk)
    # often: u1 == u2 and u3 == u2 + 2, then rec_size == u3 + 4.
    seven_c, @num_list, u1, u2, u3, @rec_size, b_five_offset,
      rows_offset, u7, u8 = header_data[0, 22].unpack('CCv4V2v2')
    # the column descriptors follow the fixed 22 byte header
    @index_data = header_data[22..-1]

    if @num_list != schema.length || seven_c != 0x7c
      raise FormatError, 'invalid table header (signature 0x%02x, %d columns declared, %d found)' %
        [seven_c, @num_list, schema.length]
    end
    # another check
    min_size = schema.inject(0) { |total, col| total + col.size }
    # seem to have at max, 8 padding bytes on the end of the record. not sure if it means
    # anything. maybe its just space that hasn't been reclaimed due to columns being
    # removed or something. probably should just check lower bound.
    range = (min_size..min_size + 8)
    warn "rec_size seems wrong (#{range} !=== #{rec_size})" unless range === rec_size

    header_data2 = get_data_indirect b_five_offset
    if header_data2.length < 8
      raise FormatError, "table index header too short (#{header_data2.length} bytes)"
    end
    signature, offset2 = header_data2.unpack 'V2'
    # ??? seems a bit iffy
    # there's probably more to the differences than this, and the data2 difference below
    expect = node.pst.header.version_2003? ? 0x000404b5 : 0x000204b5
    raise FormatError, 'unhandled block signature 0x%08x' % signature if signature != expect

    # this holds all the row data, possibly spread over multiple pages
    if rows_offset != 0
      @rows_pages = get_data_array(rows_offset)
    else
      # table rows are empty, no data to be read
      @rows_pages = [""]
    end

    # there must be something to the data in data2; it is probably the array of objects
    # essentially. its records turned out to hold values identical to PR_ATTACHMENT_ID2:
    #   id2_values = data2.unpack('v*').to_enum(:each_slice, 3).transpose[0]
    #   table[i].assoc(PR_ATTACHMENT_ID2).last == id2_values[i], for all i.
    # currently it is only kept for reference; resolving it is allowed to fail.
    @data2 = get_data_indirect(offset2) rescue nil

    # the record count used to be derived from data2 (length / 6), which may have been ok
    # for 97 files, but the newer 0x0004 style block must have different size records.
    # the row pages tell the same thing:
    @rows_per_page = @rows_pages.first.length / @rec_size
    @length = @rows_pages.map { |data| data.length / @rec_size }.sum
  end

  # for debug
  #
  # @return [Array<Column>]
  # @private
  def schema
    @schema ||= Pst.split_per(index_data, 8, -1).map { |data| Column.new data }
  end

  # return grid row
  #
  # @param idx [Integer]
  # @return [Row]
  def [] idx
    Row.new self, idx
  end

  # @yield [row]
  # @yieldparam row [Row]
  # @return [Enumerator] when called without a block
  def each
    return to_enum(:each) unless block_given?

    length.times { |i| yield self[i] }
  end

  # get record data
  #
  # @param record_index [Integer]
  # @return [String]
  # @private
  def get_record record_index
    # records never straddle pages, so locate the page first, then the slot within it
    page_index = record_index / @rows_per_page
    heap_index = record_index % @rows_per_page
    (@rows_pages[page_index])[@rec_size * heap_index, @rec_size]
  end

  class Row
    include Enumerable

    # @param array_parser [RawPropertyStoreTable]
    # @param index [Integer]
    def initialize array_parser, index
      @array_parser = array_parser
      @index = index
      @data = @array_parser.get_record(index)
    end

    # iterate through the property tuples
    #
    # @yield [key, type, value]
    # @yieldparam key [Integer]
    # @yieldparam type [Integer]
    # @yieldparam value [Object]
    # @return [Enumerator] when called without a block
    def each
      return to_enum(:each) unless block_given?

      (@array_parser.index_data.length / 8).times do |i|
        ref_type, type, ind2_off, size, slot = @array_parser.index_data[8 * i, 8].unpack 'v3CC'
        value = @data[ind2_off, size]
        if size <= 4
          # Cells may be narrower than 4 bytes. unpack('V') on fewer than 4 bytes yields
          # nil (which, among other things, made every 1 byte boolean come out as true),
          # so zero-extend the little-endian bytes to 32 bits first.
          value = value.ljust(4, "\0").unpack('V')[0]
        end
        # wider cells stay a String, which handle_indirect_values takes as the raw bytes
        key, type, value = @array_parser.handle_indirect_values type, ref_type, value
        yield key, type, value
      end
    end
  end
end
1536
+
1537
+ # @private
1538
class AttachmentTable < BlockParser
  # a "fake" MAPI property name for this constant. if you get a mapi property with
  # this value, it is the id2 value to use to get attachment data.
  #
  # @private
  PR_ATTACHMENT_ID2 = 0x67f2

  # @return [NodePtr]
  # @private
  attr_reader :node
  # @return [RawPropertyStoreTable]
  # @private
  attr_reader :table

  # Deliberately does not call super: nothing of the node is parsed here, the table
  # stored under ID2_ATTACHMENTS is wrapped instead (an empty list when there is none).
  #
  # @param node [NodePtr]
  def initialize(node)
    @node = node
    @table =
      if @node.has_sub(ID2_ATTACHMENTS)
        RawPropertyStoreTable.new(@node, ID2_ATTACHMENTS)
      else
        []
      end
  end

  # @return [Array<Array<Array(Integer, Integer, Object)>>]
  def to_a
    return [] unless table

    table.map do |row|
      props = row.to_a
      # potentially merge with yet more properties, stored under the attachment's own id2.
      # this still seems pretty broken - especially the property overlap
      id2_entry = props.assoc(PR_ATTACHMENT_ID2)
      if id2_entry
        sub_id = id2_entry.last
        # verify existence of this record
        if @node.has_sub(sub_id)
          RawPropertyStore.new(@node, sub_id).each do |key, type, value|
            # overwrite the tuple with the same key in place, or append a new one
            slot = props.assoc(key)
            unless slot
              slot = []
              props << slot
            end
            slot.replace([key, type, value])
          end
        else
          warn "attachment record is missing"
        end
      end
      props
    end
  end
end
1591
+
1592
+ # there is no equivalent to this in libpst. ID2_RECIPIENTS was just guessed given the above
1593
+ # AttachmentTable.
1594
+ #
1595
+ # @private
1596
class RecipientTable < BlockParser
  # @return [NodePtr]
  # @private
  attr_reader :node
  # @return [RawPropertyStoreTable]
  # @private
  attr_reader :table

  # Deliberately does not call super: nothing of the node is parsed here, the table
  # stored under ID2_RECIPIENTS is wrapped instead (an empty list when there is none).
  #
  # @param node [NodePtr]
  def initialize(node)
    @node = node
    @table =
      if @node.has_sub(ID2_RECIPIENTS)
        RawPropertyStoreTable.new(@node, ID2_RECIPIENTS)
      else
        []
      end
  end

  # @return [Array<Array<Array(Integer, Integer, Object)>>]
  def to_a
    return [] unless table

    table.map(&:to_a)
  end
end
1627
+
1628
+ #
1629
+ # higher level item code. wraps up the raw properties above, and gives nice
1630
+ # objects to work with. handles item relationships too.
1631
+ # ----------------------------------------------------------------------------
1632
+ #
1633
+
1634
+ # @param property_list [Array<Array(Integer, Integer, Object)>]
1635
+ # @return [PropertySet]
1636
+ # @private
1637
def self.make_property_set(property_list)
  # key => value; the type of each tuple is not needed here, and a later tuple
  # with the same key replaces the value of an earlier one
  values_by_key = {}
  property_list.each do |tag, _type, data|
    values_by_key[PropertySet::Key.new(tag)] = data
  end
  PropertySet.new(values_by_key)
end
1643
+
1644
class Attachment < Mapi::Attachment
  # @param list [Array<Array(Integer, Integer, Object)>] property tuples of the attachment
  def initialize(list)
    super(Pst.make_property_set(list))

    # attach_data that already is a parsed item is exposed as the embedded message
    @embedded_msg = props.attach_data if props.attach_data.is_a?(Item)
  end
end
1651
+
1652
class Recipient < Mapi::Recipient
  # @param list [Array<Array(Integer, Integer, Object)>] property tuples of the recipient
  def initialize(list)
    property_set = Pst.make_property_set(list)
    super(property_set)
  end
end
1657
+
1658
class Item < Mapi::Message
  # @private
  class EntryID < Struct.new(:u1, :entry_id, :id)
    UNPACK_STR = 'VA16V'

    # @param data [String, Array] packed entry id, or its already unpacked fields
    def initialize(data)
      fields = String === data ? data.unpack(UNPACK_STR) : data
      super(*fields)
    end
  end

  include RecursivelyEnumerable

  # Obtain item type
  #
  # - `:folder`
  # - `:message`
  # - `:wastebasket`
  #
  # @return [Symbol]
  attr_accessor :type

  # @return [Item]
  attr_accessor :parent

  # @param node [NodePtr]
  # @param list [Array]
  # @param type [Object, nil]
  def initialize(node, list, type = nil)
    @node = node
    super(Pst.make_property_set(list))

    # this is kind of weird, but the ids of the special folders are stored in a hash
    # when the root item is loaded
    wastebasket_id = ipm_wastebasket_entryid
    node.pst.special_folder_ids[wastebasket_id] = :wastebasket if wastebasket_id

    finder_id = finder_entryid
    node.pst.special_folder_ids[finder_id] = :finder if finder_id

    # and then here, those are used, along with a crappy heuristic to determine if we are an
    # item.
    # (the low bits of the desc_id may give some info on the type too: it seems that 0x4 is
    # for regular messages (and maybe contacts etc), 0x2 is for folders, and 0x8 is for
    # special things like rules etc, that aren't visible.)
    unless type
      looks_like_folder =
        props.valid_folder_mask || ipm_subtree_entryid || props.content_count || props.subfolders
      type =
        if looks_like_folder
          node.pst.special_folder_ids[node.node_id] || :folder
        else
          :message
        end
    end

    @type = type
  end

  # @yield [item]
  # @yieldparam item [Item]
  # @return [void]
  def each_child
    subtree_id = ipm_subtree_entryid
    child_nodes =
      if subtree_id
        subtree = @node.pst.node_from_id(subtree_id)
        raise "couldn't find root" unless subtree
        raise 'both kinds of children' unless @node.children.empty?

        listed = subtree.children
        # lets look up the other ids we have.
        # typically the wastebasket one "deleted items" is in the children already, but
        # the search folder isn't.
        special = [ipm_wastebasket_entryid, finder_entryid].compact.map { |special_id|
          found = @node.pst.node_from_id(special_id)
          warn "couldn't find root for id #{special_id}" unless found
          found
        }.compact
        # appended instead of doing a union, so as not to mess with the order of the
        # existing children.
        listed + (special - listed)
      else
        @node.children
      end

    child_nodes.each do |child_node|
      item = @node.pst.pst_parse_item(child_node)
      item.parent = self
      yield item
    end
  end

  # @return [String]
  def path
    ancestors = []
    current = parent
    while current
      ancestors.unshift(current)
      current = current.parent
    end
    # remove root
    ancestors.shift
    ancestors.map { |ancestor| ancestor.props.display_name || raise('unable to construct path') }.join('/')
  end

  # Enumerate direct children
  #
  # @return [Array<Item>]
  def children
    enum_for(:each_child).to_a
  end

  # these are still around because they do different stuff
  # (plain property lookups such as valid_folder_mask, content_count and subfolders
  # are reached through props instead)

  # Top of Personal Folder Record
  #
  # @private
  def ipm_subtree_entryid
    @ipm_subtree_entryid ||= begin
      EntryID.new(props.ipm_subtree_entryid.read).id
    rescue StandardError
      nil
    end
  end

  # Deleted Items Folder Record
  #
  # @private
  def ipm_wastebasket_entryid
    @ipm_wastebasket_entryid ||= begin
      EntryID.new(props.ipm_wastebasket_entryid.read).id
    rescue StandardError
      nil
    end
  end

  # Search Root Record
  #
  # @private
  def finder_entryid
    @finder_entryid ||= begin
      EntryID.new(props.finder_entryid.read).id
    rescue StandardError
      nil
    end
  end

  # the two below could be changed to inherit the laziness of RawPropertyStoreTable,
  # so that eg the last attachment can be had without creating the others. it would just
  # have to handle the no table at all case a bit more gracefully.

  # @return [Array<Attachment>]
  def attachments
    @attachments ||= AttachmentTable.new(@node).to_a.map { |tuples| Attachment.new(tuples) }
  end

  # @return [Array<Recipient>]
  def recipients
    @recipients ||= RecipientTable.new(@node).to_a.map { |tuples| Recipient.new(tuples) }
  end

  # Iterate children (except on this instance) recursively stored in this MessageStore.
  #
  # @yield [item]
  # @yieldparam item [Item]
  # @return [void]
  def each_recursive(&block)
    children.each do |child|
      block.call(child)
      child.each_recursive(&block)
    end
  end

  def inspect
    shown = %w[display_name subject sender_name subfolders].map { |attr|
      current = props.send(attr)
      " #{attr}=#{current.inspect}" if current
    }.compact.join(',')

    kind =
      case type
      when :message then 'Message'
      when :folder then 'Folder'
      else type.to_s.capitalize + 'Folder'
      end
    node_part = 'node_id=0x%x' % @node.node_id

    # fall back to dumping all properties when none of the usual ones is set
    if shown.empty?
      "#<Pst::#{kind} #{node_part} props=#{props.inspect}>"
    else
      "#<Pst::#{kind} #{node_part}#{shown}>"
    end
  end
end
1846
+
1847
+ # corresponds to
1848
+ # * _pst_parse_item
1849
+ #
1850
+ # @param desc [NodePtr]
1851
+ # @return [Item]
1852
+ # @private
1853
def pst_parse_item(node)
  # read all raw property tuples of the node, then wrap them up as an item
  Item.new(node, RawPropertyStore.new(node).to_a)
end
1856
+
1857
+ #
1858
+ # other random code
1859
+ # ----------------------------------------------------------------------------
1860
+ #
1861
+
1862
+ # @private
1863
def dump_debug_info
  # Debugging aid: prints the header, a file range usage report, the idx
  # entries, the node tree and the item tree to standard output.
  puts "* pst header"
  p header

  # Looking at the output of this, for blank-o1997.pst, i see this part:
  #   ...
  #   - (26624,516) desc block data (overlap of 4 bytes)
  #   - (27136,516) desc block data (gap of 508 bytes)
  #   - (28160,516) desc block data (gap of 2620 bytes)
  #   ...
  #
  # which confirms my belief that the block size for idx and desc is more likely 512
  puts '* file range usage'
  # these 3 things, should account for most of the data in the file.
  file_ranges =
    [[0, Header::SIZE, 'pst file header']] +
    @block_offsets.map { |offset| [offset, BlockPtr::BLOCK_SIZE, 'block data'] } +
    @node_offsets.map { |offset| [offset, NodePtr::BLOCK_SIZE, 'node data'] } +
    @blocks.map { |idx| [idx.offset, idx.size, 'idx id=0x%x (%s)' % [idx.id, idx.type]] }

  # i think there is a padding of the size out to 64 bytes
  # which is equivalent to padding out the final offset, because i think the offset is
  # similarly oriented
  pad_amount = 64

  # the trailing nil lets each_cons pair the last record with "no successor".
  (file_ranges.sort_by(&:first) + [nil]).each_cons(2) do |(offset, size, name), next_record|
    warn 'i am wrong about the offset padding' if offset % pad_amount != 0
    # so, assuming i'm not wrong about that, then we can calculate how much padding is needed
    # (zero when the size is already a multiple of pad_amount).
    pad = (pad_amount - size % pad_amount) % pad_amount
    gap = next_record ? next_record.first - (offset + size + pad) : 0

    extra = []
    if gap < 0
      extra << "overlap of #{gap.abs} bytes"
    elsif gap > 0
      extra << "gap of #{gap} bytes"
    end

    # how about we check that padding
    @io.pos = offset + size
    pad_bytes = @io.read(pad)
    extra << "padding not all zero" unless pad_bytes == "\0" * pad

    suffix = extra.empty? ? '' : " [#{extra.join(', ')}]"
    puts "- #{offset}:#{size}+#{pad} #{name.inspect}#{suffix}"
  end

  # i think the idea of the idx, and indeed the idx2, is just to be able to
  # refer to data indirectly, which means it can get moved around, and you just update
  # the idx table. it is simply a list of file offsets and sizes.
  # not sure i get how id2 plays into it though....
  # the sizes seem to be all even. is that a coincidence? and the ids are all even. that
  # seems to be related to something else (see the (id & 2) == 1 stuff)
  puts '* idx entries'
  @blocks.each { |idx| puts "- #{idx.inspect}" }

  # if you look at the desc tree, you notice a few things:
  # 1. there is a desc that seems to be the parent of all the folders, messages etc.
  #    it is the one whose parent is itself.
  #    one of its children is referenced as the subtree_entryid of the first desc item,
  #    the root.
  # 2. typically only 2 types of desc records have idx2_id != 0. messages themselves,
  #    and the desc with id = 0x61 - the xattrib container. everything else uses the
  #    regular ids to find its data. i think it should be reframed as small blocks and
  #    big blocks, but i'll look into it more.
  #
  # idx_id and idx2_id are for getting to the data. desc_id and parent_desc_id just define
  # the parent <-> child relationship, and the desc_ids are how the items are referred to in
  # entryids.
  # note that these aren't unique! eg for 0, 4 etc. i expect these'd never change, as the ids
  # are stored in entryids. whereas the idx and idx2 could be a bit more volatile.
  puts '* node tree'
  # make a dummy root hold everything just for convenience
  root = NodePtr.new ''
  def root.inspect; "#<Pst::Root>"; end
  root.children.replace @orphans
  # this still loads the whole thing as a string for gsub. should use the direct output io
  # version.
  puts root.to_tree.gsub(/, (parent_node_id|idx2_id)=0x0(?!\d)/, '')

  # this is fairly easy to understand, its just an attempt to display the pst items in a tree form
  # which resembles what you'd see in outlook.
  puts '* item tree'
  # now streams directly
  root_item.to_tree STDOUT
end
1947
+
1948
+ # @return [NodePtr]
1949
+ # @private
1950
def root_desc
  # The first entry of @nodes is used as the root node.
  nodes = @nodes
  nodes.first
end
1953
+
1954
+ # @return [Item]
1955
+ # @private
1956
def root_item
  # Parse the root node as an ordinary item, then tag it as the root.
  # Nothing is cached here: every call parses again.
  pst_parse_item(root_desc).tap { |parsed| parsed.type = :root }
end
1961
+
1962
+ # Obtain a root item
1963
+ #
1964
+ # @return [Item]
1965
def root
  # Public entry point; simply forwards to root_item.
  root_item
end
1968
+
1969
+ # depth first search of all items
1970
+ include Enumerable
1971
+
1972
+ # Iterate all kinds of items recursively stored in this MessageStore.
1973
+ #
1974
+ # @yield [message]
1975
+ # @yieldparam message [Item]
1976
+ # @return [void]
1977
def each(&block)
  # Enumerable convention: with no block, return an Enumerator rather than
  # failing when the (nil) block is called.
  return to_enum(:each) unless block

  # The root item is yielded first, followed by everything beneath it.
  top = root
  block.call(top)
  top.each_recursive(&block)
end
1982
+
1983
+ # Get this MessageStore's display name.
1984
+ #
1985
+ # @return [String]
1986
def name
  # Cached after the first successful lookup of the root item's display name.
  return @name if @name

  @name = root_item.props.display_name
end
1989
+
1990
def inspect
  # Short form: only the store's name and its io object are shown.
  format('#<Pst name=%s io=%s>', name.inspect, io.inspect)
end
1993
+ end
1994
+ end
1995
+