ruby-msg 1.2.17

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,927 @@
1
+ #! /usr/bin/ruby -w
2
+
3
+ $: << File.dirname(__FILE__) + '/..'
4
+
5
+ require 'support'
6
+
7
+ require 'stringio'
8
+ require 'tempfile'
9
+
10
+ require 'ole/base'
11
+ require 'ole/types'
12
+ # not strictly ole related
13
+ require 'ole/io_helpers'
14
+
15
+ module Ole # :nodoc:
16
+ #
17
+ # = Introduction
18
+ #
19
+ # <tt>Ole::Storage</tt> is a class intended to abstract away details of the
20
+ # access to OLE2 structured storage files, such as those produced by
21
+ # Microsoft Office, eg *.doc, *.msg etc.
22
+ #
23
+ # Initially based on chicago's libole, source available at
24
+ # http://prdownloads.sf.net/chicago/ole.tgz
25
+ # Later augmented with some corrections by inspecting pole, and (purely
26
+ # for header definitions) gsf.
27
+ #
28
+ # = Usage
29
+ #
30
+ # Usage should be fairly straight forward:
31
+ #
32
+ # # get the parent ole storage object
33
+ # ole = Ole::Storage.open 'myfile.msg', 'r+'
34
+ # # => #<Ole::Storage io=#<File:myfile.msg> root=#<Dirent:"Root Entry">>
35
+ # # read some data
36
+ # ole.root[1].read 4
37
+ # # => "\001\000\376\377"
38
+ # # get the top level root object and output a tree structure for
39
+ # # debugging
40
+ # puts ole.root.to_tree
41
+ # # =>
42
+ # - #<Dirent:"Root Entry" size=3840 time="2006-11-03T00:52:53Z">
43
+ # |- #<Dirent:"__nameid_version1.0" size=0 time="2006-11-03T00:52:53Z">
44
+ # | |- #<Dirent:"__substg1.0_00020102" size=16 data="CCAGAAAAAADAAA...">
45
+ # ...
46
+ # |- #<Dirent:"__substg1.0_8002001E" size=4 data="MTEuMA==">
47
+ # |- #<Dirent:"__properties_version1.0" size=800 data="AAAAAAAAAAABAA...">
48
+ # \- #<Dirent:"__recip_version1.0_#00000000" size=0 time="2006-11-03T00:52:53Z">
49
+ # |- #<Dirent:"__substg1.0_0FF60102" size=4 data="AAAAAA==">
50
+ # ...
51
+ # # write some data, and finish up (note that open is 'r+', so this overwrites
52
+ # # but doesn't truncate)
53
+ # ole.root["\001CompObj"].open { |f| f.write "blah blah" }
54
+ # ole.close
55
+ #
56
+ # = TODO
57
+ #
58
+ # 1. tests. lock down how things work at the moment - mostly good.
59
+ # create from scratch works now, as does copying in a subtree of another doc, so
60
+ # ole embedded attachment serialization works now. i can save embedded xls in an msg
61
+ # into a separate file, and open it. this was a goal. now i would want to implement
62
+ # to_mime conversion for embedded attachments, that serializes them to ole, but handles
63
+ # some separately like various meta file types as plain .wmf attachments perhaps. this
64
+ # will give pretty good .eml's from emails with embedded attachments.
65
+ # the other todo is .rtf output, with full support for embedded ole objects...
66
+ # 2. lots of tidying up
67
+ # - main FIXME's in this regard are:
68
+ # * the custom header cruft for Header and Dirent needs some love.
69
+ # * i have a number of classes doing load/save combos: Header, AllocationTable, Dirent,
70
+ # and, in a manner of speaking, but arguably different, Storage itself.
71
+ # they have differing api's which would be nice to clean.
72
+ # AllocationTable::Big must be created aot now, as it is used for all subsequent reads.
73
+ # * ole types need work, can't serialize datetime at the moment.
74
+ # 3. need to fix META_BAT support in #flush.
75
+ #
76
+ class Storage
77
+ VERSION = '1.1.3'
78
+
79
+ # The top of the ole tree structure
80
+ attr_reader :root
81
+ # The tree structure in its original flattened form. only valid after #load, or #flush.
82
+ attr_reader :dirents
83
+ # The underlying io object to/from which the ole object is serialized, whether we
84
+ # should close it, and whether it is writeable
85
+ attr_reader :io, :close_parent, :writeable
86
+ # Low level internals, you probably shouldn't need to mess with these
87
+ attr_reader :header, :bbat, :sbat, :sb_file
88
+
89
# Wraps +arg+ in a new storage object.
#
# +arg+ is either a filename (a String), which is opened here and closed again by
# #close, or an already open, seekable io object, which #close leaves open.
# +mode+ only applies to filenames and defaults to 'rb'; giving a mode together
# with an io object raises.
#
# If the io already holds data it is loaded, otherwise a new empty document is set up.
# (maybe include an option hash, and allow :close_parent => true, to be more general.)
def initialize arg, mode=nil
  if String === arg
    # File.open rather than Kernel#open, so that a name starting with "|" can
    # never be taken as a command to run.
    @close_parent = true
    @io = File.open arg, mode || 'rb'
  else
    raise 'unable to specify mode string with io object' if mode
    @close_parent = false
    @io = arg
  end
  # do we have this file opened for writing? don't know of a better way to tell
  # than to try a flush and see whether it raises IOError.
  @writeable = begin
    @io.flush
    true
  rescue IOError
    false
  end
  # silence undefined warning in clear
  @sb_file = nil
  # if the io object has data, we should load it, otherwise start afresh
  @io.size > 0 ? load : clear
end
111
+
112
# Builds a storage object for +arg+ / +mode+ (see #initialize).
#
# Without a block the object is returned. With a block the object is yielded,
# closed once the block is done (also when it raises), and the value of the
# block is returned instead.
def self.new arg, mode=nil
  storage = super
  return storage unless block_given?
  begin
    yield storage
  ensure
    storage.close
  end
end
121
+
122
class << self
  # Storage.open is the encouraged spelling of Storage.new
  alias_method :open, :new
  # Storage.load is the deprecated spelling of Storage.new
  alias_method :load, :new
end
128
+
129
# Loads the document from @io: the header, the big block allocation table, the
# directory entries (which are rebuilt into a tree below @root), and finally the
# small block stream together with its allocation table.
#
# Raises if the directory tree refers to an entry that does not exist, or visits
# the same entry more than once.
def load
  # we always read 512 for the header block. if the block size ends up being different,
  # what happens to the 109 fat entries. are there more/less entries?
  @io.rewind
  header_block = @io.read 512
  @header = Header.load header_block

  # create an empty bbat
  @bbat = AllocationTable::Big.new self
  # extra mbat blocks
  mbat_blocks = (0...@header.num_mbat).map { |i| i + @header.mbat_start }
  bbat_chain = (header_block[Header::SIZE..-1] + @bbat.read(mbat_blocks)).unpack 'L*'
  # am i using num_bat in the right way?
  @bbat.load @bbat.read(bbat_chain[0, @header.num_bat])

  # get block chain for directories, read it, then split it into chunks and load the
  # directory entries. empty entries are kept for now: prev / next / child are indexes
  # into this flat array, so removing anything before the tree is built would make
  # them point at the wrong entries.
  @dirents = @bbat.read(@header.dirent_start).scan(/.{#{Dirent::SIZE}}/mo).
    map { |str| Dirent.load self, str }

  # now reorder from flat into a tree
  # links are stored in some kind of balanced binary tree
  # check that everything is visited at least, and at most once
  # similarly with the blocks of the file.
  # was thinking of moving this to Dirent.to_tree instead.
  class << @dirents
    def to_tree idx=0
      return [] if idx == Dirent::EOT
      d = self[idx] or raise "directory index #{idx} out of range"
      # mark the entry before descending, so that a tree which loops back on
      # itself is reported here instead of recursing until the stack runs out.
      # only the name is shown, as inspecting a file entry needs the small block
      # table, which is not loaded yet at this point.
      raise "directory #{idx} (#{d.name.inspect}) used twice" if d.idx
      d.idx = idx
      d.children = to_tree d.child
      to_tree(d.prev) + [d] + to_tree(d.next)
    end
  end

  @root = @dirents.to_tree.first
  # only now is it safe to forget about the empty entries
  @dirents.reject! { |d| d.type_id == 0 }
  Log.warn "root name was #{@root.name.inspect}" unless @root.name == 'Root Entry'
  unused = @dirents.reject(&:idx).length
  Log.warn "* #{unused} unused directories" if unused > 0

  # FIXME i don't currently use @header.num_sbat which i should
  # hmm. nor do i write it. it means what exactly again?
  @sb_file = RangesIOResizeable.new @bbat, @root.first_block, @root.size
  @sbat = AllocationTable::Small.new self
  @sbat.load @bbat.read(@header.sbat_start)
end
177
+
178
# Writes any pending changes (when the io is writeable) and releases the io
# objects. The small block stream is always closed; the underlying io is only
# closed if it was opened by this object. Both happen even when #flush raises,
# in which case the error still propagates to the caller. Returns nil.
def close
  flush if @writeable
  nil
ensure
  begin
    @sb_file.close
  ensure
    @io.close if @close_parent
  end
end
183
+
184
+ # should have a #open_dirent i think. and use it in load and flush. neater.
185
+ # also was thinking about Dirent#open_padding. then i can more easily clean up the padding
186
+ # to be 0.chr
187
+ =begin
188
+ thoughts on fixes:
189
+ 1. reterminate any chain not ending in EOC.
190
+ 2. pass through all chain heads looking for collisions, and making sure nothing points to them
191
+ (ie they are really heads).
192
+ 3. we know the locations of the bbat data, and mbat data. ensure that there are placeholder blocks
193
+ in the bat for them.
194
+ this stuff will ensure reliability of input better. otherwise, its actually worth doing a repack
195
+ directly after read, to ensure the above is properly accounted for, before subsequent writes possibly
196
+ destroy things.
197
+ =end
198
# Serializes the in-memory document back to @io, in this order: the directory
# entries, the small block allocation table, the big block allocation table, and
# finally the header block.
#
# Extra META_BAT blocks are not written, so NotImplementedError is raised as soon
# as the bbat needs more than the 109 blocks that can be listed in the header.
def flush
  # recreate dirs from our tree. the root entry doubles as the holder of the
  # small block stream.
  @root.type = :root
  @root.name = 'Root Entry'
  @root.first_block = @sb_file.first_block
  @root.size = @sb_file.size
  @dirents = @root.flatten

  # rewrite the directory stream from scratch, zero padded up to a whole big block.
  # maybe i should move the block form up to RangesIO, and get it for free at all levels.
  # Dirent#open gets block form for free then
  dir_io = RangesIOResizeable.new @bbat, @header.dirent_start
  dir_io.truncate 0
  @dirents.each { |dirent| dir_io.write dirent.save }
  written = dir_io.size
  big_block = @bbat.block_size
  padding = (written / big_block.to_f).ceil * big_block - written
  dir_io.write 0.chr * padding
  @header.dirent_start = dir_io.first_block
  dir_io.close

  # similarly for the sbat data.
  sbat_io = RangesIOResizeable.new @bbat, @header.sbat_start
  sbat_io.truncate 0
  sbat_io.write @sbat.save
  @header.sbat_start = sbat_io.first_block
  @header.num_sbat = @bbat.chain(@header.sbat_start).length
  sbat_io.close

  # what follows is slightly more complex, as the bbat has to describe the very
  # blocks it is stored in.
  #
  # forget where the table used to live, then use a RangesIOResizeable hooked up to
  # the bbat to claim blocks for it using truncate. when its time to write, that
  # chain is converted into BAT blocks and listed in the header.
  @bbat.table.map! do |entry|
    if entry == AllocationTable::BAT || entry == AllocationTable::META_BAT
      AllocationTable::AVAIL
    else
      entry
    end
  end
  bat_io = RangesIOResizeable.new @bbat, AllocationTable::EOC

  # use crappy loop for now: claiming blocks may itself grow the table, so keep
  # going until the claimed space is enough for the serialized table.
  loop do
    bbat_data = @bbat.save
    if @bbat.chain(bat_io.first_block).length > 109
      raise NotImplementedError, "don't handle writing out extra META_BAT blocks yet"
    end
    # so we can ignore meta blocks in this calculation:
    break if bat_io.size >= bbat_data.length # it shouldn't be bigger right?
    # this may grow the bbat, depending on existing available blocks
    bat_io.truncate bbat_data.length
  end

  # now extract the info we want:
  bat_ranges = bat_io.ranges
  bat_blocks = @bbat.chain bat_io.first_block
  bat_io.close
  bat_blocks.each { |block| @bbat.table[block] = AllocationTable::BAT }
  @header.num_bat = bat_blocks.length
  # not resizeable!
  raw_io = RangesIO.new @io, bat_ranges
  raw_io.write @bbat.save
  raw_io.close
  # the header always lists 109 entries, unused ones being AVAIL
  header_bat = bat_blocks + [AllocationTable::AVAIL] * (109 - bat_blocks.length)
  @header.mbat_start = AllocationTable::EOC
  @header.num_mbat = 0

  @root.type = :dir

  # now seek back and write the header out
  @io.seek 0
  @io.write @header.save + header_bat.pack('L*')
  @io.flush
end
290
+
291
# Resets this object to the equivalent of a freshly loaded, empty ole document:
# default header, empty allocation tables, a lone root entry, and an underlying
# io truncated to nothing. Warns when the io is not writeable.
def clear
  Log.warn 'creating new ole storage object on non-writable io' unless @writeable
  @header = Header.new
  @bbat = AllocationTable::Big.new self

  root = Dirent.new self, :dir
  root.name = 'Root Entry'
  root.idx = 0
  root.children = []
  # size shouldn't display for non-files
  root.size = 0
  @root = root
  @dirents = [root]

  # the small block table looks at our sb_file when it is created, so the new
  # stream has to be in place first.
  @sb_file.close if @sb_file
  @sb_file = RangesIOResizeable.new @bbat, AllocationTable::EOC
  @sbat = AllocationTable::Small.new self
  # throw everything else the hell away
  @io.truncate 0
end
309
+
310
# Rewrites the whole document from scratch, via a temporary copy of the current
# data. could be useful with mis-behaving ole documents. or to just clean them up.
#
# +temp+ selects where the temporary copy is kept: :file (the default) for a
# Tempfile, :mem for a StringIO. Anything else raises.
def repack temp=:file
  case temp
  when :file
    # the argument to Tempfile.open is the basename of the temp file, not a mode.
    Tempfile.open 'ole-repack' do |temp_io|
      # ole data is binary; don't let the platform translate line endings.
      temp_io.binmode
      repack_using_io temp_io
    end
  when :mem
    StringIO.open { |temp_io| repack_using_io temp_io }
  else
    raise "unknown temp backing #{temp.inspect}"
  end
end
318
+
319
# Does the work for #repack: copies the current data into +temp_io+, wipes this
# document, and then copies the tree found in the temporary copy back in below
# our (new) root.
def repack_using_io temp_io
  @io.rewind
  IO.copy @io, temp_io
  clear
  Storage.open(temp_io) do |backup|
    backup_root = backup.root
    backup_root.type = :dir
    Dirent.copy backup_root, root
  end
end
328
+
329
# Picks the allocation table responsible for a stream of +size+ bytes: the small
# block table below the header's threshold, the big block table from the
# threshold upwards (note >=, not > previously).
def bat_for_size size
  return @sbat if size < @header.threshold
  @bbat
end
333
+
334
# Short description naming the class, the underlying io and the root entry.
def inspect
  '#<' + self.class.to_s + ' io=' + @io.inspect + ' root=' + @root.inspect + '>'
end
337
+
338
# A class which wraps the ole header, i.e. the first Header::SIZE bytes of the file.
class Header < Struct.new(
  :magic, :clsid, :minor_ver, :major_ver, :byte_order, :b_shift, :s_shift,
  :reserved, :csectdir, :num_bat, :dirent_start, :transacting_signature, :threshold,
  :sbat_start, :num_sbat, :mbat_start, :num_mbat
)
  # pack / unpack template for the members above, in order
  PACK = 'a8 a16 S2 a2 S2 a6 L3 a4 L5'
  SIZE = 0x4c
  # i have seen it pointed out that the first 4 bytes of hex,
  # 0xd0cf11e0, is supposed to spell out docfile. hmmm :)
  #
  # built with pack rather than written as a "\x.." literal, so that it is a plain
  # byte string: unpacked file data is binary, and on rubies with string encodings
  # a non-binary literal holding these bytes never compares equal to it.
  MAGIC = [0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1].pack('C*') # expected value of Header#magic
  # the only byte order we handle (little endian); a byte string for the same reason
  BYTE_ORDER = [0xfe, 0xff].pack('C*')
  # what you get if creating new header from scratch.
  # AllocationTable::EOC isn't available yet. meh.
  EOC = 0xfffffffe
  DEFAULT = [
    MAGIC, 0.chr * 16, 59, 3, BYTE_ORDER, 9, 6,
    0.chr * 6, 0, 1, EOC, 0.chr * 4,
    4096, EOC, 0, EOC, 0
  ]

  # 2 basic initializations, from scratch, or from a data string.
  # from scratch (no +values+) will be geared towards creating a new ole object.
  # either way the result is validated, see #validate!
  def initialize *values
    super(*(values.empty? ? DEFAULT : values))
    validate!
  end

  # Creates a header from the serialized form in +str+. only the leading SIZE
  # bytes are looked at. raises if they don't make up a valid header.
  def self.load str
    new(*str.unpack(PACK))
  end

  # Returns the serialized form, a string of SIZE bytes.
  def save
    to_a.pack PACK
  end

  # Checks the header values. Raises for a wrong signature and for values that
  # cannot be handled, merely warns about suspicious ones. Returns true otherwise.
  def validate!
    raise "OLE2 signature is invalid" unless magic == MAGIC
    if num_bat == 0 || # is that valid for a completely empty file?
       # not sure about this one. basically to do max possible bat given size of mbat
       # (each mbat block holds block size / 4 entries)
       (num_bat > 109 && num_bat > 109 + num_mbat * (1 << (b_shift - 2))) ||
       # shouldn't need to use the mbat as there is enough space in the header block
       (num_bat < 109 && num_mbat != 0) ||
       # given the size of the header is 76, if b_shift <= 6, blocks address the header.
       s_shift > b_shift || b_shift <= 6 || b_shift >= 31 ||
       # we only handle little endian
       byte_order != BYTE_ORDER
      raise "not valid OLE2 structured storage file"
    end
    # relaxed this, due to test-msg/qwerty_[1-3]*.msg they all had
    # 3 for this value.
    # transacting_signature != "\x00" * 4 or
    if threshold != 4096 ||
       (num_mbat == 0 && mbat_start != EOC) ||
       reserved != "\x00" * 6
      Log.warn "may not be a valid OLE2 structured storage file"
    end
    true
  end
end
397
+
398
+ #
399
+ # +AllocationTable+'s hold the chains corresponding to files. Given
400
+ # an initial index, <tt>AllocationTable#chain</tt> follows the chain, returning
401
+ # the blocks that make up that file.
402
+ #
403
+ # There are 2 allocation tables, the bbat, and sbat, for big and small
404
+ # blocks respectively. The block chain should be loaded using either
405
+ # <tt>Storage#read_big_blocks</tt> or <tt>Storage#read_small_blocks</tt>
406
+ # as appropriate.
407
+ #
408
+ # Whether or not big or small blocks are used for a file depends on
409
+ # whether its size is over the <tt>Header#threshold</tt> level.
410
+ #
411
+ # An <tt>Ole::Storage</tt> document is serialized as a series of directory objects,
412
+ # which are stored in blocks throughout the file. The blocks are either
413
+ # big or small, and are accessed using the <tt>AllocationTable</tt>.
414
+ #
415
+ # The bbat allocation table's data is stored in the spare room in the header
416
+ # block, and in extra blocks throughout the file as referenced by the meta
417
+ # bat. That chain is linear, as there is no higher level table.
418
+ #
419
class AllocationTable
  # a free block (I don't currently leave any blocks free), although I do pad out
  # the allocation table with AVAIL to the block size.
  AVAIL = 0xffffffff
  EOC = 0xfffffffe # end of a chain
  # these blocks correspond to the bat, and aren't part of a file, nor available.
  # (I don't currently output these)
  BAT = 0xfffffffd
  META_BAT = 0xfffffffc

  attr_reader :ole, :io, :table, :block_size

  # Creates an empty table belonging to the storage object +ole+.
  def initialize ole
    @ole = ole
    @table = []
  end

  # Replaces the table with the entries serialized in +data+.
  def load data
    @table = data.unpack('L*')
  end

  # Returns a copy of the table with trailing AVAIL entries removed. A table made
  # up of nothing but AVAIL entries is returned whole.
  #
  # come to think of it, this has the potential to break bogus ole. if you terminate
  # using AVAIL instead of EOC, like I did before. but that is very broken. however,
  # if a chain ends with AVAIL, it should probably be fixed to EOC at load time.
  def truncated_table
    keep = @table.length
    keep -= 1 while keep > 0 && @table[keep - 1] == AVAIL
    keep == 0 ? @table.dup : @table[0, keep]
  end

  # Returns the serialized table: the truncated table, padded out with AVAIL to a
  # whole number of big blocks.
  def save
    table = truncated_table #@table
    # pad it out some
    num = @ole.bbat.block_size / 4
    # do you really use AVAIL? they probably extend past end of file, and may shortly
    # be used for the bat. not really good.
    table += [AVAIL] * (num - (table.length % num)) if (table.length % num) != 0
    table.pack 'L*'
  end

  # Follows the chain beginning at block +start+, returning the array of block
  # indexes that make it up (empty if +start+ is itself a terminator).
  #
  # Raises if the chain leaves the table or never terminates; merely warns if it
  # is terminated by something other than EOC.
  #
  # non-recursive, as recursion broke on a large attachment, causing a stack error.
  def chain start
    a = []
    idx = start
    until idx >= META_BAT
      # note >=: an index equal to the table length has no entry either
      raise "broken allocationtable chain" if idx < 0 || idx >= @table.length
      # a chain visiting more blocks than the table has entries must be going
      # round in circles
      raise "broken allocationtable chain" if a.length >= @table.length
      a << idx
      idx = @table[idx]
    end
    Log.warn "invalid chain terminator #{idx}" unless idx == EOC
    a
  end

  # Like #blocks_to_ranges, but +chain+ may also be the head of a chain, in which
  # case the table is used to turn it into a chain first.
  def ranges chain, size=nil
    chain = self.chain(chain) unless Array === chain
    blocks_to_ranges chain, size
  end

  # Turn a chain (an array given by +chain+) of blocks, optionally
  # truncated to +size+, into an array of [offset, length] pairs describing the
  # stretches of bytes in the backing io that it belongs to.
  def blocks_to_ranges chain, size=nil
    # truncate the chain if required
    chain = chain[0...(size.to_f / block_size).ceil] if size
    # convert chain to ranges of the block size
    ranges = chain.map { |i| [block_size * i, block_size] }
    # truncate final range if required
    ranges.last[1] -= (ranges.length * block_size - size) if ranges.last and size
    ranges
  end

  # quick shortcut. chain can be either a head (in which case the table is used to
  # turn it into a chain), or a chain. it is converted to ranges, then to rangesio,
  # which is yielded and then closed if a block is given, and returned otherwise.
  # its not resizeable or migrateable. it probably could be resizeable though, using
  # self as the bat. but what would the first_block be?
  def open chain, size=nil
    io = RangesIO.new @io, ranges(chain, size)
    return io unless block_given?
    begin
      yield io
    ensure
      io.close
    end
  end

  # Returns the data of +chain+ (see #open), optionally limited to +size+ bytes.
  def read chain, size=nil
    open(chain, size) { |io| io.read }
  end

  # ----------------------

  # Returns the index of a block marked AVAIL, growing the table by one entry if
  # there is none. The entry is not changed, so the caller needs to claim it.
  def get_free_block
    free = @table.index AVAIL
    return free if free
    @table.push AVAIL
    @table.length - 1
  end

  # Grows or shrinks the chain starting at +first_block+, such that it has room
  # for +size+ bytes. Returns the (possibly new) first block of the chain, which
  # is EOC for an empty chain.
  def resize_chain first_block, size
    new_num_blocks = (size / block_size.to_f).ceil
    blocks = chain first_block
    old_num_blocks = blocks.length
    if new_num_blocks < old_num_blocks
      # de-allocate some of our old blocks. TODO maybe zero them out in the file???
      (new_num_blocks...old_num_blocks).each { |i| @table[blocks[i]] = AVAIL }
      # if we have a chain, terminate it and return head, otherwise return EOC
      return EOC unless new_num_blocks > 0
      @table[blocks[new_num_blocks - 1]] = EOC
    elsif new_num_blocks > old_num_blocks
      # need some more blocks.
      last_block = blocks.last
      (new_num_blocks - old_num_blocks).times do
        block = get_free_block
        # connect the chain. handle corner case of blocks being [] initially
        if last_block
          @table[last_block] = block
        else
          first_block = block
        end
        last_block = block
        # this is just to inhibit the problem where it gets picked as being a free block
        # again next time around.
        @table[last_block] = EOC
      end
    end
    first_block
  end

  # The allocation table for big blocks, which live directly in the ole's io.
  class Big < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.b_shift
      @io = @ole.io
    end

    # Big blocks are kind of -1 based, in order to not clash with the header.
    def blocks_to_ranges blocks, size=nil
      super blocks.map { |b| b + 1 }, size
    end
  end

  # The allocation table for small blocks, which live in the ole's sb_file.
  class Small < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.s_shift
      @io = @ole.sb_file
    end
  end
end
577
+
578
+ # like normal RangesIO, but Ole::Storage specific. the ranges are backed by an
579
+ # AllocationTable, and can be resized. used for read/write to 2 streams:
580
+ # 1. serialized dirent data
581
+ # 2. sbat table data
582
+ # 3. all dirents but through RangesIOMigrateable below
583
+ #
584
+ # Note that all internal access to first_block is through accessors, as it is sometimes
585
+ # useful to redirect it.
586
+ class RangesIOResizeable < RangesIO
587
+ attr_reader :bat
588
+ attr_accessor :first_block
589
# +bat+ is the allocation table backing this io, +first_block+ the head of the
# chain within it, and +size+ the optional length in bytes of the data.
def initialize bat, first_block, size=nil
  @bat = bat
  # assigned through the accessor on purpose, as it is sometimes redirected
  self.first_block = first_block
  backing_io = bat.io
  chain_ranges = bat.ranges first_block, size
  super backing_io, chain_ranges
end
594
+
595
# Resizes the data to +size+ bytes, growing or shrinking the chain in the
# allocation table as needed. A position beyond the new end is moved back to
# the new end. Returns +size+.
def truncate size
  # note that old_blocks is != @ranges.length necessarily. i'm planning to write a
  # merge_ranges function that merges sequential ranges into one as an optimization.
  self.first_block = @bat.resize_chain first_block, size
  @ranges = @bat.ranges first_block, size
  # clamp to the new size (@size still holds the old one at this point)
  @pos = size if @pos > size

  # don't know if this is required, but we explicitly request our @io to grow if necessary
  # we never shrink it though. maybe this belongs in allocationtable, where smarter decisions
  # can be made.
  # maybe its ok to just seek out there later??
  max = @ranges.map { |pos, len| pos + len }.max || 0
  @io.truncate max if max > @io.size

  @size = size
end
611
+ end
612
+
613
+ # like RangesIOResizeable, but Ole::Storage::Dirent specific. provides for migration
614
+ # between bats based on size, and updating the dirent, instead of the ole copy back
615
+ # on close.
616
class RangesIOMigrateable < RangesIOResizeable
  attr_reader :dirent

  # Opens the data of +dirent+, using whichever of the ole's allocation tables is
  # responsible for a stream of its size.
  def initialize dirent
    @dirent = dirent
    super @dirent.ole.bat_for_size(@dirent.size), @dirent.first_block, @dirent.size
  end

  # Resizes the data to +size+ bytes and records the new size in the dirent. If
  # the new size belongs in the other allocation table, the data (as much of it
  # as survives the resize) is moved over.
  def truncate size
    bat = @dirent.ole.bat_for_size size
    if bat != @bat
      # bat migration needed! we need to backup some data. the amount of data
      # should be <= @ole.header.threshold, so we can just hold it all in one buffer.
      # the position is restored afterwards, but must not end up beyond the new end.
      pos = [@pos, size].min
      @pos = 0
      keep = read [@size, size].min
      # this does a normal truncate to 0, removing our presence from the old bat, and
      # rewrite the dirent's first_block
      super 0
      @bat = bat
      # just change the underlying io from right under everyone :)
      @io = bat.io
      # important to do this now, before the write. as the below write will always
      # migrate us back to sbat! this will now allocate us +size+ in the new bat.
      super
      @pos = 0
      write keep
      @pos = pos
    else
      super
    end
    # now just update the file
    @dirent.size = size
  end

  # forward this to the dirent
  def first_block
    @dirent.first_block
  end

  # forwarded to the dirent as well
  def first_block= val
    @dirent.first_block = val
  end
end
660
+
661
+ #
662
+ # A class which wraps an ole directory entry. Can be either a directory
663
+ # (<tt>Dirent#dir?</tt>) or a file (<tt>Dirent#file?</tt>)
664
+ #
665
+ # Most interaction with <tt>Ole::Storage</tt> is through this class.
666
+ # The 2 most important functions are <tt>Dirent#children</tt>, and
667
+ # <tt>Dirent#data</tt>.
668
+ #
669
+ # was considering separate classes for dirs and files. some methods/attrs only
670
+ # applicable to one or the other.
671
+ #
672
+ # Note that Dirent is still using a home grown Struct variant, with explicit
673
+ # MEMBERS etc. any reason for that still?
674
+ #
675
class Dirent
  # names of the serialized fields, in the order given by PACK
  MEMBERS = [
    :name_utf16, :name_len, :type_id, :colour, :prev, :next, :child,
    :clsid, :flags, # dirs only
    :create_time_str, :modify_time_str, # files only
    :first_block, :size, :reserved
  ]
  PACK = 'a64 S C C L3 a16 L a8 a8 L2 a4'
  SIZE = 128
  TYPE_MAP = {
    # this is temporary
    0 => :empty,
    1 => :dir,
    2 => :file,
    5 => :root
  }
  COLOUR_MAP = {
    0 => :red,
    1 => :black
  }
  # used in the next / prev / child stuff to show that the tree ends here.
  # also used for first_block for directory.
  EOT = 0xffffffff

  include Enumerable

  # a reader and a writer for each of the serialized fields
  MEMBERS.each_with_index do |sym, i|
    define_method(sym) { @values[i] }
    define_method(sym.to_s + '=') { |val| @values[i] = val }
  end

  # Dirent's should be created in 1 of 2 ways, either Dirent.new ole, [:dir/:file/:root],
  # or Dirent.load '... dirent data ...'
  # its a bit clunky, but thats how it is at the moment. you can assign to type, but
  # shouldn't.

  # Position within the flattened list of dirents. Set by Storage#load and #flatten.
  attr_accessor :idx
  # This returns all the children of this +Dirent+. It is filled in
  # when the tree structure is recreated.
  attr_accessor :children
  attr_reader :ole, :create_time, :modify_time
  attr_accessor :name, :type

  # Creates a new, empty entry of the given +type+ (:dir, :file or :root)
  # belonging to the storage object +ole+.
  def initialize ole, type
    @ole = ole
    # this isn't really good enough. need default values put in there.
    @values = [
      0.chr * 2, 2, 0, # will get overwritten
      1, EOT, EOT, EOT,
      0.chr * 16, 0, nil, nil,
      AllocationTable::EOC, 0, 0.chr * 4]
    # maybe check types here.
    @type = type
    @name = nil
    @idx = nil
    @create_time = @modify_time = nil
    @children = []
    if file?
      @create_time = Time.now
      @modify_time = Time.now
    end
  end

  # Creates an entry from its serialized form +str+.
  def self.load ole, str
    # load should function without the need for the initializer.
    dirent = Dirent.allocate
    dirent.load ole, str
    dirent
  end

  # Fills in this entry from its serialized form +str+. Raises for a type id that
  # is not in TYPE_MAP.
  def load ole, str
    @ole = ole
    @values = str.unpack PACK
    @name = Types::FROM_UTF16.iconv name_utf16[0...name_len].sub(/\x00\x00$/, '')
    @type = TYPE_MAP[type_id] or raise "unknown type #{type_id.inspect}"
    if file?
      @create_time = Types.load_time create_time_str
      @modify_time = Types.load_time modify_time_str
    end
  end

  # Opens the data of a file for reading and, if the underlying io object was opened
  # for writing, writing. Returns nil for anything but a file. Otherwise the io
  # is returned, or, given a block, yielded and closed again afterwards.
  #
  # maybe i should have some sort of File and Dir class, that subclass Dirents? a dirent
  # is just a data holder.
  # maybe take a mode string argument, and do truncation, append etc stuff.
  def open
    return nil unless file?
    io = RangesIOMigrateable.new self
    return io unless block_given?
    begin
      yield io
    ensure
      io.close
    end
  end

  # Reads up to +limit+ bytes (everything by default) of the data of a file.
  def read limit=nil
    open { |io| io.read limit }
  end

  def dir?
    # to count root as a dir.
    type != :file
  end

  def file?
    type == :file
  end

  # The creation time if there is one, otherwise the modification time.
  def time
    # time is nil for streams, otherwise try to parse either of the time pairs (not
    # sure of their meaning - created / modified?)
    create_time || modify_time
  end

  # Yields each child in turn.
  def each(&block)
    @children.each(&block)
  end

  # Returns the child at position +idx+ if it is an integer, and otherwise the
  # first child whose name is matched (using ===) by +idx+.
  def [] idx
    return children[idx] if Integer === idx
    # path style look up.
    # maybe take another arg to allow creation? or leave that to the filesystem
    # add on.
    # not sure if '/' is a valid char in an Dirent#name, so no splitting etc at
    # this level.
    # also what about warning about multiple hits for the same name?
    children.find { |child| idx === child.name }
  end

  # solution for the above '/' thing for now.
  def / path
    self[path]
  end

  # Returns a multi-line text rendering of the tree below this entry.
  def to_tree
    return "- #{inspect}\n" unless children and !children.empty?
    str = "- #{inspect}\n"
    children.each_with_index do |child, i|
      last = i == children.length - 1
      child.to_tree.split(/\n/).each_with_index do |line, j|
        str << " #{last ? (j == 0 ? "\\" : ' ') : '|'}#{line}\n"
      end
    end
    str
  end

  # The serialized fields, in MEMBERS order.
  def to_a
    @values
  end

  # flattens the tree starting from here into +dirents+. note it modifies its argument.
  # the idx, prev, next and child links of the entries are updated along the way.
  def flatten dirents=[]
    @idx = dirents.length
    dirents << self
    children.each { |child| child.flatten dirents }
    self.child = Dirent.flatten_helper children
    dirents
  end

  # Links +children+ into a binary tree through their prev / next fields, and
  # returns the idx of the child at its top (EOT if there are no children).
  #
  # i think making the tree structure optimized is actually more complex than this, and
  # requires some intelligent ordering of the children based on names, but as long as
  # it is valid its ok.
  # actually, i think its ok. gsf for example only outputs a singly-linked-list, where
  # prev is always EOT.
  def self.flatten_helper children
    return EOT if children.empty?
    i = children.length / 2
    this = children[i]
    this.prev, this.next = [(0...i), (i+1..-1)].map { |r| flatten_helper children[r] }
    this.idx
  end

  # Returns the serialized form of this entry, a string of SIZE bytes. The name,
  # type and, for directories, first_block fields are brought up to date first.
  # Raises if the type is not one of those in TYPE_MAP.
  def save
    tmp = Types::TO_UTF16.iconv(name)
    tmp = tmp[0, 62] if tmp.length > 62
    tmp += 0.chr * 2
    self.name_len = tmp.length
    self.name_utf16 = tmp + 0.chr * (64 - tmp.length)
    entry = TYPE_MAP.to_a.find { |_id, sym| @type == sym }
    raise "unknown type #{type.inspect}" unless entry
    self.type_id = entry.first
    # for the case of files, it is assumed that that was handled already
    # note not dir?, so as not to override root's first_block
    self.first_block = Dirent::EOT if type == :dir
    # time stamps can't be serialized yet. keep whatever was loaded, and fall back
    # to all zeroes for entries created from scratch, which have none.
    self.create_time_str ||= 0.chr * 8
    self.modify_time_str ||= 0.chr * 8
    @values.pack PACK
  end

  def inspect
    str = "#<Dirent:#{name.inspect}"
    # perhaps i should remove the data snippet. its not that useful anymore.
    if file?
      # to_s in case nothing at all comes back from the read
      tmp = read(9).to_s
      data = tmp.length == 9 ? tmp[0, 5] + '...' : tmp
      str << " size=#{size}" +
        "#{time ? ' time=' + time.to_s.inspect : nil}" +
        " data=#{data.inspect}"
    end
    # there is some dir specific stuff too, like clsid, flags, which isn't shown.
    str + '>'
  end

  # --------
  # and for creation of a dirent. don't like the name. is it a file or a directory?
  # assign to type later? io will be empty.
  #
  # Appends a new child of the given +type+, yields it if a block is given, and
  # returns it.
  def new_child type
    child = Dirent.new ole, type
    children << child
    yield child if block_given?
    child
  end

  # Removes +child+ from this entry, freeing the blocks of its data if it is a
  # file. Raises if it is not one of our children.
  def delete child
    # remove from our child array, so that on reflatten and re-creation of @dirents, it will be gone
    raise "#{child.inspect} not a child of #{self.inspect}" unless @children.delete child
    # free our blocks
    child.open { |io| io.truncate 0 }
  end

  # copies the contents of src to dst. must be the same type. this will throw an
  # error on copying to root. maybe this will recurse too much for big documents??
  def self.copy src, dst
    raise 'differing types' if src.type == :file and dst.type != :file
    dst.name = src.name
    if src.dir?
      src.children.each do |src_child|
        dst.new_child(src_child.type) { |dst_child| Dirent.copy src_child, dst_child }
      end
    else
      src.open do |src_io|
        dst.open { |dst_io| IO.copy src_io, dst_io }
      end
    end
  end
end
921
+ end
922
+ end
923
+
924
# when run as a script, print the tree of the ole file named on the command line
if __FILE__ == $0
  tree = Ole::Storage.open(ARGV[0]) do |ole|
    ole.root.to_tree
  end
  puts tree
end
927
+