ruby-ole 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,925 @@
1
+ #! /usr/bin/ruby -w
2
+
3
+ $: << File.dirname(__FILE__) + '/..'
4
+
5
+ require 'stringio'
6
+ require 'tempfile'
7
+
8
+ require 'ole/base'
9
+ require 'ole/types'
10
+ # not strictly ole related
11
+ require 'ole/io_helpers'
12
+
13
+ module Ole # :nodoc:
14
+ #
15
+ # = Introduction
16
+ #
17
+ # <tt>Ole::Storage</tt> is a class intended to abstract away details of the
18
+ # access to OLE2 structured storage files, such as those produced by
19
+ # Microsoft Office, eg *.doc, *.msg etc.
20
+ #
21
+ # Initially based on chicago's libole, source available at
22
+ # http://prdownloads.sf.net/chicago/ole.tgz
23
+ # Later augmented with some corrections by inspecting pole, and (purely
24
+ # for header definitions) gsf.
25
+ #
26
+ # = Usage
27
+ #
28
+ # Usage should be fairly straightforward:
29
+ #
30
+ # # get the parent ole storage object
31
+ # ole = Ole::Storage.open 'myfile.msg', 'r+'
32
+ # # => #<Ole::Storage io=#<File:myfile.msg> root=#<Dirent:"Root Entry">>
33
+ # # read some data
34
+ # ole.root[1].read 4
35
+ # # => "\001\000\376\377"
36
+ # # get the top level root object and output a tree structure for
37
+ # # debugging
38
+ # puts ole.root.to_tree
39
+ # # =>
40
+ # - #<Dirent:"Root Entry" size=3840 time="2006-11-03T00:52:53Z">
41
+ # |- #<Dirent:"__nameid_version1.0" size=0 time="2006-11-03T00:52:53Z">
42
+ # | |- #<Dirent:"__substg1.0_00020102" size=16 data="CCAGAAAAAADAAA...">
43
+ # ...
44
+ # |- #<Dirent:"__substg1.0_8002001E" size=4 data="MTEuMA==">
45
+ # |- #<Dirent:"__properties_version1.0" size=800 data="AAAAAAAAAAABAA...">
46
+ # \- #<Dirent:"__recip_version1.0_#00000000" size=0 time="2006-11-03T00:52:53Z">
47
+ # |- #<Dirent:"__substg1.0_0FF60102" size=4 data="AAAAAA==">
48
+ # ...
49
+ # # write some data, and finish up (note that open is 'r+', so this overwrites
50
+ # # but doesn't truncate)
51
+ # ole.root["\001CompObj"].open { |f| f.write "blah blah" }
52
+ # ole.close
53
+ #
54
+ # = TODO
55
+ #
56
+ # 1. tests. lock down how things work at the moment - mostly good.
57
+ # create from scratch works now, as does copying in a subtree of another doc, so
58
+ # ole embedded attachment serialization works now. i can save embedded xls in an msg
59
+ # into a separate file, and open it. this was a goal. now i would want to implement
60
+ # to_mime conversion for embedded attachments, that serializes them to ole, but handles
61
+ # some separately like various meta file types as plain .wmf attachments perhaps. this
62
+ # will give pretty good .eml's from emails with embedded attachments.
63
+ # the other todo is .rtf output, with full support for embedded ole objects...
64
+ # 2. lots of tidying up
65
+ # - main FIXME's in this regard are:
66
+ # * the custom header cruft for Header and Dirent needs some love.
67
+ # * i have a number of classes doing load/save combos: Header, AllocationTable, Dirent,
68
+ # and, in a manner of speaking, but arguably different, Storage itself.
69
+ # they have differing api's which would be nice to clean.
70
+ # AllocationTable::Big must be created aot now, as it is used for all subsequent reads.
71
+ # * ole types need work, can't serialize datetime at the moment.
72
+ # 3. need to fix META_BAT support in #flush.
73
+ #
74
+ class Storage
75
+ VERSION = '1.2.1'
76
+
77
+ # The top of the ole tree structure
78
+ attr_reader :root
79
+ # The tree structure in its original flattened form. only valid after #load, or #flush.
80
+ attr_reader :dirents
81
+ # The underlying io object to/from which the ole object is serialized, whether we
82
+ # should close it, and whether it is writeable
83
+ attr_reader :io, :close_parent, :writeable
84
+ # Low level internals, you probably shouldn't need to mess with these
85
+ attr_reader :header, :bbat, :sbat, :sb_file
86
+
87
+ # maybe include an option hash, and allow :close_parent => true, to be more general.
88
+ # +arg+ should be either a file, or an +IO+ object, and needs to be seekable.
89
# Create a storage object on top of +arg+.
#
# +arg+ is either a filename (String), which is opened here using +mode+
# (default 'rb') and closed again by #close, or an already opened, seekable
# io object, in which case +mode+ must not be given and the io is left open
# by #close.
#
# If the io already has data it is loaded, otherwise an empty document is
# set up via #clear.
#
# Raises RuntimeError if a mode string is combined with an io object.
def initialize arg, mode=nil
  # get the io object
  if String === arg
    # File.open, not Kernel#open: the latter spawns a subprocess when the
    # name starts with "|", which must never happen for a document path.
    @close_parent = true
    @io = File.open arg, mode || 'rb'
  else
    raise 'unable to specify mode string with io object' if mode
    @close_parent = false
    @io = arg
  end
  # do we have this file opened for writing? don't know of a better way to tell
  @writeable = begin
    @io.flush
    true
  rescue IOError
    false
  end
  # silence undefined warning in clear
  @sb_file = nil
  # if the io object has data, we should load it, otherwise start afresh.
  # should that fail, don't leak the file handle we opened ourselves.
  ready = false
  begin
    @io.size > 0 ? load : clear
    ready = true
  ensure
    @io.close if @close_parent && !ready
  end
end
109
+
110
# Construct a storage object. Without a block the object is returned.
# With a block, the object is yielded and closed afterwards (even when the
# block raises), and the block's value is returned.
def self.new arg, mode=nil
  storage = super
  return storage unless block_given?
  begin
    yield storage
  ensure
    storage.close
  end
end
119
+
120
# Class-level aliases for the (block aware) constructor.
class << self
  # encouraged
  alias_method :open, :new
  # deprecated
  alias_method :load, :new
end
126
+
127
+ # load document from file.
128
# Load the document from @io: parse the header, the big block allocation
# table, the directory entries (rebuilt into a tree rooted at @root) and the
# small block allocation table.
#
# Raises RuntimeError when a directory entry is linked twice or a link
# points outside of the directory array. Header.load is called first and may
# raise as well.
def load
  # we always read 512 for the header block. if the block size ends up being different,
  # what happens to the 109 fat entries. are there more/less entries?
  @io.rewind
  header_block = @io.read 512
  @header = Header.load header_block

  # create an empty bbat
  @bbat = AllocationTable::Big.new self
  # extra mbat blocks
  mbat_blocks = (0...@header.num_mbat).map { |i| i + @header.mbat_start }
  bbat_chain = (header_block[Header::SIZE..-1] + @bbat.read(mbat_blocks)).unpack 'L*'
  # am i using num_bat in the right way?
  @bbat.load @bbat.read(bbat_chain[0, @header.num_bat])

  # get block chain for directories, read it, then split it into chunks and load the
  # directory entries. empty entries (type_id == 0) are kept for now: prev / next /
  # child are positions in this unfiltered array, so dropping entries before the
  # links are resolved would make them point at the wrong dirents.
  @dirents = @bbat.read(@header.dirent_start).scan(/.{#{Dirent::SIZE}}/mo).
    map { |str| Dirent.load self, str }

  # now reorder from flat into a tree
  # links are stored in some kind of balanced binary tree
  # check that everything is visited at least, and at most once
  # similarly with the blocks of the file.
  # was thinking of moving this to Dirent.to_tree instead.
  class << @dirents
    def to_tree idx=0
      return [] if idx == Dirent::EOT
      d = self[idx]
      raise "directory index #{idx} out of range" unless d
      d.children = to_tree d.child
      raise "directory #{d.inspect} used twice" if d.idx
      d.idx = idx
      to_tree(d.prev) + [d] + to_tree(d.next)
    end
  end

  @root = @dirents.to_tree.first
  # only now that all links are resolved is it safe to drop the empty entries
  @dirents.reject! { |d| d.type_id == 0 }
  Log.warn "root name was #{@root.name.inspect}" unless @root.name == 'Root Entry'
  unused = @dirents.reject(&:idx).length
  Log.warn "* #{unused} unused directories" if unused > 0

  # FIXME i don't currently use @header.num_sbat which i should
  # hmm. nor do i write it. it means what exactly again?
  @sb_file = RangesIOResizeable.new @bbat, @root.first_block, @root.size
  @sbat = AllocationTable::Small.new self
  @sbat.load @bbat.read(@header.sbat_start)
end
175
+
176
# Write out pending changes (when the io is writeable) and release the
# small block file and, if we opened it ourselves, the underlying io.
#
# The io objects are closed even when #flush raises; the exception is then
# propagated to the caller.
def close
  flush if @writeable
ensure
  begin
    @sb_file.close if @sb_file
  ensure
    @io.close if @close_parent
  end
end
181
+
182
+ # should have a #open_dirent i think. and use it in load and flush. neater.
183
+ # also was thinking about Dirent#open_padding. then i can more easily clean up the padding
184
+ # to be 0.chr
185
+ =begin
186
+ thoughts on fixes:
187
+ 1. reterminate any chain not ending in EOC.
188
+ 2. pass through all chain heads looking for collisions, and making sure nothing points to them
189
+ (ie they are really heads).
190
+ 3. we know the locations of the bbat data, and mbat data. ensure that there are placeholder blocks
191
+ in the bat for them.
192
+ this stuff will ensure reliability of input better. otherwise, its actually worth doing a repack
193
+ directly after read, to ensure the above is probably accounted for, before subsequent writes possibly
194
+ destroy things.
195
+ =end
196
# Serialize the in-memory state back to @io. In order, this:
# 1. flattens the dirent tree starting at @root and rewrites the directory
#    stream, zero padded to a whole number of big blocks,
# 2. rewrites the sbat stream,
# 3. releases the blocks previously marked BAT / META_BAT, claims blocks for
#    the bbat itself, marks them BAT and writes the bbat out,
# 4. writes the header followed by the 109 entry list of bbat blocks at the
#    start of @io.
#
# Raises NotImplementedError if the bbat needs more than the 109 blocks that
# can be listed in the header block (no META_BAT blocks are written).
def flush
  # recreate dirs from our tree, split into dirs and big and small files
  # (the root is temporarily typed :root for saving; reset to :dir further down)
  @root.type = :root
  @root.name = 'Root Entry'
  @root.first_block = @sb_file.first_block
  @root.size = @sb_file.size
  @dirents = @root.flatten

  # maybe i should move the block form up to RangesIO, and get it for free at all levels.
  # Dirent#open gets block form for free then
  io = RangesIOResizeable.new @bbat, @header.dirent_start
  io.truncate 0
  @dirents.each { |dirent| io.write dirent.save }
  # pad the directory stream up to the next block boundary with zero bytes
  padding = (io.size / @bbat.block_size.to_f).ceil * @bbat.block_size - io.size
  io.write 0.chr * padding
  @header.dirent_start = io.first_block
  io.close

  # similarly for the sbat data.
  io = RangesIOResizeable.new @bbat, @header.sbat_start
  io.truncate 0
  io.write @sbat.save
  @header.sbat_start = io.first_block
  @header.num_sbat = @bbat.chain(@header.sbat_start).length
  io.close

  # what follows will be slightly more complex for the bat fiddling.

  # create RangesIOResizeable hooked up to the bbat. use that to claim bbat blocks using
  # truncate. then when its time to write, convert that chain and some chunk of blocks at
  # the end, into META_BAT blocks. write out the chain, and those meta bat blocks, and its
  # done.
  # first give back every block currently reserved for the bat itself
  @bbat.table.map! do |b|
    b == AllocationTable::BAT || b == AllocationTable::META_BAT ?
      AllocationTable::AVAIL : b
  end
  io = RangesIOResizeable.new @bbat, AllocationTable::EOC

  # use crappy loop for now:
  # (claiming blocks for the bbat can itself grow the bbat, so repeat until
  # the claimed space is large enough for the saved table)
  while true
    bbat_data = @bbat.save
    #mbat_data = bbat_data.length / @bbat.block_size * 4
    mbat_chain = @bbat.chain io.first_block
    raise NotImplementedError, "don't handle writing out extra META_BAT blocks yet" if mbat_chain.length > 109
    # so we can ignore meta blocks in this calculation:
    break if io.size >= bbat_data.length # it shouldn't be bigger right?
    # this may grow the bbat, depending on existing available blocks
    io.truncate bbat_data.length
  end

  # now extract the info we want:
  ranges = io.ranges
  mbat_chain = @bbat.chain io.first_block
  io.close
  # the claimed blocks hold the bat, so they are marked BAT rather than chained
  mbat_chain.each { |b| @bbat.table[b] = AllocationTable::BAT }
  @header.num_bat = mbat_chain.length
  #p @bbat.truncated_table
  #p ranges
  #p mbat_chain
  # not resizeable!
  io = RangesIO.new @io, ranges
  io.write @bbat.save
  io.close
  # fill the unused slots of the 109 entry header list with AVAIL
  mbat_chain += [AllocationTable::AVAIL] * (109 - mbat_chain.length)
  @header.mbat_start = AllocationTable::EOC
  @header.num_mbat = 0

=begin
# Old save code. remove shortly

bbat_data = new_bbat.save
# must exist as linear chain stored in header.
@header.num_bat = (bbat_data.length / new_bbat.block_size.to_f).ceil
base = io.pos / new_bbat.block_size - 1
io.write bbat_data
# now that spanned a number of blocks:
mbat = (0...@header.num_bat).map { |i| i + base }
mbat += [AllocationTable::AVAIL] * (109 - mbat.length) if mbat.length < 109
header_mbat = mbat[0...109]
other_mbat_data = mbat[109..-1].pack 'L*'
@header.mbat_start = base + @header.num_bat
@header.num_mbat = (other_mbat_data.length / new_bbat.block_size.to_f).ceil
io.write other_mbat_data
=end

  @root.type = :dir

  # now seek back and write the header out
  @io.seek 0
  @io.write @header.save + mbat_chain.pack('L*')
  @io.flush
end
288
+
289
# Reset this object to the state of a freshly created, empty ole document:
# default header, empty allocation tables, a lone root dirent and an empty
# small block file. The underlying io is truncated to zero length.
def clear
  # initialize to equivalent of loading an empty ole document.
  Log.warn 'creating new ole storage object on non-writable io' if !@writeable
  @header = Header.new
  @bbat = AllocationTable::Big.new(self)

  fresh_root = Dirent.new(self, :dir)
  fresh_root.name = 'Root Entry'
  fresh_root.idx = 0
  fresh_root.children = []
  # size shouldn't display for non-files
  fresh_root.size = 0
  @root = fresh_root
  @dirents = [fresh_root]

  # the small allocation table picks up the small block file when it is
  # created, so the file has to be replaced first
  @sb_file && @sb_file.close
  @sb_file = RangesIOResizeable.new(@bbat, AllocationTable::EOC)
  @sbat = AllocationTable::Small.new(self)
  # throw everything else the hell away
  @io.truncate 0
end
307
+
308
+ # could be useful with mis-behaving ole documents. or to just clean them up.
309
# Rewrite the document from scratch, going through a temporary copy.
# could be useful with mis-behaving ole documents. or to just clean them up.
#
# +temp+ selects the backing of the temporary copy: :file (a Tempfile, the
# default) or :mem (a StringIO). Returns whatever #repack_using_io returns.
# Raises RuntimeError for any other value.
def repack temp=:file
  case temp
  when :file
    # the first argument of Tempfile.open is a basename, not a mode string.
    # the copy holds binary data, so make sure no newline translation happens.
    Tempfile.open 'ole-repack' do |io|
      io.binmode
      repack_using_io io
    end
  when :mem
    StringIO.open { |io| repack_using_io io }
  else
    raise "unknown temp backing #{temp.inspect}"
  end
end
316
+
317
# Copy the current contents of @io into +temp_io+, reset this document with
# #clear, then copy the dirent tree of the temporary document back into our
# (now empty) root.
def repack_using_io temp_io
  @io.rewind
  IO.copy @io, temp_io
  clear
  Storage.open(temp_io) do |snapshot|
    source_root = snapshot.root
    # Dirent.copy is handed a plain :dir as source root
    source_root.type = :dir
    Dirent.copy source_root, root
  end
end
326
+
327
# Pick the allocation table responsible for a stream of +size+ bytes: the
# small block table below Header#threshold, the big block table otherwise
# (note that a size equal to the threshold already uses the big table).
def bat_for_size size
  return @sbat if size < @header.threshold
  @bbat
end
331
+
332
# Short description showing the class, the underlying io and the root dirent.
def inspect
  '#<%s io=%s root=%s>' % [self.class, @io.inspect, @root.inspect]
end
335
+
336
+ # A class which wraps the ole header
337
# A class which wraps the ole header: the first Header::SIZE bytes of the
# file, packed / unpacked according to Header::PACK.
class Header < Struct.new(
  :magic, :clsid, :minor_ver, :major_ver, :byte_order, :b_shift, :s_shift,
  :reserved, :csectdir, :num_bat, :dirent_start, :transacting_signature, :threshold,
  :sbat_start, :num_sbat, :mbat_start, :num_mbat
)
  PACK = 'a8 a16 S2 a2 S2 a6 L3 a4 L5'
  SIZE = 0x4c
  # i have seen it pointed out that the first 4 bytes of hex,
  # 0xd0cf11e0, is supposed to spell out docfile. hmmm :)
  #
  # built with pack rather than a "\x.." literal: on rubies with string
  # encodings such a literal is tagged with the source encoding and never
  # compares equal to the binary strings produced by unpack.
  MAGIC = [0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1].pack('C*') # expected value of Header#magic
  # expected value of Header#byte_order. we only handle little endian
  BYTE_ORDER = [0xfe, 0xff].pack('C*')
  # what you get if creating new header from scratch.
  # AllocationTable::EOC isn't available yet. meh.
  EOC = 0xfffffffe
  DEFAULT = [
    MAGIC, 0.chr * 16, 59, 3, BYTE_ORDER, 9, 6,
    0.chr * 6, 0, 1, EOC, 0.chr * 4,
    4096, EOC, 0, EOC, 0
  ]

  # 2 basic initializations, from scratch, or from a data string.
  # from scratch will be geared towards creating a new ole object.
  # with no +values+ the DEFAULT ones are used. the result is validated,
  # see #validate! for what is raised.
  def initialize *values
    super(*(values.empty? ? DEFAULT : values))
    validate!
  end

  # Build a header from the raw header bytes in +str+.
  def self.load str
    Header.new(*str.unpack(PACK))
  end

  # Returns the header serialized to a SIZE byte string.
  def save
    to_a.pack PACK
  end

  # Check the header fields. Raises RuntimeError on a wrong signature or on
  # values this library can't work with, warns through Log for values that
  # are merely unexpected, and returns true otherwise.
  def validate!
    raise "OLE2 signature is invalid" unless magic == MAGIC
    if num_bat == 0 or # is that valid for a completely empty file?
       # not sure about this one. basically to do max possible bat given size of mbat
       num_bat > 109 && num_bat > 109 + num_mbat * (1 << b_shift - 2) or
       # shouldn't need to use the mbat as there is enough space in the header block
       num_bat < 109 && num_mbat != 0 or
       # given the size of the header is 76, if b_shift <= 6, blocks address the header.
       s_shift > b_shift or b_shift <= 6 or b_shift >= 31 or
       # we only handle little endian
       byte_order != BYTE_ORDER
      raise "not valid OLE2 structured storage file"
    end
    # relaxed this, due to test-msg/qwerty_[1-3]*.msg they all had
    # 3 for this value.
    # transacting_signature != "\x00" * 4 or
    if threshold != 4096 or
       # EOC here has the same value as AllocationTable::EOC
       num_mbat == 0 && mbat_start != EOC or
       reserved != "\x00" * 6
      Log.warn "may not be a valid OLE2 structured storage file"
    end
    true
  end
end
395
+
396
+ #
397
+ # +AllocationTable+'s hold the chains corresponding to files. Given
398
+ # an initial index, <tt>AllocationTable#chain</tt> follows the chain, returning
399
+ # the blocks that make up that file.
400
+ #
401
+ # There are 2 allocation tables, the bbat, and sbat, for big and small
402
+ # blocks respectively. The block chain should be loaded using either
403
+ # <tt>Storage#read_big_blocks</tt> or <tt>Storage#read_small_blocks</tt>
404
+ # as appropriate.
405
+ #
406
+ # Whether or not big or small blocks are used for a file depends on
407
+ # whether its size is over the <tt>Header#threshold</tt> level.
408
+ #
409
+ # An <tt>Ole::Storage</tt> document is serialized as a series of directory objects,
410
+ # which are stored in blocks throughout the file. The blocks are either
411
+ # big or small, and are accessed using the <tt>AllocationTable</tt>.
412
+ #
413
+ # The bbat allocation table's data is stored in the spare room in the header
414
+ # block, and in extra blocks throughout the file as referenced by the meta
415
+ # bat. That chain is linear, as there is no higher level table.
416
+ #
417
# Holds an allocation table (an array of block indices, where entry i names
# the block following block i in its chain) plus the operations to follow,
# read and resize chains. Big and Small below supply the block size and the
# io that the blocks live in.
class AllocationTable
  # a free block (I don't currently leave any blocks free), although I do pad out
  # the allocation table with AVAIL to the block size.
  AVAIL = 0xffffffff
  EOC = 0xfffffffe # end of a chain
  # these blocks correspond to the bat, and aren't part of a file, nor available.
  # (I don't currently output these)
  BAT = 0xfffffffd
  META_BAT = 0xfffffffc

  attr_reader :ole, :io, :table, :block_size

  # +ole+ is the owning storage object. the table starts out empty.
  def initialize ole
    @ole = ole
    @table = []
  end

  # Replace the table with the entries packed in the string +data+.
  def load data
    @table = data.unpack('L*')
  end

  # Returns a copy of the table without its trailing AVAIL entries. A table
  # consisting of nothing but AVAIL entries is returned in full.
  def truncated_table
    # this strips trailing AVAILs. come to think of it, this has the potential to break
    # bogus ole. if you terminate using AVAIL instead of EOC, like I did before. but that is
    # very broken. however, if a chain ends with AVAIL, it should probably be fixed to EOC
    # at load time.
    last_used = @table.rindex { |b| b != AVAIL }
    last_used ? @table[0..last_used] : @table.dup
  end

  # Returns the truncated table as a packed string, padded with AVAIL
  # entries to a multiple of the big block size.
  def save
    table = truncated_table #@table
    # pad it out some
    num = @ole.bbat.block_size / 4
    # do you really use AVAIL? they probably extend past end of file, and may shortly
    # be used for the bat. not really good.
    table += [AVAIL] * (num - (table.length % num)) if (table.length % num) != 0
    table.pack 'L*'
  end

  # Follow the chain beginning at block +start+ and return the array of
  # block indices that make it up ([] when +start+ is a terminator).
  #
  # Raises RuntimeError when the chain leaves the table or loops back on
  # itself. A chain ended by a special value other than EOC is only warned
  # about.
  #
  # rewriting this to be non-recursive. it broke on a large attachment
  # building up the chain, causing a stack error. need tail-call elimination...
  def chain start
    a = []
    idx = start
    until idx >= META_BAT
      # valid indices are 0...@table.length; the table has no entry at
      # @table.length itself, so that has to be rejected too
      raise "broken allocationtable chain" if idx < 0 || idx >= @table.length
      # a chain can't hold more blocks than the table has entries. if it
      # would, some block is being visited twice and we'd never terminate.
      raise "broken allocationtable chain" if a.length >= @table.length
      a << idx
      idx = @table[idx]
    end
    Log.warn "invalid chain terminator #{idx}" unless idx == EOC
    a
  end

  # Returns the byte ranges for +chain+, which is either a chain (Array) or
  # the first block of one, optionally truncated to +size+ bytes.
  def ranges chain, size=nil
    chain = self.chain(chain) unless Array === chain
    blocks_to_ranges chain, size
  end

  # Turn a chain (an array given by +chain+) of blocks, optionally
  # truncated to +size+, into an array of [offset, length] pairs describing
  # the stretches of bytes in the io that it belongs to:
  # 1. truncate the chain if required
  # 2. convert chain to ranges of the block size
  # 3. truncate final range if required
  def blocks_to_ranges chain, size=nil
    chain = chain[0...(size.to_f / block_size).ceil] if size
    ranges = chain.map { |i| [block_size * i, block_size] }
    ranges.last[1] -= (ranges.length * block_size - size) if ranges.last and size
    ranges
  end

  # quick shortcut. chain can be either a head (in which case the table is used to
  # turn it into a chain), or a chain. it is converted to ranges, then to rangesio.
  # its not resizeable or migrateable. it probably could be resizeable though, using
  # self as the bat. but what would the first_block be?
  #
  # with a block, the io is yielded and closed afterwards, and the block's
  # value returned. otherwise the io itself is returned.
  def open chain, size=nil
    io = RangesIO.new @io, ranges(chain, size)
    return io unless block_given?
    begin
      yield io
    ensure
      io.close
    end
  end

  # Returns the data of +chain+ (see #open), optionally limited to +size+.
  def read chain, size=nil
    open chain, size, &:read
  end

  # ----------------------

  # Returns the index of the first AVAIL entry, appending a new one to the
  # table if there is none.
  def get_free_block
    @table.each_index { |i| return i if @table[i] == AVAIL }
    @table.push AVAIL
    @table.length - 1
  end

  # Grow or shrink the chain starting at +first_block+ so that it can hold
  # +size+ bytes. Returns the (possibly changed) first block of the chain,
  # which is EOC when the chain ends up empty.
  def resize_chain first_block, size
    new_num_blocks = (size / block_size.to_f).ceil
    blocks = chain first_block
    old_num_blocks = blocks.length
    if new_num_blocks < old_num_blocks
      # de-allocate some of our old blocks. TODO maybe zero them out in the file???
      (new_num_blocks...old_num_blocks).each { |i| @table[blocks[i]] = AVAIL }
      # if we have a chain, terminate it and return head, otherwise return EOC
      if new_num_blocks > 0
        @table[blocks[new_num_blocks-1]] = EOC
        first_block
      else EOC
      end
    elsif new_num_blocks > old_num_blocks
      # need some more blocks.
      last_block = blocks.last
      (new_num_blocks - old_num_blocks).times do
        block = get_free_block
        # connect the chain. handle corner case of blocks being [] initially
        if last_block
          @table[last_block] = block
        else
          first_block = block
        end
        last_block = block
        # this is just to inhibit the problem where it gets picked as being a free block
        # again next time around.
        @table[last_block] = EOC
      end
      first_block
    else first_block
    end
  end

  # The big block table. Its blocks are 1 << Header#b_shift bytes and live
  # directly in the storage's io.
  class Big < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.b_shift
      @io = @ole.io
    end

    # Big blocks are kind of -1 based, in order to not clash with the header.
    # (+size+ is optional, as in the parent class.)
    def blocks_to_ranges blocks, size=nil
      super blocks.map { |b| b + 1 }, size
    end
  end

  # The small block table. Its blocks are 1 << Header#s_shift bytes and live
  # in the storage's small block file.
  class Small < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.s_shift
      @io = @ole.sb_file
    end
  end
end
575
+
576
+ # like normal RangesIO, but Ole::Storage specific. the ranges are backed by an
577
+ # AllocationTable, and can be resized. used for read/write to 2 streams:
578
+ # 1. serialized dirent data
579
+ # 2. sbat table data
580
+ # 3. all dirents but through RangesIOMigrateable below
581
+ #
582
+ # Note that all internal access to first_block is through accessors, as it is sometimes
583
+ # useful to redirect it.
584
# A RangesIO whose ranges come from a chain in an AllocationTable (+bat+),
# and which can therefore be resized through #truncate.
class RangesIOResizeable < RangesIO
  attr_reader :bat
  attr_accessor :first_block

  # +bat+ is the allocation table, +first_block+ the head of the chain to
  # expose, and +size+ (optional) the length in bytes to limit it to.
  def initialize bat, first_block, size=nil
    @bat = bat
    self.first_block = first_block
    super @bat.io, @bat.ranges(first_block, size)
  end

  # Resize the stream to +size+ bytes: the chain in the bat is resized,
  # first_block and the ranges are updated, the position is pulled back to
  # the new end if it was beyond it, and the underlying io is grown if the
  # ranges reach past its end.
  def truncate size
    # note that old_blocks is != @ranges.length necessarily. i'm planning to write a
    # merge_ranges function that merges sequential ranges into one as an optimization.
    self.first_block = @bat.resize_chain first_block, size
    @ranges = @bat.ranges first_block, size
    # clamp to the new size. @size still holds the old size at this point, so
    # using it here would leave the position beyond the end after shrinking.
    @pos = size if @pos > size

    # don't know if this is required, but we explicitly request our @io to grow if necessary
    # we never shrink it though. maybe this belongs in allocationtable, where smarter decisions
    # can be made.
    # maybe its ok to just seek out there later??
    max = @ranges.map { |pos, len| pos + len }.max || 0
    @io.truncate max if max > @io.size

    @size = size
  end
end
610
+
611
+ # like RangesIOResizeable, but Ole::Storage::Dirent specific. provides for migration
612
+ # between bats based on size, and updating the dirent, instead of the ole copy back
613
+ # on close.
614
# A RangesIOResizeable bound to a single dirent: the allocation table is
# picked from the dirent's size, first_block is read from and written to the
# dirent, and #truncate moves the data to the other allocation table when
# the new size calls for it.
class RangesIOMigrateable < RangesIOResizeable
  attr_reader :dirent

  # +dirent+ supplies the storage object (for choosing the bat), the first
  # block and the size of the stream.
  def initialize dirent
    @dirent = dirent
    super @dirent.ole.bat_for_size(@dirent.size), @dirent.first_block, @dirent.size
  end

  # Resize the stream to +size+ bytes, migrating to the bat appropriate for
  # the new size if that differs from the current one, and record the new
  # size in the dirent.
  def truncate size
    bat = @dirent.ole.bat_for_size size
    if bat != @bat
      # bat migration needed! we need to backup some data. the amount of data
      # should be <= @ole.header.threshold, so we can just hold it all in one buffer.
      # backup this
      pos = @pos
      @pos = 0
      # only as much as survives the resize needs to be kept
      keep = read [@size, size].min
      # this does a normal truncate to 0, removing our presence from the old bat, and
      # rewrite the dirent's first_block
      super 0
      @bat = bat
      # just change the underlying io from right under everyone :)
      @io = bat.io
      # important to do this now, before the write. as the below write will always
      # migrate us back to sbat! this will now allocate us +size+ in the new bat.
      super
      @pos = 0
      write keep
      # restore the position saved before the backup read
      @pos = pos
    else
      super
    end
    # now just update the file
    @dirent.size = size
  end

  # forward this to the dirent
  def first_block
    @dirent.first_block
  end

  # forward this to the dirent as well
  def first_block= val
    @dirent.first_block = val
  end
end
658
+
659
+ #
660
+ # A class which wraps an ole directory entry. Can be either a directory
661
+ # (<tt>Dirent#dir?</tt>) or a file (<tt>Dirent#file?</tt>)
662
+ #
663
+ # Most interaction with <tt>Ole::Storage</tt> is through this class.
664
+ # The 2 most important functions are <tt>Dirent#children</tt>, and
665
+ # <tt>Dirent#data</tt>.
666
+ #
667
+ # was considering separate classes for dirs and files. some methods/attrs only
668
+ # applicable to one or the other.
669
+ #
670
+ # Note that Dirent is still using a home grown Struct variant, with explicit
671
+ # MEMBERS etc. any reason for that still?
672
+ #
673
# A single directory entry: the SIZE byte record described by MEMBERS / PACK
# (kept in @values), plus the decoded name, type, times and the list of
# child dirents.
class Dirent
  MEMBERS = [
    :name_utf16, :name_len, :type_id, :colour, :prev, :next, :child,
    :clsid, :flags, # dirs only
    :create_time_str, :modify_time_str, # files only
    :first_block, :size, :reserved
  ]
  PACK = 'a64 S C C L3 a16 L a8 a8 L2 a4'
  SIZE = 128
  TYPE_MAP = {
    # this is temporary
    0 => :empty,
    1 => :dir,
    2 => :file,
    5 => :root
  }
  COLOUR_MAP = {
    0 => :red,
    1 => :black
  }
  # used in the next / prev / child stuff to show that the tree ends here.
  # also used for first_block for directory.
  EOT = 0xffffffff

  include Enumerable

  # Dirent's should be created in 1 of 2 ways, either Dirent.new ole, [:dir/:file/:root],
  # or Dirent.load '... dirent data ...'
  # its a bit clunky, but thats how it is at the moment. you can assign to type, but
  # shouldn't.

  # Position of this dirent in the flattened directory array.
  attr_accessor :idx
  # This returns all the children of this +Dirent+. It is filled in
  # when the tree structure is recreated.
  attr_accessor :children
  # (each accessor is declared exactly once, so that running with -w doesn't
  # warn about redefined methods)
  attr_accessor :name, :type
  attr_reader :ole, :create_time, :modify_time

  # Create a new, empty dirent of +type+ (:dir, :file or :root) belonging to
  # the storage object +ole+. Files get the current time as create and
  # modify time.
  def initialize ole, type
    @ole = ole
    # this isn't really good enough. need default values put in there.
    @values = [
      0.chr * 2, 2, 0, # will get overwritten
      1, EOT, EOT, EOT,
      0.chr * 16, 0, nil, nil,
      AllocationTable::EOC, 0, 0.chr * 4]
    # maybe check types here.
    @type = type
    @create_time = @modify_time = nil
    @children = []
    if file?
      @create_time = Time.now
      @modify_time = Time.now
    end
  end

  # Returns a dirent created from the SIZE byte record +str+.
  def self.load ole, str
    # load should function without the need for the initializer.
    dirent = Dirent.allocate
    dirent.load ole, str
    dirent
  end

  # Fill this dirent from the record +str+. Raises RuntimeError when the
  # record has a type id that isn't in TYPE_MAP.
  def load ole, str
    @ole = ole
    @values = str.unpack PACK
    @name = Types::FROM_UTF16.iconv name_utf16[0...name_len].sub(/\x00\x00$/, '')
    @type = TYPE_MAP[type_id] or raise "unknown type #{type_id.inspect}"
    if file?
      @create_time = Types.load_time create_time_str
      @modify_time = Types.load_time modify_time_str
    end
  end

  # only defined for files really. and the above children stuff is only for children.
  # maybe i should have some sort of File and Dir class, that subclass Dirents? a dirent
  # is just a data holder.
  # this can be used for write support if the underlying io object was opened for writing.
  # maybe take a mode string argument, and do truncation, append etc stuff.
  #
  # returns nil for anything but a file. with a block, the io is yielded and
  # closed afterwards, and the block's value is returned.
  def open
    return nil unless file?
    io = RangesIOMigrateable.new self
    return io unless block_given?
    begin
      yield io
    ensure
      io.close
    end
  end

  # Read up to +limit+ bytes (everything when nil) from the start of a file.
  # Returns nil for non-files, as #open does.
  def read limit=nil
    open { |io| io.read limit }
  end

  def dir?
    # to count root as a dir.
    type != :file
  end

  def file?
    type == :file
  end

  def time
    # time is nil for streams, otherwise try to parse either of the time pairse (not
    # sure of their meaning - created / modified?)
    #@time ||= file? ? nil : (Dirent.parse_time(secs1, days1) || Dirent.parse_time(secs2, days2))
    create_time || modify_time
  end

  # Iterate over the children (this is what Enumerable builds on).
  def each(&block)
    @children.each(&block)
  end

  # Look up a child, by position for an Integer, otherwise the first child
  # whose name matches +idx+ using ===.
  def [] idx
    return children[idx] if Integer === idx
    # path style look up.
    # maybe take another arg to allow creation? or leave that to the filesystem
    # add on.
    # not sure if '/' is a valid char in an Dirent#name, so no splitting etc at
    # this level.
    # also what about warning about multiple hits for the same name?
    children.find { |child| idx === child.name }
  end

  # solution for the above '/' thing for now.
  def / path
    self[path]
  end

  # Returns a multi line string drawing the tree below (and including) this
  # dirent, one #inspect per line.
  def to_tree
    if children and !children.empty?
      str = "- #{inspect}\n"
      children.each_with_index do |child, i|
        last = i == children.length - 1
        child.to_tree.split(/\n/).each_with_index do |line, j|
          str << " #{last ? (j == 0 ? "\\" : ' ') : '|'}#{line}\n"
        end
      end
      str
    else "- #{inspect}\n"
    end
  end

  # reader and writer for each of the raw record fields
  MEMBERS.each_with_index do |sym, i|
    define_method(sym) { @values[i] }
    define_method(sym.to_s + '=') { |val| @values[i] = val }
  end

  def to_a
    @values
  end

  # flattens the tree starting from here into +dirents+. note it modifies its argument.
  # every dirent gets its idx set to its position, and its child / prev / next
  # links set up to match.
  def flatten dirents=[]
    @idx = dirents.length
    dirents << self
    children.each { |child| child.flatten dirents }
    self.child = Dirent.flatten_helper children
    dirents
  end

  # i think making the tree structure optimized is actually more complex than this, and
  # requires some intelligent ordering of the children based on names, but as long as
  # it is valid its ok.
  # actually, i think its ok. gsf for example only outputs a singly-linked-list, where
  # prev is always EOT.
  #
  # links +children+ up as a binary tree through prev / next, and returns
  # the idx of the dirent at its top (EOT for no children).
  def self.flatten_helper children
    return EOT if children.empty?
    i = children.length / 2
    this = children[i]
    this.prev, this.next = [(0...i), (i+1..-1)].map { |r| flatten_helper children[r] }
    this.idx
  end

  # Update the raw record fields from name and type, and return the record
  # packed into a SIZE byte string. Raises RuntimeError for a type that
  # isn't in TYPE_MAP.
  def save
    tmp = Types::TO_UTF16.iconv(name)
    tmp = tmp[0, 62] if tmp.length > 62
    tmp += 0.chr * 2
    self.name_len = tmp.length
    self.name_utf16 = tmp + 0.chr * (64 - tmp.length)
    pair = TYPE_MAP.find { |_id, sym| sym == @type } or raise "unknown type #{type.inspect}"
    self.type_id = pair.first
    # for the case of files, it is assumed that that was handled already
    # note not dir?, so as not to override root's first_block
    self.first_block = Dirent::EOT if type == :dir
    # times can't be serialized yet. what was loaded is written back as is,
    # and dirents created from scratch (which have no time strings) get zeroed
    # ones, as pack can't do anything with nil.
    self.create_time_str ||= 0.chr * 8
    self.modify_time_str ||= 0.chr * 8
    @values.pack PACK
  end

  # Short description with the name and, for files, size, time and the
  # first few bytes of data.
  def inspect
    str = "#<Dirent:#{name.inspect}"
    # perhaps i should remove the data snippet. its not that useful anymore.
    if file?
      tmp = read 9
      data = tmp.length == 9 ? tmp[0, 5] + '...' : tmp
      str << " size=#{size}" +
        "#{time ? ' time=' + time.to_s.inspect : nil}" +
        " data=#{data.inspect}"
    else
      # there is some dir specific stuff. like clsid, flags.
    end
    str + '>'
  end

  # --------
  # and for creation of a dirent. don't like the name. is it a file or a directory?
  # assign to type later? io will be empty.
  #
  # appends a new dirent of +type+ to the children, yields it if a block is
  # given, and returns it.
  def new_child type
    child = Dirent.new ole, type
    children << child
    yield child if block_given?
    child
  end

  # Remove +child+ from this dirent, and release the blocks of its data.
  # Raises RuntimeError if it isn't one of our children.
  def delete child
    # remove from our child array, so that on reflatten and re-creation of @dirents, it will be gone
    raise "#{child.inspect} not a child of #{self.inspect}" unless @children.delete child
    # free our blocks
    child.open { |io| io.truncate 0 }
  end

  # copies the contents of src to dst. must be the same type. this will throw an
  # error on copying to root. maybe this will recurse too much for big documents??
  #
  # raises RuntimeError ('differing types') when one of the two is a file and
  # the other one isn't.
  def self.copy src, dst
    # check both directions: a dir copied onto a file would otherwise
    # silently hang children off a file dirent.
    raise 'differing types' if src.file? != dst.file?
    dst.name = src.name
    if src.dir?
      src.children.each do |src_child|
        dst.new_child(src_child.type) { |dst_child| Dirent.copy src_child, dst_child }
      end
    else
      src.open do |src_io|
        dst.open { |dst_io| IO.copy src_io, dst_io }
      end
    end
  end
end
919
+ end
920
+ end
921
+
922
# When run as a script, print the dirent tree of the ole file named by the
# first argument.
if $0 == __FILE__
  # without an argument Storage.open would be handed nil and fail with an
  # unhelpful NoMethodError, so say what is expected instead.
  abort "usage: #{File.basename $0} <ole-file>" if ARGV.empty?
  puts Ole::Storage.open(ARGV[0]) { |ole| ole.root.to_tree }
end
925
+