ruby-ole 1.2.6 → 1.2.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,948 @@
1
+ require 'tempfile'
2
+
3
+ require 'ole/base'
4
+ require 'ole/types'
5
+ require 'ole/ranges_io'
6
+
7
+ module Ole # :nodoc:
8
+ #
9
+ # = Introduction
10
+ #
11
+ # <tt>Ole::Storage</tt> is a class intended to abstract away details of the
12
+ # access to OLE2 structured storage files, such as those produced by
13
+ # Microsoft Office, eg *.doc, *.msg etc.
14
+ #
15
+ # = Usage
16
+ #
17
+ # Usage should be fairly straight forward:
18
+ #
19
+ # # get the parent ole storage object
20
+ # ole = Ole::Storage.open 'myfile.msg', 'r+'
21
+ # # => #<Ole::Storage io=#<File:myfile.msg> root=#<Dirent:"Root Entry">>
22
+ # # read some data
23
+ # ole.root[1].read 4
24
+ # # => "\001\000\376\377"
25
+ # # get the top level root object and output a tree structure for
26
+ # # debugging
27
+ # puts ole.root.to_tree
28
+ # # =>
29
+ # - #<Dirent:"Root Entry" size=3840 time="2006-11-03T00:52:53Z">
30
+ # |- #<Dirent:"__nameid_version1.0" size=0 time="2006-11-03T00:52:53Z">
31
+ # | |- #<Dirent:"__substg1.0_00020102" size=16 data="CCAGAAAAAADAAA...">
32
+ # ...
33
+ # |- #<Dirent:"__substg1.0_8002001E" size=4 data="MTEuMA==">
34
+ # |- #<Dirent:"__properties_version1.0" size=800 data="AAAAAAAAAAABAA...">
35
+ # \- #<Dirent:"__recip_version1.0_#00000000" size=0 time="2006-11-03T00:52:53Z">
36
+ # |- #<Dirent:"__substg1.0_0FF60102" size=4 data="AAAAAA==">
37
+ # ...
38
+ # # write some data, and finish up (note that open is 'r+', so this overwrites
39
+ # # but doesn't truncate)
40
+ # ole.root["\001CompObj"].open { |f| f.write "blah blah" }
41
+ # ole.close
42
+ #
43
+ # = Thanks
44
+ #
45
+ # * The code contained in this project was initially based on chicago's libole
46
+ # (source available at http://prdownloads.sf.net/chicago/ole.tgz).
47
+ #
48
+ # * It was later augmented with some corrections by inspecting pole, and (purely
49
+ # for header definitions) gsf.
50
+ #
51
+ # * The property set parsing code came from the apache java project POIFS.
52
+ #
53
+ # * The excellent idea for using a pseudo file system style interface by providing
54
+ # #file and #dir methods which mimic File and Dir, was borrowed (along with almost
55
+ # unchanged tests!) from Thomas Sondergaard's rubyzip.
56
+ #
57
+ # = TODO
58
+ #
59
+ # * the custom header cruft for Header and Dirent needs some love.
60
+ # * i have a number of classes doing load/save combos: Header, AllocationTable, Dirent,
61
+ # and, in a manner of speaking, but arguably different, Storage itself.
62
+ # they have differing api's which would be nice to rethink.
63
+ # AllocationTable::Big must be created aot now, as it is used for all subsequent reads.
64
+ #
65
+ class Storage
66
+ # thrown for any bogus OLE file errors.
67
+ class FormatError < StandardError # :nodoc:
68
+ end
69
+
70
+ VERSION = '1.2.7'
71
+
72
+ # options used at creation time
73
+ attr_reader :params
74
+ # The top of the ole tree structure
75
+ attr_reader :root
76
+ # The tree structure in its original flattened form. only valid after #load, or #flush.
77
+ attr_reader :dirents
78
+ # The underlying io object to/from which the ole object is serialized, whether we
79
+ # should close it, and whether it is writeable
80
+ attr_reader :io, :close_parent, :writeable
81
+ # Low level internals, you probably shouldn't need to mess with these
82
+ attr_reader :header, :bbat, :sbat, :sb_file
83
+
84
# Creates a storage object on top of +arg+.
#
# +arg+ should be either a filename (String) or an +IO+ object, and needs to be
# seekable. A filename is opened here (mode defaults to 'rb') and is closed
# again by #close; an io object is left for the caller to close.
# +mode+ may only be given with a filename. A Hash passed in its place is taken
# as +params+, which are merged over <tt>:update_timestamps => true</tt>.
#
# Raises ArgumentError if a mode string is given together with an io object.
#
# maybe include an option hash, and allow :close_parent => true, to be more general.
def initialize arg, mode=nil, params={}
  params, mode = mode, nil if Hash === mode
  @params = {:update_timestamps => true}.merge(params)

  # get the io object
  if String === arg
    mode ||= 'rb'
    # File.open, not Kernel#open: the latter treats a name starting with "|"
    # as a command to run, which is never wanted for a document path.
    @io = File.open arg, mode
    @close_parent = true
  else
    raise ArgumentError, 'unable to specify mode string with io object' if mode
    @io = arg
    @close_parent = false
  end

  # do we have this file opened for writing? with a mode string we parse it,
  # otherwise we probe by flushing, which raises IOError on a read-only io.
  @writeable = begin
    if mode
      IO::Mode.new(mode).writeable?
    else
      @io.flush
      true
    end
  rescue IOError
    false
  end

  # silence undefined warning in clear
  @sb_file = nil

  # if the io object has data, we should load it, otherwise start afresh.
  # this should be based on the mode string rather.
  ready = false
  begin
    @io.size > 0 ? load : clear
    ready = true
  ensure
    # don't leak the file we opened ourselves when the document is rejected
    @io.close if !ready && @close_parent
  end
end
120
+
121
# Builds a storage via +new+. Without a block the storage is returned; with a
# block it is yielded, closed afterwards (even if the block raises), and the
# block's value is returned.
def self.open arg, mode=nil, params={}
  storage = new arg, mode, params
  return storage unless block_given?
  begin
    yield storage
  ensure
    storage.close
  end
end
130
+
131
# load document from file: parses the header, the big block allocation table,
# the directory entries (rebuilt into a tree under #root) and the small block
# allocation table.
#
# Raises FormatError for a truncated header, for a directory link that points
# outside the directory stream, and for a directory entry reachable twice.
#
# TODO: implement various allocationtable checks, maybe as a AllocationTable#fsck function :)
#
# 1. reterminate any chain not ending in EOC.
#    compare file size with actually allocated blocks per file.
# 2. pass through all chain heads looking for collisions, and making sure nothing points to them
#    (ie they are really heads). in both sbat and mbat
# 3. we know the locations of the bbat data, and mbat data. ensure that there are placeholder blocks
#    in the bat for them.
# 4. maybe a check of excess data. if there is data outside the bbat.truncate.length + 1 * block_size,
#    (eg what is used for truncate in #flush), then maybe add some sort of message about that. it
#    will be automatically thrown away at close time.
def load
  # we always read 512 for the header block. if the block size ends up being different,
  # what happens to the 109 fat entries. are there more/less entries?
  @io.rewind
  header_block = @io.read 512
  unless header_block and header_block.length >= Header::SIZE
    raise FormatError, "OLE2 header is truncated"
  end
  @header = Header.new header_block

  # create an empty bbat.
  @bbat = AllocationTable::Big.new self
  mbat_blocks = (0...@header.num_mbat).map { |i| i + @header.mbat_start }
  bbat_chain = (header_block[Header::SIZE..-1] + @bbat.read(mbat_blocks)).unpack 'V*'
  # am i using num_bat in the right way?
  @bbat.load @bbat.read(bbat_chain[0, @header.num_bat])

  # get block chain for directories, read it, then split it into chunks and load the
  # directory entries. empty entries are kept for now: prev / next / child are
  # positions in this flat list, so dropping entries first would shift them.
  @dirents = @bbat.read(@header.dirent_start).scan(/.{#{Dirent::SIZE}}/mo).
    map { |str| Dirent.new self, str }

  # now reorder from flat into a tree
  # links are stored in some kind of balanced binary tree
  # check that everything is visited at least, and at most once
  # similarly with the blocks of the file.
  # was thinking of moving this to Dirent.to_tree instead.
  class << @dirents
    def to_tree idx=0
      return [] if idx == Dirent::EOT
      d = self[idx] or raise FormatError, "directory index #{idx} out of range"
      raise FormatError, "directory #{d.inspect} used twice" if d.idx
      # mark before descending, so that a loop in the links is reported as
      # "used twice" rather than recursing until the stack runs out
      d.idx = idx
      d.children = to_tree d.child
      to_tree(d.prev) + [d] + to_tree(d.next)
    end
  end

  @root = @dirents.to_tree.first
  # only now is it safe to forget the empty slots
  @dirents.reject! { |d| d.type_id == 0 }
  Log.warn "root name was #{@root.name.inspect}" unless @root.name == 'Root Entry'
  unused = @dirents.reject(&:idx).length
  Log.warn "#{unused} unused directories" if unused > 0

  # FIXME i don't currently use @header.num_sbat which i should
  # hmm. nor do i write it. it means what exactly again?
  # which mode to use here?
  @sb_file = RangesIOResizeable.new @bbat, :first_block => @root.first_block, :size => @root.size
  @sbat = AllocationTable::Small.new self
  @sbat.load @bbat.read(@header.sbat_start)
end
191
+
192
# Finishes with the storage: closes the small block file, writes the meta data
# with #flush when the storage is writeable, and closes the underlying io if
# this object opened it (see #initialize). Returns nil.
def close
  @sb_file.close
  flush if @writeable
  nil
ensure
  # runs even when #flush raises, so a failed save cannot leak the file handle
  @io.close if @close_parent
end
197
+
198
# the flush method is the main "save" method. all file contents are always
# written directly to the file by the RangesIO objects, all this method does
# is write out all the file meta data - dirents, allocation tables, file header
# etc.
#
# maybe add an option to zero the padding, and any remaining avail blocks in the
# allocation table.
#
# TODO: long and overly complex. simplify and test better. eg, perhaps move serialization
# of bbat to AllocationTable::Big.
def flush
  # update root dirent, and flatten dirent tree
  @root.name = 'Root Entry'
  @root.first_block = @sb_file.first_block
  @root.size = @sb_file.size
  @dirents = @root.flatten

  # serialize the dirents using the bbat, padded out to a whole number of big blocks
  RangesIOResizeable.open @bbat, 'w', :first_block => @header.dirent_start do |io|
    @dirents.each { |dirent| io.write dirent.to_s }
    padding = (io.size / @bbat.block_size.to_f).ceil * @bbat.block_size - io.size
    io.write 0.chr * padding
    @header.dirent_start = io.first_block
  end

  # serialize the sbat
  # perhaps the blocks used by the sbat should be marked with BAT?
  RangesIOResizeable.open @bbat, 'w', :first_block => @header.sbat_start do |io|
    io.write @sbat.to_s
    @header.sbat_start = io.first_block
    @header.num_sbat = @bbat.chain(@header.sbat_start).length
  end

  # create RangesIOResizeable hooked up to the bbat. use that to claim bbat blocks using
  # truncate. then when its time to write, convert that chain and some chunk of blocks at
  # the end, into META_BAT blocks. write out the chain, and those meta bat blocks, and its
  # done.
  # this is perhaps not good, as we reclaim all bat blocks here, which
  # may include the sbat we just wrote. FIXME
  @bbat.map! do |b|
    b == AllocationTable::BAT || b == AllocationTable::META_BAT ? AllocationTable::AVAIL : b
  end

  # currently we use a loop. this could be better, but basically,
  # the act of writing out the bat, itself requires blocks which get
  # recorded in the bat.
  #
  # i'm sure that there'd be some simpler closed form solution to this. solve
  # recursive func:
  #
  #   num_mbat_blocks = ceil(max((mbat_len - 109) * 4 / block_size, 0))
  #   bbat_len = initial_bbat_len + num_mbat_blocks
  #   mbat_len = ceil(bbat_len * 4 / block_size)
  #
  # the actual bbat allocation table is itself stored throughout the file, and that chain
  # is stored in the initial blocks, and the mbat blocks.
  num_mbat_blocks = 0
  io = RangesIOResizeable.new @bbat, 'w', :first_block => AllocationTable::EOC
  # truncate now, so that we can simplify size calcs - the mbat blocks will be appended in a
  # contiguous chunk at the end. the underlying io is cut back to match (header block plus
  # one block per remaining bbat entry), so freed trailing blocks actually shrink the file.
  @bbat.truncate!
  @io.truncate @bbat.block_size * (@bbat.length + 1)
  loop do
    # get total bbat size. equivalent to @bbat.to_s.length, but for the factoring in of
    # the mbat blocks. we can't just add the mbat blocks directly to the bbat, as as this iteration
    # progresses, more blocks may be needed for the bat itself (if there are no more gaps), and the
    # mbat must remain contiguous.
    bbat_data_len = ((@bbat.length + num_mbat_blocks) * 4 / @bbat.block_size.to_f).ceil * @bbat.block_size
    # now storing the excess mbat blocks also increases the size of the bbat:
    new_num_mbat_blocks = ([bbat_data_len / @bbat.block_size - 109, 0].max * 4 / @bbat.block_size.to_f).ceil
    if new_num_mbat_blocks != num_mbat_blocks
      # need more space for the mbat.
      num_mbat_blocks = new_num_mbat_blocks
    elsif io.size != bbat_data_len
      # need more space for the bat
      # this may grow the bbat, depending on existing available blocks
      io.truncate bbat_data_len
    else
      break
    end
  end

  # now extract the info we want:
  ranges = io.ranges
  bbat_chain = @bbat.chain io.first_block
  # the extra mbat data is a set of contiguous blocks at the end
  io.close
  bbat_chain.each { |b| @bbat[b] = AllocationTable::BAT }
  # tack on the mbat stuff
  @header.mbat_start = @bbat.length # need to record this here before tacking on the mbat
  @header.num_bat = bbat_chain.length
  num_mbat_blocks.times { @bbat << AllocationTable::META_BAT }

  # now finally write the bbat, using a not resizable io.
  # the mode here will be 'r', which allows write atm.
  RangesIO.open(@io, :ranges => ranges) { |f| f.write @bbat.to_s }

  # this is the mbat. pad it out.
  bbat_chain += [AllocationTable::AVAIL] * [109 - bbat_chain.length, 0].max
  @header.num_mbat = num_mbat_blocks
  if num_mbat_blocks == 0
    @header.mbat_start = AllocationTable::EOC
  else
    # write out the mbat blocks now. first of all, where are they going to be?
    mbat_data = bbat_chain[109..-1]
    q = @bbat.block_size / 4
    mbat_data += [AllocationTable::AVAIL] * ((mbat_data.length / q.to_f).ceil * q - mbat_data.length)
    ranges = @bbat.ranges((0...num_mbat_blocks).map { |i| @header.mbat_start + i })
    RangesIO.open(@io, :ranges => ranges) { |f| f.write mbat_data.pack('V*') }
  end

  # now seek back and write the header out, followed by the first 109 bat entries
  @io.seek 0
  @io.write @header.to_s + bbat_chain[0, 109].pack('V*')
  @io.flush
end
318
+
319
# Resets this object to the equivalent of a freshly loaded, empty ole document:
# new header, empty allocation tables, a lone root dirent, and an empty io.
def clear
  Log.warn 'creating new ole storage object on non-writable io' unless @writeable
  @header = Header.new
  @bbat = AllocationTable::Big.new self
  @root = Dirent.new(self, :type => :root, :name => 'Root Entry')
  @root.idx = 0
  @dirents = [@root]
  # drop any previous small block file before replacing it
  @sb_file.close unless @sb_file.nil?
  @sb_file = RangesIOResizeable.new(@bbat, :first_block => AllocationTable::EOC)
  @sbat = AllocationTable::Small.new self
  # throw everything else the hell away
  @io.truncate 0
end
333
+
334
# could be useful with mis-behaving ole documents. or to just clean them up.
#
# +temp+ picks the scratch space handed to #repack_using_io: <tt>:file</tt>
# (a binmode Tempfile, the default) or <tt>:mem</tt> (a StringIO). Anything
# else raises ArgumentError.
def repack temp=:file
  if :file === temp
    Tempfile.open 'ole-repack' do |scratch|
      scratch.binmode
      repack_using_io scratch
    end
  elsif :mem === temp
    StringIO.open(&method(:repack_using_io))
  else
    raise ArgumentError, "unknown temp backing #{temp.inspect}"
  end
end
346
+
347
# Copies the whole of @io into +temp_io+, clears this storage, then opens the
# copy as a second storage and copies its tree back in under #root.
def repack_using_io temp_io
  @io.rewind
  IO.copy @io, temp_io
  clear
  Storage.open(temp_io, nil, @params) { |temp_ole| Dirent.copy temp_ole.root, root }
end
356
+
357
# Picks the allocation table for a stream of +size+ bytes: the bbat from the
# header's threshold upwards (note >=, not >), the sbat below it.
def bat_for_size size
  if size >= @header.threshold
    @bbat
  else
    @sbat
  end
end
361
+
362
# Short description naming the class, the underlying io and the root dirent.
def inspect
  '#<%s io=%s root=%s>' % [self.class, @io.inspect, @root.inspect]
end
365
+
366
#
# A class which wraps the ole header
#
# Header.new can be both used to load from a string, or to create from
# defaults. Serialization is accomplished with the #to_s method.
#
class Header < Struct.new(
    :magic, :clsid, :minor_ver, :major_ver, :byte_order, :b_shift, :s_shift,
    :reserved, :csectdir, :num_bat, :dirent_start, :transacting_signature, :threshold,
    :sbat_start, :num_sbat, :mbat_start, :num_mbat
  )
  PACK = 'a8 a16 v2 a2 v2 a6 V3 a4 V5'
  SIZE = 0x4c
  # i have seen it pointed out that the first 4 bytes of hex,
  # 0xd0cf11e0, is supposed to spell out docfile. hmmm :)
  #
  # expected value of Header#magic. built with pack so that it is a binary
  # string whatever the source encoding is: a "\xd0..." literal takes the
  # source encoding, and then never compares equal to the binary string that
  # unpack produces.
  MAGIC = [0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1].pack('C*')
  # the only byte order marker we accept (little endian); binary for the same reason
  BYTE_ORDER = [0xfe, 0xff].pack('C*')
  # what you get if creating new header from scratch.
  # AllocationTable::EOC isn't available yet. meh.
  EOC = 0xfffffffe
  DEFAULT = [
    MAGIC, 0.chr * 16, 59, 3, BYTE_ORDER, 9, 6,
    0.chr * 6, 0, 1, EOC, 0.chr * 4,
    4096, EOC, 0, EOC, 0
  ]

  # +values+ is either a packed header string, or an array of field values in
  # member order (the defaults if omitted). The result is checked with #validate!.
  def initialize values=DEFAULT
    values = values.unpack(PACK) if String === values
    super(*values)
    validate!
  end

  # Serializes the fields back into the packed SIZE byte form.
  def to_s
    to_a.pack PACK
  end

  # Raises FormatError if the signature or the basic geometry is unusable, logs
  # a warning for values that are merely unexpected, and returns true otherwise.
  def validate!
    raise FormatError, "OLE2 signature is invalid" unless magic == MAGIC
    if num_bat == 0 || # is that valid for a completely empty file?
       # not sure about this one. basically to do max possible bat given size of mbat
       (num_bat > 109 && num_bat > 109 + num_mbat * (1 << (b_shift - 2))) ||
       # shouldn't need to use the mbat as there is enough space in the header block
       (num_bat < 109 && num_mbat != 0) ||
       # given the size of the header is 76, if b_shift <= 6, blocks address the header.
       s_shift > b_shift || b_shift <= 6 || b_shift >= 31 ||
       # we only handle little endian
       byte_order != BYTE_ORDER
      raise FormatError, "not valid OLE2 structured storage file"
    end
    # relaxed this, due to test-msg/qwerty_[1-3]*.msg they all had
    # 3 for this value.
    # transacting_signature != "\x00" * 4 or
    if threshold != 4096 ||
       (num_mbat == 0 && mbat_start != EOC) ||
       reserved != "\x00" * 6
      Log.warn "may not be a valid OLE2 structured storage file"
    end
    true
  end
end
425
+
426
+ #
427
+ # +AllocationTable+'s hold the chains corresponding to files. Given
428
+ # an initial index, <tt>AllocationTable#chain</tt> follows the chain, returning
429
+ # the blocks that make up that file.
430
+ #
431
+ # There are 2 allocation tables, the bbat, and sbat, for big and small
432
+ # blocks respectively. The block chain should be loaded using either
433
+ # <tt>Storage#read_big_blocks</tt> or <tt>Storage#read_small_blocks</tt>
434
+ # as appropriate.
435
+ #
436
+ # Whether or not big or small blocks are used for a file depends on
437
+ # whether its size is over the <tt>Header#threshold</tt> level.
438
+ #
439
+ # An <tt>Ole::Storage</tt> document is serialized as a series of directory objects,
440
+ # which are stored in blocks throughout the file. The blocks are either
441
+ # big or small, and are accessed using the <tt>AllocationTable</tt>.
442
+ #
443
+ # The bbat allocation table's data is stored in the spare room in the header
444
+ # block, and in extra blocks throughout the file as referenced by the meta
445
+ # bat. That chain is linear, as there is no higher level table.
446
+ #
447
+ # AllocationTable.new is used to create an empty table. It can parse a string
448
+ # with the #load method. Serialization is accomplished with the #to_s method.
449
+ #
450
+ class AllocationTable < Array
451
+ # a free block (I don't currently leave any blocks free), although I do pad out
452
+ # the allocation table with AVAIL to the block size.
453
+ AVAIL = 0xffffffff
454
+ EOC = 0xfffffffe # end of a chain
455
+ # these blocks are used for storing the allocation table chains
456
+ BAT = 0xfffffffd
457
+ META_BAT = 0xfffffffc
458
+
459
+ attr_reader :ole, :io, :block_size
460
# Starts out as an empty table belonging to +ole+. @sparse starts true, which
# makes #free_block search for an AVAIL slot before it appends one.
def initialize ole
  super()
  @ole = ole
  @sparse = true
end
465
+
466
# Replaces the table contents with the 32 bit little endian entries unpacked
# from the string +data+.
def load data
  entries = data.unpack 'V*'
  replace entries
end
469
+
470
# Returns a copy of the table with trailing AVAILs stripped.
#
# come to think of it, this has the potential to break bogus ole, if you
# terminate using AVAIL instead of EOC. but that is very broken. however, if a
# chain ends with AVAIL, it should probably be fixed to EOC at load time.
def truncate
  last_used = rindex { |entry| entry != AVAIL }
  # a table holding nothing but AVAIL (or nothing at all) comes back unshortened
  first(last_used ? last_used + 1 : length)
end
479
+
480
# In-place form of #truncate.
def truncate!
  shortened = truncate
  replace shortened
end
483
+
484
# Serializes the truncated table as 32 bit little endian entries, padded with
# AVAIL up to a whole number of big blocks.
def to_s
  entries = truncate
  per_block = @ole.bbat.block_size / 4
  leftover = entries.length % per_block
  # do you really use AVAIL? they probably extend past end of file, and may shortly
  # be used for the bat. not really good.
  entries += [AVAIL] * (per_block - leftover) unless leftover == 0
  entries.pack 'V*'
end
493
+
494
+ # rewrote this to be non-recursive as it broke on a large attachment
495
+ # chain with a stack error
496
+ def chain idx
497
+ a = []
498
+ until idx >= META_BAT
499
+ raise FormatError, "broken allocationtable chain" if idx < 0 || idx > length
500
+ a << idx
501
+ idx = self[idx]
502
+ end
503
+ Log.warn "invalid chain terminator #{idx}" unless idx == EOC
504
+ a
505
+ end
506
+
507
# Turn a chain (an array given by +chain+) of blocks (optionally
# truncated to +size+) into an array of arrays describing the stretches of
# bytes in the file that it belongs to. Each stretch is a [position, length] pair.
#
# The blocks are Big or Small blocks depending on the table type.
def blocks_to_ranges chain, size=nil
  # only as many blocks as +size+ bytes need
  wanted = size ? chain[0...(size.to_f / block_size).ceil] : chain
  spans = wanted.map { |blk| [block_size * blk, block_size] }
  # adjust the final stretch so the lengths add up to +size+
  if size and !spans.empty?
    spans.last[1] -= spans.length * block_size - size
  end
  spans
end
521
+
522
# Like #blocks_to_ranges, but +chain+ may also be a chain head, in which case
# it is expanded with #chain first.
def ranges chain, size=nil
  blocks = Array === chain ? chain : self.chain(chain)
  blocks_to_ranges blocks, size
end
526
+
527
# quick shortcut. chain can be either a head (in which case the table is used to
# turn it into a chain), or a chain. it is converted to ranges, then handed to
# RangesIO.open on this table's io together with the block.
def open chain, size=nil, &block
  extents = ranges chain, size
  RangesIO.open(@io, :ranges => extents, &block)
end
532
+
533
# Opens +chain+ (see #open) and returns everything read from it.
def read chain, size=nil
  open(chain, size) { |stream| stream.read }
end
536
+
537
# catch any method that may add an AVAIL somewhere in the middle, thus invalidating
# the @sparse speedup for free_block. spelled out as plain definitions (rather than
# generated in a loop) because each one needs a bare super.
# FIXME
def map!(*args, &block)
  @sparse = true
  super
end

def collect!(*args, &block)
  @sparse = true
  super
end
549
+
550
# Element assignment that re-arms the @sparse search in #free_block whenever an
# AVAIL is stored.
def []= idx, val
  if val == AVAIL
    @sparse = true
  end
  super
end
554
+
555
# Returns the index of an AVAIL entry. While @sparse is set the table is
# searched first; if nothing is found (or @sparse is off) an AVAIL is appended,
# @sparse is switched off, and the new last index is returned.
def free_block
  if @sparse
    hole = index AVAIL
    return hole if hole
  end
  @sparse = false
  push AVAIL
  length - 1
end
563
+
564
# Grows or shrinks the chain +blocks+ (an array of block indexes, modified in
# place) so that it can hold +size+ bytes, updating the table entries to match.
# Returns +blocks+.
def resize_chain blocks, size
  wanted = (size / block_size.to_f).ceil
  have = blocks.length
  if wanted > have
    # need some more blocks.
    tail = blocks.last
    (wanted - have).times do
      fresh = free_block
      # connect the chain. handle corner case of blocks being [] initially
      self[tail] = fresh if tail
      self[fresh] = EOC
      blocks << fresh
      tail = fresh
    end
  elsif wanted < have
    # de-allocate some of our old blocks. TODO maybe zero them out in the file???
    blocks[wanted..-1].each { |blk| self[blk] = AVAIL }
    self[blocks[wanted - 1]] = EOC if wanted > 0
    blocks.slice! wanted..-1
  end
  blocks
end
588
+
589
# The big block allocation table. Block size comes from the header's b_shift,
# and the blocks live directly in the storage's io.
class Big < AllocationTable
  def initialize(*args)
    super
    @block_size = 1 << @ole.header.b_shift
    @io = @ole.io
  end

  # Big blocks are kind of -1 based, in order to not clash with the header.
  # +size+ is optional, as it is in AllocationTable#blocks_to_ranges.
  def blocks_to_ranges blocks, size=nil
    super blocks.map { |b| b + 1 }, size
  end
end
601
+
602
# The small block allocation table. Block size comes from the header's s_shift,
# and the blocks live inside the storage's small block file.
class Small < AllocationTable
  def initialize(*args)
    super(*args)
    @io = @ole.sb_file
    @block_size = 1 << @ole.header.s_shift
  end
end
609
+ end
610
+
611
+ # like normal RangesIO, but Ole::Storage specific. the ranges are backed by an
612
+ # AllocationTable, and can be resized. used for read/write to 3 kinds of stream:
613
+ # 1. serialized dirent data
614
+ # 2. sbat table data
615
+ # 3. all dirents but through RangesIOMigrateable below
616
+ #
617
+ # Note that all internal access to first_block is through accessors, as it is sometimes
618
+ # useful to redirect it.
619
+ class RangesIOResizeable < RangesIO
620
+ attr_reader :bat
621
+ attr_accessor :first_block
622
# +bat+ is the allocation table backing the stream. +params+ must contain
# :first_block (the head of the chain) and may contain :size. A Hash given in
# place of +mode+ is taken as +params+, with mode 'r'.
#
# Raises ArgumentError if :first_block is missing.
def initialize bat, mode='r', params={}
  if Hash === mode
    params = mode
    mode = 'r'
  end
  head = params[:first_block]
  raise ArgumentError, 'must specify first_block' unless head
  @bat = bat
  self.first_block = head
  # we now cache the blocks chain, for faster resizing.
  @blocks = @bat.chain head
  extents = @bat.ranges @blocks, params[:size]
  super @bat.io, mode, :ranges => extents
end
632
+
633
# Resizes the stream to +size+ bytes: the block chain is resized in the
# allocation table, the ranges and first_block are refreshed, the position is
# pulled back if it now lies beyond the end, and the backing io is grown if the
# ranges reach past its current end. Returns +size+.
def truncate size
  # note that old_blocks is != @ranges.length necessarily. i'm planning to write a
  # merge_ranges function that merges sequential ranges into one as an optimization.
  @bat.resize_chain @blocks, size
  @ranges = @bat.ranges @blocks, size
  # clamp to the new size (@size still holds the old one at this point)
  @pos = size if @pos > size
  self.first_block = @blocks.empty? ? AllocationTable::EOC : @blocks.first

  # don't know if this is required, but we explicitly request our @io to grow if necessary
  # we never shrink it though. maybe this belongs in allocationtable, where smarter decisions
  # can be made.
  # maybe its ok to just seek out there later??
  max = @ranges.map { |pos, len| pos + len }.max || 0
  @io.truncate max if max > @io.size

  @size = size
end
650
+ end
651
+
652
# like RangesIOResizeable, but Ole::Storage::Dirent specific. provides for migration
# between bats based on size, and updating the dirent.
class RangesIOMigrateable < RangesIOResizeable
  attr_reader :dirent

  # Opens the data of +dirent+, using whichever allocation table its storage
  # picks for the dirent's current size.
  def initialize dirent, mode='r'
    @dirent = dirent
    table = @dirent.ole.bat_for_size @dirent.size
    super table, mode, :first_block => @dirent.first_block, :size => @dirent.size
  end

  # Resizes the stream to +size+ and records the new size on the dirent. When
  # the new size belongs in the other allocation table, the data is carried over.
  def truncate size
    target = @dirent.ole.bat_for_size size
    if target.class == @bat.class
      super
    else
      # bat migration needed! we need to backup some data. the amount of data
      # should be <= @ole.header.threshold, so we can just hold it all in one buffer.
      saved_pos = @pos
      @pos = 0
      keep = read [@size, size].min
      # this does a normal truncate to 0, removing our presence from the old bat, and
      # rewrite the dirent's first_block
      super 0
      # just change the underlying table and io from right under everyone :)
      @bat = target
      @io = target.io
      # important to do this now, before the write. as the below write will always
      # migrate us back to sbat! this will now allocate us +size+ in the new bat.
      super
      @pos = 0
      write keep
      @pos = saved_pos
    end
    # now just update the file
    @dirent.size = size
  end

  # first_block is kept on the dirent rather than on this object
  def first_block
    @dirent.first_block
  end

  def first_block= val
    @dirent.first_block = val
  end
end
699
+
700
+ #
701
+ # A class which wraps an ole directory entry. Can be either a directory
702
+ # (<tt>Dirent#dir?</tt>) or a file (<tt>Dirent#file?</tt>)
703
+ #
704
+ # Most interaction with <tt>Ole::Storage</tt> is through this class.
705
+ # The 2 most important functions are <tt>Dirent#children</tt>, and
706
+ # <tt>Dirent#data</tt>.
707
+ #
708
+ # was considering separate classes for dirs and files. some methods/attrs only
709
+ # applicable to one or the other.
710
+ #
711
+ # As with the other classes, #to_s performs the serialization.
712
+ #
713
+ class Dirent < Struct.new(
714
+ :name_utf16, :name_len, :type_id, :colour, :prev, :next, :child,
715
+ :clsid, :flags, # dirs only
716
+ :create_time_str, :modify_time_str, # files only
717
+ :first_block, :size, :reserved
718
+ )
719
+ include RecursivelyEnumerable
720
+
721
+ PACK = 'a64 v C C V3 a16 V a8 a8 V2 a4'
722
+ SIZE = 128
723
+ TYPE_MAP = {
724
+ # this is temporary
725
+ 0 => :empty,
726
+ 1 => :dir,
727
+ 2 => :file,
728
+ 5 => :root
729
+ }
730
+ # something to do with the fact that the tree is supposed to be red-black
731
+ COLOUR_MAP = {
732
+ 0 => :red,
733
+ 1 => :black
734
+ }
735
+ # used in the next / prev / child stuff to show that the tree ends here.
736
+ # also used for first_block for directory.
737
+ EOT = 0xffffffff
738
+ DEFAULT = [
739
+ 0.chr * 2, 2, 0, # will get overwritten
740
+ 1, EOT, EOT, EOT,
741
+ 0.chr * 16, 0, nil, nil,
742
+ AllocationTable::EOC, 0, 0.chr * 4
743
+ ]
744
+
745
+ # i think its just used by the tree building
746
+ attr_accessor :idx
747
+ # This returns all the children of this +Dirent+. It is filled in
748
+ # when the tree structure is recreated.
749
+ attr_accessor :children
750
+ attr_accessor :name
751
+ attr_reader :ole, :type, :create_time, :modify_time
752
# Creates a dirent belonging to +ole+.
#
# +values+ is either a packed SIZE byte entry, or an array of field values in
# member order (DEFAULT if omitted). A Hash given in its place is taken as
# +params+, which may supply :name and :type (one of the TYPE_MAP values) to
# override what the fields say.
#
# Raises ArgumentError for an unknown :type, and FormatError for an unknown type_id.
def initialize ole, values=DEFAULT, params={}
  @ole = ole
  values, params = DEFAULT, values if Hash === values
  values = values.unpack(PACK) if String === values
  super(*values)

  # extra parsing from the actual struct values
  @name = params[:name] || Types::Variant.load(Types::VT_LPWSTR, name_utf16[0...name_len])
  @type = if params[:type]
    unless TYPE_MAP.values.include?(params[:type])
      raise ArgumentError, "unknown type #{params[:type].inspect}"
    end
    params[:type]
  else
    TYPE_MAP[type_id] or raise FormatError, "unknown type_id #{type_id.inspect}"
  end

  # further extra type specific stuff
  if file?
    # stamps default to now (or nil when timestamps are not being maintained),
    # and are replaced by whatever the entry itself carries
    default_time = @ole.params[:update_timestamps] ? Time.now : nil
    @create_time = default_time
    @modify_time = default_time
    @create_time = Types::Variant.load(Types::VT_FILETIME, create_time_str) if create_time_str
    @modify_time = Types::Variant.load(Types::VT_FILETIME, modify_time_str) if modify_time_str
    @children = nil
  else
    @create_time = nil
    @modify_time = nil
    self.size = 0 unless @type == :root
    @children = []
  end

  # to silence warnings. used for tree building at load time
  # only.
  @idx = nil
end
788
+
789
# Opens the data of this file as a RangesIOMigrateable. Without a block the io
# is returned; with a block it is yielded, closed afterwards, and the block's
# value is returned. Modes 'r+' and 'w' also set the modify time to now.
#
# Raises Errno::EISDIR unless this is a file.
def open mode='r'
  raise Errno::EISDIR unless file?
  io = RangesIOMigrateable.new self, mode
  # TODO work on the mode string stuff a bit more.
  # maybe let the io object know about the mode, so it can refuse
  # to work for read/write appropriately. for now nothing is enforced, 'w' does
  # not truncate, and any other mode string is silently accepted.
  @modify_time = Time.now if %w[r+ w].include? mode
  return io unless block_given?
  begin
    yield io
  ensure
    io.close
  end
end
816
+
817
# Opens the file and returns what reading +limit+ bytes from its start gives
# (+limit+ is passed straight to the io's read).
def read limit=nil
  open do |stream|
    stream.read limit
  end
end
820
+
821
# true when this dirent's type is :file.
def file?
  :file == type
end
824
+
825
# true for everything that is not a file, so that root counts as a dir too.
def dir?
  file? ? false : true
end
829
+
830
# Returns the first child whose name matches +name+ (compared with ===, so a
# Regexp works as well as a String), or nil.
# maybe need some options regarding case sensitivity.
def / name
  children.detect { |entry| name === entry.name }
end
834
+
835
# With a String, looks up a child by name (see #/); anything else goes to the
# normal Struct element lookup.
def [] idx
  #warn 'String form of Dirent#[] is deprecated'
  return self / idx if String === idx
  super
end
843
+
844
# The create time, or the modify time when there is no create time.
# move to ruby-msg. and remove from here
def time
  #warn 'Dirent#time is deprecated'
  created = create_time
  created ? created : modify_time
end
849
+
850
# Passes the block on to the children array's #each.
def each_child(&visitor)
  @children.each(&visitor)
end
853
+
854
# flattens the tree starting from here into +dirents+. note it modifies its argument.
# each dirent gets its position in the list as idx. files have their links reset
# to EOT; other dirents get child pointed at what Dirent.flatten_helper returns
# for their children.
def flatten dirents=[]
  @idx = dirents.length
  dirents << self
  unless file?
    children.each { |entry| entry.flatten dirents }
    self.child = Dirent.flatten_helper children
    return dirents
  end
  self.child = EOT
  self.next = EOT
  self.prev = EOT
  dirents
end
866
+
867
# Links +children+ (which must already have their idx set) into a binary tree
# through prev / next by repeatedly splitting the list at its middle element,
# and returns the idx of the top element, or EOT for an empty list.
#
# i think making the tree structure optimized is actually more complex than this, and
# requires some intelligent ordering of the children based on names, but as long as
# it is valid its ok.
# actually, i think its ok. gsf for example only outputs a singly-linked-list, where
# prev is always EOT.
def self.flatten_helper children
  return EOT if children.empty?
  mid = children.length / 2
  pivot = children[mid]
  before = flatten_helper children[0...mid]
  after = flatten_helper children[(mid + 1)..-1]
  pivot.prev = before
  pivot.next = after
  pivot.idx
end
879
+
880
# Serializes the dirent into its packed SIZE byte form, refreshing the derived
# struct fields (name, name length, type id, time stamps) from the parsed
# attributes first.
def to_s
  # the name is cut to 62 bytes so that, with its 2 byte terminator, it fits
  # the 64 byte field
  tmp = Types::Variant.dump(Types::VT_LPWSTR, name)
  tmp = tmp[0, 62] if tmp.length > 62
  tmp += 0.chr * 2
  self.name_len = tmp.length
  self.name_utf16 = tmp + 0.chr * (64 - tmp.length)
  # type_id can perhaps be set in the initializer, as its read only now.
  self.type_id = TYPE_MAP.to_a.find { |_id, type_name| @type == type_name }.first
  # for the case of files, it is assumed that that was handled already
  # note not dir?, so as not to override root's first_block
  self.first_block = Dirent::EOT if type == :dir
  if file?
    # this is messed up. it changes the time stamps regardless of whether the file
    # was actually touched. instead, any open call with a writeable mode, should update
    # the modify time. create time would be set in new.
    if @ole.params[:update_timestamps]
      self.create_time_str = Types::Variant.dump Types::VT_FILETIME, @create_time
      self.modify_time_str = Types::Variant.dump Types::VT_FILETIME, @modify_time
    end
    # a dirent built from DEFAULT has nil time strings, which pack cannot
    # place in an 'a8' field; fall back to zeroed stamps
    self.create_time_str ||= 0.chr * 8
    self.modify_time_str ||= 0.chr * 8
  else
    self.create_time_str = 0.chr * 8
    self.modify_time_str = 0.chr * 8
  end
  to_a.pack PACK
end
905
+
906
# Short description: the name, plus for files the size, the modify time (when
# set) and the first few bytes of data.
def inspect
  out = "#<Dirent:#{name.inspect}"
  # perhaps i should remove the data snippet. its not that useful anymore.
  # there is also some dir specific stuff. like clsid, flags, that i should
  # probably include
  if file?
    head = read 9
    # 9 bytes read means there is more than we want to show: keep 5 and elide
    snippet = head.length == 9 ? head[0, 5] + '...' : head
    stamp = modify_time ? ' modify_time=' + modify_time.to_s.inspect : ''
    out << " size=#{size}" << stamp << " data=#{snippet.inspect}"
  end
  out + '>'
end
920
+
921
# Removes +child+ from this dirent's children, and for a file also frees the
# blocks holding its data.
#
# Raises ArgumentError if +child+ is not one of our children.
def delete child
  # remove from our child array, so that on reflatten and re-creation of @dirents, it will be gone
  unless @children.delete child
    raise ArgumentError, "#{child.inspect} not a child of #{self.inspect}"
  end
  # free our blocks. only files can be opened (Dirent#open raises Errno::EISDIR
  # for anything else), so a directory is just unlinked
  child.open { |io| io.truncate 0 } if child.file?
end
927
+
928
# copies the contents of src to dst. both must be files, or both non-files
# (root counts as a dir here), otherwise ArgumentError is raised. +dst+ takes
# over the name of +src+; for files the data is copied, for dirs a new child is
# created under +dst+ for every child of +src+ and filled in recursively.
# maybe this will recurse too much for big documents??
def self.copy src, dst
  raise ArgumentError, 'differing types' if src.file? != dst.file?
  dst.name = src.name
  if src.dir?
    src.children.each do |src_child|
      dst_child = Dirent.new dst.ole, :type => src_child.type
      dst.children << dst_child
      Dirent.copy src_child, dst_child
    end
  else
    src.open do |src_io|
      dst.open { |dst_io| IO.copy src_io, dst_io }
    end
  end
end
945
+ end
946
+ end
947
+ end
948
+