ruby-ole 1.2.6 → 1.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,948 @@
1
+ require 'tempfile'
2
+
3
+ require 'ole/base'
4
+ require 'ole/types'
5
+ require 'ole/ranges_io'
6
+
7
+ module Ole # :nodoc:
8
+ #
9
+ # = Introduction
10
+ #
11
+ # <tt>Ole::Storage</tt> is a class intended to abstract away details of the
12
+ # access to OLE2 structured storage files, such as those produced by
13
+ # Microsoft Office, eg *.doc, *.msg etc.
14
+ #
15
+ # = Usage
16
+ #
17
+ # Usage should be fairly straightforward:
18
+ #
19
+ # # get the parent ole storage object
20
+ # ole = Ole::Storage.open 'myfile.msg', 'r+'
21
+ # # => #<Ole::Storage io=#<File:myfile.msg> root=#<Dirent:"Root Entry">>
22
+ # # read some data
23
+ # ole.root[1].read 4
24
+ # # => "\001\000\376\377"
25
+ # # get the top level root object and output a tree structure for
26
+ # # debugging
27
+ # puts ole.root.to_tree
28
+ # # =>
29
+ # - #<Dirent:"Root Entry" size=3840 time="2006-11-03T00:52:53Z">
30
+ # |- #<Dirent:"__nameid_version1.0" size=0 time="2006-11-03T00:52:53Z">
31
+ # | |- #<Dirent:"__substg1.0_00020102" size=16 data="CCAGAAAAAADAAA...">
32
+ # ...
33
+ # |- #<Dirent:"__substg1.0_8002001E" size=4 data="MTEuMA==">
34
+ # |- #<Dirent:"__properties_version1.0" size=800 data="AAAAAAAAAAABAA...">
35
+ # \- #<Dirent:"__recip_version1.0_#00000000" size=0 time="2006-11-03T00:52:53Z">
36
+ # |- #<Dirent:"__substg1.0_0FF60102" size=4 data="AAAAAA==">
37
+ # ...
38
+ # # write some data, and finish up (note that open is 'r+', so this overwrites
39
+ # # but doesn't truncate)
40
+ # ole.root["\001CompObj"].open { |f| f.write "blah blah" }
41
+ # ole.close
42
+ #
43
+ # = Thanks
44
+ #
45
+ # * The code contained in this project was initially based on chicago's libole
46
+ # (source available at http://prdownloads.sf.net/chicago/ole.tgz).
47
+ #
48
+ # * It was later augmented with some corrections by inspecting pole, and (purely
49
+ # for header definitions) gsf.
50
+ #
51
+ # * The property set parsing code came from the apache java project POIFS.
52
+ #
53
+ # * The excellent idea for using a pseudo file system style interface by providing
54
+ # #file and #dir methods which mimic File and Dir, was borrowed (along with almost
55
+ # unchanged tests!) from Thomas Sondergaard's rubyzip.
56
+ #
57
+ # = TODO
58
+ #
59
+ # * the custom header cruft for Header and Dirent needs some love.
60
+ # * i have a number of classes doing load/save combos: Header, AllocationTable, Dirent,
61
+ # and, in a manner of speaking, but arguably different, Storage itself.
62
+ # they have differing api's which would be nice to rethink.
63
+ # AllocationTable::Big must be created aot now, as it is used for all subsequent reads.
64
+ #
65
+ class Storage
66
+ # thrown for any bogus OLE file errors.
67
+ class FormatError < StandardError # :nodoc:
68
+ end
69
+
70
+ VERSION = '1.2.7'
71
+
72
+ # options used at creation time
73
+ attr_reader :params
74
+ # The top of the ole tree structure
75
+ attr_reader :root
76
+ # The tree structure in its original flattened form. only valid after #load, or #flush.
77
+ attr_reader :dirents
78
+ # The underlying io object to/from which the ole object is serialized, whether we
79
+ # should close it, and whether it is writeable
80
+ attr_reader :io, :close_parent, :writeable
81
+ # Low level internals, you probably shouldn't need to mess with these
82
+ attr_reader :header, :bbat, :sbat, :sb_file
83
+
84
+ # maybe include an option hash, and allow :close_parent => true, to be more general.
85
+ # +arg+ should be either a file, or an +IO+ object, and needs to be seekable.
86
# Creates a storage object on top of +arg+.
#
# +arg+::    either a filename (String), which is opened here and closed again by
#            #close, or an already opened, seekable io object, which #close leaves open.
# +mode+::   mode string used when opening a filename (defaults to 'rb'). it can't be
#            combined with an io object. a Hash given here is taken as +params+.
# +params+:: options hash, merged over the default <tt>:update_timestamps => true</tt>.
#
# Raises ArgumentError if a mode string is given together with an io object.
def initialize arg, mode=nil, params={}
  params, mode = mode, nil if Hash === mode
  @params = {:update_timestamps => true}.merge(params)

  # get the io object
  if String === arg
    mode ||= 'rb'
    # File.open rather than Kernel#open, so that a name starting with "|" is never
    # treated as a command to run.
    @close_parent, @io = true, File.open(arg, mode)
  else
    raise ArgumentError, 'unable to specify mode string with io object' if mode
    @close_parent, @io = false, arg
  end

  begin
    # do we have this file opened for writing? with a mode string we parse it, otherwise
    # we try a flush and treat an IOError as meaning "not writeable".
    @writeable = begin
      if mode
        IO::Mode.new(mode).writeable?
      else
        @io.flush
        true
      end
    rescue IOError
      false
    end
    # silence undefined warning in clear
    @sb_file = nil
    # if the io object has data, we should load it, otherwise start afresh
    # this should be based on the mode string rather.
    @io.size > 0 ? load : clear
  rescue
    # don't leak the file we opened ourselves when the document can't be loaded
    @io.close if @close_parent
    raise
  end
end
120
+
121
# Builds a new storage object from the given arguments.
#
# Without a block the storage object is returned. With a block, it is yielded,
# closed once the block is done (whether it returned or raised), and the value of
# the block is returned.
def self.open arg, mode=nil, params={}
  storage = new arg, mode, params
  return storage unless block_given?
  begin
    yield storage
  ensure
    storage.close
  end
end
130
+
131
+ # load document from file.
132
+ #
133
+ # TODO: implement various allocationtable checks, maybe as a AllocationTable#fsck function :)
134
+ #
135
+ # 1. reterminate any chain not ending in EOC.
136
+ # compare file size with actually allocated blocks per file.
137
+ # 2. pass through all chain heads looking for collisions, and making sure nothing points to them
138
+ # (ie they are really heads). in both sbat and mbat
139
+ # 3. we know the locations of the bbat data, and mbat data. ensure that there are placeholder blocks
140
+ # in the bat for them.
141
+ # 4. maybe a check of excess data. if there is data outside the bbat.truncate.length + 1 * block_size,
142
+ # (eg what is used for truncate in #flush), then maybe add some sort of message about that. it
143
+ # will be automatically thrown away at close time.
144
# Loads the document from @io: parses the header, the big block allocation table,
# the directory entries (which are arranged into a tree under @root), and finally
# the small block file and its allocation table.
#
# Raises FormatError if a directory entry is referenced more than once, or if a
# prev / next / child link points outside of the directory stream.
def load
  # we always read 512 for the header block. if the block size ends up being different,
  # what happens to the 109 fat entries. are there more/less entries?
  @io.rewind
  header_block = @io.read 512
  @header = Header.new header_block

  # create an empty bbat.
  @bbat = AllocationTable::Big.new self
  mbat_blocks = (0...@header.num_mbat).map { |i| i + @header.mbat_start }
  bbat_chain = (header_block[Header::SIZE..-1] + @bbat.read(mbat_blocks)).unpack 'V*'
  # am i using num_bat in the right way?
  @bbat.load @bbat.read(bbat_chain[0, @header.num_bat])

  # get block chain for directories, read it, then split it into chunks and load the
  # directory entries. empty (type_id 0) entries are kept for now, as the prev / next /
  # child links used below are positions in this list, and dropping entries first would
  # shift them.
  @dirents = @bbat.read(@header.dirent_start).scan(/.{#{Dirent::SIZE}}/mo).
    map { |str| Dirent.new self, str }

  # now reorder from flat into a tree
  # links are stored in some kind of balanced binary tree
  # check that everything is visited at least, and at most once
  # similarly with the blocks of the file.
  # was thinking of moving this to Dirent.to_tree instead.
  class << @dirents
    def to_tree idx=0
      return [] if idx == Dirent::EOT
      d = self[idx] or raise FormatError, "invalid directory index #{idx}"
      raise FormatError, "directory #{d.inspect} used twice" if d.idx
      # mark as visited before descending, so that a link cycle is reported by the
      # check above rather than recursing without end.
      d.idx = idx
      d.children = to_tree d.child
      to_tree(d.prev) + [d] + to_tree(d.next)
    end
  end

  @root = @dirents.to_tree.first
  # the tree is built, so the empty entries can go now
  @dirents.reject! { |d| d.type_id == 0 }
  Log.warn "root name was #{@root.name.inspect}" unless @root.name == 'Root Entry'
  unused = @dirents.reject(&:idx).length
  Log.warn "#{unused} unused directories" if unused > 0

  # FIXME i don't currently use @header.num_sbat which i should
  # hmm. nor do i write it. it means what exactly again?
  # which mode to use here?
  @sb_file = RangesIOResizeable.new @bbat, :first_block => @root.first_block, :size => @root.size
  @sbat = AllocationTable::Small.new self
  @sbat.load @bbat.read(@header.sbat_start)
end
191
+
192
# Closes the small block file, writes out the meta data with #flush if the storage
# is writeable, and closes the underlying io if it was opened by this object.
#
# The io is closed from an ensure clause, so it doesn't stay open when the flush
# raises. Returns nil.
def close
  @sb_file.close
  flush if @writeable
  nil
ensure
  @io.close if @close_parent
end
197
+
198
+ # the flush method is the main "save" method. all file contents are always
199
+ # written directly to the file by the RangesIO objects, all this method does
200
+ # is write out all the file meta data - dirents, allocation tables, file header
201
+ # etc.
202
+ #
203
+ # maybe add an option to zero the padding, and any remaining avail blocks in the
204
+ # allocation table.
205
+ #
206
+ # TODO: long and overly complex. simplify and test better. eg, perhaps move serialization
207
+ # of bbat to AllocationTable::Big.
208
# Writes out all the meta data: the flattened dirents, the sbat, the bbat (along with
# any mbat blocks needed to hold its chain), and finally the header block. File
# contents themselves are not touched here.
def flush
  # update root dirent, and flatten dirent tree
  @root.name = 'Root Entry'
  @root.first_block = @sb_file.first_block
  @root.size = @sb_file.size
  @dirents = @root.flatten

  # serialize the dirents using the bbat
  RangesIOResizeable.open @bbat, 'w', :first_block => @header.dirent_start do |io|
    @dirents.each { |dirent| io.write dirent.to_s }
    # pad the dirent stream out to a whole number of big blocks
    padding = (io.size / @bbat.block_size.to_f).ceil * @bbat.block_size - io.size
    io.write 0.chr * padding
    @header.dirent_start = io.first_block
  end

  # serialize the sbat
  # perhaps the blocks used by the sbat should be marked with BAT?
  RangesIOResizeable.open @bbat, 'w', :first_block => @header.sbat_start do |io|
    io.write @sbat.to_s
    @header.sbat_start = io.first_block
    @header.num_sbat = @bbat.chain(@header.sbat_start).length
  end

  # create RangesIOResizeable hooked up to the bbat. use that to claim bbat blocks using
  # truncate. then when its time to write, convert that chain and some chunk of blocks at
  # the end, into META_BAT blocks. write out the chain, and those meta bat blocks, and its
  # done.
  # this is perhaps not good, as we reclaim all bat blocks here, which
  # may include the sbat we just wrote. FIXME
  @bbat.map! do |b|
    b == AllocationTable::BAT || b == AllocationTable::META_BAT ? AllocationTable::AVAIL : b
  end

  # currently we use a loop. this could be better, but basically,
  # the act of writing out the bat, itself requires blocks which get
  # recorded in the bat.
  #
  # i'm sure that there'd be some simpler closed form solution to this. solve
  # recursive func:
  #
  #   num_mbat_blocks = ceil(max((mbat_len - 109) * 4 / block_size, 0))
  #   bbat_len = initial_bbat_len + num_mbat_blocks
  #   mbat_len = ceil(bbat_len * 4 / block_size)
  #
  # the actual bbat allocation table is itself stored throughout the file, and that chain
  # is stored in the initial blocks, and the mbat blocks.
  num_mbat_blocks = 0
  io = RangesIOResizeable.new @bbat, 'w', :first_block => AllocationTable::EOC
  # truncate now, so that we can simplify size calcs - the mbat blocks will be appended in a
  # contiguous chunk at the end. the underlying io is truncated to match (the + 1 is for
  # the header block).
  @bbat.truncate!
  @io.truncate @bbat.block_size * (@bbat.length + 1)
  while true
    # get total bbat size. equivalent to @bbat.to_s.length, but for the factoring in of
    # the mbat blocks. we can't just add the mbat blocks directly to the bbat, as as this iteration
    # progresses, more blocks may be needed for the bat itself (if there are no more gaps), and the
    # mbat must remain contiguous.
    bbat_data_len = ((@bbat.length + num_mbat_blocks) * 4 / @bbat.block_size.to_f).ceil * @bbat.block_size
    # now storing the excess mbat blocks also increases the size of the bbat:
    new_num_mbat_blocks = ([bbat_data_len / @bbat.block_size - 109, 0].max * 4 / @bbat.block_size.to_f).ceil
    if new_num_mbat_blocks != num_mbat_blocks
      # need more space for the mbat.
      num_mbat_blocks = new_num_mbat_blocks
    elsif io.size != bbat_data_len
      # need more space for the bat
      # this may grow the bbat, depending on existing available blocks
      io.truncate bbat_data_len
    else
      break
    end
  end

  # now extract the info we want:
  ranges = io.ranges
  bbat_chain = @bbat.chain io.first_block
  # the extra mbat data is a set of contiguous blocks at the end
  io.close
  bbat_chain.each { |b| @bbat[b] = AllocationTable::BAT }
  # tack on the mbat stuff
  @header.mbat_start = @bbat.length # need to record this here before tacking on the mbat
  @header.num_bat = bbat_chain.length
  num_mbat_blocks.times { @bbat << AllocationTable::META_BAT }

  # now finally write the bbat, using a not resizable io.
  # the mode here will be 'r', which allows write atm.
  RangesIO.open(@io, :ranges => ranges) { |f| f.write @bbat.to_s }

  # this is the mbat. pad it out.
  bbat_chain += [AllocationTable::AVAIL] * [109 - bbat_chain.length, 0].max
  @header.num_mbat = num_mbat_blocks
  if num_mbat_blocks == 0
    @header.mbat_start = AllocationTable::EOC
  else
    # write out the mbat blocks now. first of all, where are they going to be?
    mbat_data = bbat_chain[109..-1]
    # number of table entries per big block
    q = @bbat.block_size / 4
    mbat_data += [AllocationTable::AVAIL] *((mbat_data.length / q.to_f).ceil * q - mbat_data.length)
    ranges = @bbat.ranges((0...num_mbat_blocks).map { |i| @header.mbat_start + i })
    RangesIO.open(@io, :ranges => ranges) { |f| f.write mbat_data.pack('V*') }
  end

  # now seek back and write the header out, followed by the first 109 bbat chain entries
  @io.seek 0
  @io.write @header.to_s + bbat_chain[0, 109].pack('V*')
  @io.flush
end
318
+
319
# Resets this object to the equivalent of a freshly loaded, empty ole document:
# default header, empty allocation tables, a lone root dirent, and an underlying
# io truncated to zero length.
def clear
  unless @writeable
    Log.warn 'creating new ole storage object on non-writable io'
  end
  @header = Header.new
  @bbat = AllocationTable::Big.new self
  @root = Dirent.new self, :type => :root, :name => 'Root Entry'
  @root.idx = 0
  @dirents = [@root]
  # replace any previous small block file with an empty one
  @sb_file.close if @sb_file
  @sb_file = RangesIOResizeable.new @bbat, :first_block => AllocationTable::EOC
  @sbat = AllocationTable::Small.new self
  # throw everything else the hell away
  @io.truncate 0
end
333
+
334
+ # could be useful with mis-behaving ole documents. or to just clean them up.
335
# Runs #repack_using_io with a temporary io object: a Tempfile in binary mode for
# +:file+ (the default), or a StringIO for +:mem+.
#
# Raises ArgumentError for any other value of +temp+.
def repack temp=:file
  if temp == :file
    Tempfile.open 'ole-repack' do |tmp|
      tmp.binmode
      repack_using_io tmp
    end
  elsif temp == :mem
    StringIO.open { |tmp| repack_using_io tmp }
  else
    raise ArgumentError, "unknown temp backing #{temp.inspect}"
  end
end
346
+
347
# Copies the whole of @io into +temp_io+, clears this storage, then opens +temp_io+
# as a storage of its own (with the same params) and copies its root back into ours.
def repack_using_io temp_io
  @io.rewind
  IO.copy @io, temp_io
  clear
  Storage.open(temp_io, nil, @params) do |source|
    #source.root.type = :dir
    Dirent.copy source.root, root
  end
end
356
+
357
# Picks the allocation table for a stream of +size+ bytes: the bbat when the size is
# at least the header's threshold (note >=, not > previously), otherwise the sbat.
def bat_for_size size
  return @sbat if size < @header.threshold
  @bbat
end
361
+
362
# Short description giving the class, the underlying io and the root dirent.
def inspect
  io_desc = @io.inspect
  root_desc = @root.inspect
  "#<#{self.class} io=#{io_desc} root=#{root_desc}>"
end
365
+
366
+ #
367
+ # A class which wraps the ole header
368
+ #
369
+ # Header.new can be both used to load from a string, or to create from
370
+ # defaults. Serialization is accomplished with the #to_s method.
371
+ #
372
#
# Wraps the ole header, a fixed layout of Header::SIZE bytes described by Header::PACK.
#
# Header.new takes either a string to parse (bytes past the packed fields are ignored),
# an array of field values, or nothing, in which case Header::DEFAULT is used. The
# values are checked with #validate!. Serialization is accomplished with #to_s.
#
class Header < Struct.new(
    :magic, :clsid, :minor_ver, :major_ver, :byte_order, :b_shift, :s_shift,
    :reserved, :csectdir, :num_bat, :dirent_start, :transacting_signature, :threshold,
    :sbat_start, :num_sbat, :mbat_start, :num_mbat
  )
  PACK = 'a8 a16 v2 a2 v2 a6 V3 a4 V5'
  SIZE = 0x4c
  # i have seen it pointed out that the first 4 bytes of hex,
  # 0xd0cf11e0, is supposed to spell out docfile. hmmm :)
  #
  # expected value of Header#magic. this and BYTE_ORDER are built with pack rather than
  # written as "\x.." literals, so that on rubies with string encodings they are plain
  # byte strings, and compare equal to the bytes unpacked from a file.
  MAGIC = [0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1].pack('C*')
  # expected value of Header#byte_order. we only handle little endian.
  BYTE_ORDER = [0xfe, 0xff].pack('C*')
  # what you get if creating new header from scratch.
  # AllocationTable::EOC isn't available yet. meh.
  EOC = 0xfffffffe
  DEFAULT = [
    MAGIC, 0.chr * 16, 59, 3, BYTE_ORDER, 9, 6,
    0.chr * 6, 0, 1, EOC, 0.chr * 4,
    4096, EOC, 0, EOC, 0
  ]

  # +values+ is either a packed header string, or an array of the field values.
  # Raises FormatError (through #validate!) if the values aren't acceptable.
  def initialize values=DEFAULT
    values = values.unpack(PACK) if String === values
    super(*values)
    validate!
  end

  # The header fields packed into a string of SIZE bytes.
  def to_s
    to_a.pack PACK
  end

  # Raises FormatError if the signature is wrong, or the fields are inconsistent.
  # Merely suspicious values are logged as a warning. Returns true otherwise.
  def validate!
    raise FormatError, "OLE2 signature is invalid" unless magic == MAGIC
    if num_bat == 0 || # is that valid for a completely empty file?
       # not sure about this one. basically to do max possible bat given size of mbat
       (num_bat > 109 && num_bat > 109 + num_mbat * (1 << (b_shift - 2))) ||
       # shouldn't need to use the mbat as there is enough space in the header block
       (num_bat < 109 && num_mbat != 0) ||
       # given the size of the header is 76, if b_shift <= 6, blocks address the header.
       s_shift > b_shift || b_shift <= 6 || b_shift >= 31 ||
       # we only handle little endian
       byte_order != BYTE_ORDER
      raise FormatError, "not valid OLE2 structured storage file"
    end
    # relaxed this, due to test-msg/qwerty_[1-3]*.msg they all had
    # 3 for this value.
    # transacting_signature != "\x00" * 4 or
    if threshold != 4096 ||
       (num_mbat == 0 && mbat_start != AllocationTable::EOC) ||
       reserved != "\x00" * 6
      Log.warn "may not be a valid OLE2 structured storage file"
    end
    true
  end
end
425
+
426
+ #
427
+ # +AllocationTable+'s hold the chains corresponding to files. Given
428
+ # an initial index, <tt>AllocationTable#chain</tt> follows the chain, returning
429
+ # the blocks that make up that file.
430
+ #
431
+ # There are 2 allocation tables, the bbat, and sbat, for big and small
432
+ # blocks respectively. The block chain should be loaded using either
433
+ # <tt>Storage#read_big_blocks</tt> or <tt>Storage#read_small_blocks</tt>
434
+ # as appropriate.
435
+ #
436
+ # Whether or not big or small blocks are used for a file depends on
437
+ # whether its size is over the <tt>Header#threshold</tt> level.
438
+ #
439
+ # An <tt>Ole::Storage</tt> document is serialized as a series of directory objects,
440
+ # which are stored in blocks throughout the file. The blocks are either
441
+ # big or small, and are accessed using the <tt>AllocationTable</tt>.
442
+ #
443
+ # The bbat allocation table's data is stored in the spare room in the header
444
+ # block, and in extra blocks throughout the file as referenced by the meta
445
+ # bat. That chain is linear, as there is no higher level table.
446
+ #
447
+ # AllocationTable.new is used to create an empty table. It can parse a string
448
+ # with the #load method. Serialization is accomplished with the #to_s method.
449
+ #
450
# An array of block indices, in which entry +i+ holds the block which follows block
# +i+ in its chain, or one of the special values below.
class AllocationTable < Array
  # a free block (I don't currently leave any blocks free), although I do pad out
  # the allocation table with AVAIL to the block size.
  AVAIL = 0xffffffff
  EOC = 0xfffffffe # end of a chain
  # these blocks are used for storing the allocation table chains
  BAT = 0xfffffffd
  META_BAT = 0xfffffffc

  attr_reader :ole, :io, :block_size

  # Creates an empty table belonging to the storage +ole+.
  def initialize ole
    @ole = ole
    # true while the table may hold an AVAIL entry somewhere before its end
    @sparse = true
    super()
  end

  # Replaces the table's content with the 32 bit little endian values packed in +data+.
  def load data
    replace data.unpack('V*')
  end

  # Returns a copy of the entries with the trailing AVAILs stripped. Note that a table
  # holding nothing but AVAIL entries comes back unchanged.
  def truncate
    # this strips trailing AVAILs. come to think of it, this has the potential to break
    # bogus ole. if you terminate using AVAIL instead of EOC, like I did before. but that is
    # very broken. however, if a chain ends with AVAIL, it should probably be fixed to EOC
    # at load time.
    temp = reverse
    not_avail = temp.find { |b| b != AVAIL } and temp = temp[temp.index(not_avail)..-1]
    temp.reverse
  end

  # In place version of #truncate.
  def truncate!
    replace truncate
  end

  # Serializes the truncated table, padded with AVAIL to a whole number of big blocks.
  def to_s
    table = truncate
    # pad it out some
    num = @ole.bbat.block_size / 4
    # do you really use AVAIL? they probably extend past end of file, and may shortly
    # be used for the bat. not really good.
    table += [AVAIL] * (num - (table.length % num)) if (table.length % num) != 0
    table.pack 'V*'
  end

  # Follows the chain starting at block +idx+, and returns the array of blocks in it.
  #
  # Raises FormatError if the chain leaves the table, or is longer than the table
  # itself (which can only happen if it loops). A terminator other than EOC is only
  # logged as a warning.
  #
  # rewrote this to be non-recursive as it broke on a large attachment
  # chain with a stack error
  def chain idx
    a = []
    until idx >= META_BAT
      # valid indices are 0...length. a chain already holding +length+ blocks and still
      # going must be revisiting one of them.
      raise FormatError, "broken allocationtable chain" if idx < 0 || idx >= length || a.length >= length
      a << idx
      idx = self[idx]
    end
    Log.warn "invalid chain terminator #{idx}" unless idx == EOC
    a
  end

  # Turn a chain (an array given by +chain+) of blocks (optionally
  # truncated to +size+) into an array of arrays describing the stretches of
  # bytes in the file that it belongs to.
  #
  # The blocks are Big or Small blocks depending on the table type.
  def blocks_to_ranges chain, size=nil
    # truncate the chain if required
    chain = chain[0...(size.to_f / block_size).ceil] if size
    # convert chain to ranges of the block size
    ranges = chain.map { |i| [block_size * i, block_size] }
    # truncate final range if required
    ranges.last[1] -= (ranges.length * block_size - size) if ranges.last and size
    ranges
  end

  # Like #blocks_to_ranges, but +chain+ may also be the head of a chain, which is then
  # looked up with #chain first.
  def ranges chain, size=nil
    chain = self.chain(chain) unless Array === chain
    blocks_to_ranges chain, size
  end

  # quick shortcut. chain can be either a head (in which case the table is used to
  # turn it into a chain), or a chain. it is converted to ranges, then to rangesio.
  def open chain, size=nil, &block
    RangesIO.open @io, :ranges => ranges(chain, size), &block
  end

  # Reads the whole content of the given chain (see #open).
  def read chain, size=nil
    open chain, size, &:read
  end

  # catch any method that may add an AVAIL somewhere in the middle, thus invalidating
  # the @sparse speedup for free_block. annoying using eval, but define_method won't
  # work for this.
  # FIXME
  [:map!, :collect!].each do |name|
    eval <<-END
      def #{name}(*args, &block)
        @sparse = true
        super
      end
    END
  end

  # Assignment of an AVAIL entry marks the table as sparse again.
  def []= idx, val
    @sparse = true if val == AVAIL
    super
  end

  # Returns the index of a block which is free: the first AVAIL entry if the table may
  # hold one, otherwise a new entry appended to the table.
  def free_block
    if @sparse
      i = index(AVAIL) and return i
    end
    @sparse = false
    push AVAIL
    length - 1
  end

  # Grows or shrinks the chain +blocks+ (modified in place) so that it holds +size+
  # bytes, updating the table to match, and returns +blocks+.
  #
  # must return first_block
  def resize_chain blocks, size
    new_num_blocks = (size / block_size.to_f).ceil
    old_num_blocks = blocks.length
    if new_num_blocks < old_num_blocks
      # de-allocate some of our old blocks. TODO maybe zero them out in the file???
      (new_num_blocks...old_num_blocks).each { |i| self[blocks[i]] = AVAIL }
      self[blocks[new_num_blocks-1]] = EOC if new_num_blocks > 0
      blocks.slice! new_num_blocks..-1
    elsif new_num_blocks > old_num_blocks
      # need some more blocks.
      last_block = blocks.last
      (new_num_blocks - old_num_blocks).times do
        block = free_block
        # connect the chain. handle corner case of blocks being [] initially
        self[last_block] = block if last_block
        blocks << block
        last_block = block
        self[last_block] = EOC
      end
    end
    # update ranges, and return that also now
    blocks
  end

  # The table for big blocks, which live directly in the storage's io.
  class Big < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.b_shift
      @io = @ole.io
    end

    # Big blocks are kind of -1 based, in order to not clash with the header.
    def blocks_to_ranges blocks, size=nil
      super blocks.map { |b| b + 1 }, size
    end
  end

  # The table for small blocks, which live in the storage's small block file.
  class Small < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.s_shift
      @io = @ole.sb_file
    end
  end
end
610
+
611
+ # like normal RangesIO, but Ole::Storage specific. the ranges are backed by an
612
+ # AllocationTable, and can be resized. used for read/write to the following streams:
613
+ # 1. serialized dirent data
614
+ # 2. sbat table data
615
+ # 3. all dirents but through RangesIOMigrateable below
616
+ #
617
+ # Note that all internal access to first_block is through accessors, as it is sometimes
618
+ # useful to redirect it.
619
# A RangesIO whose ranges are the chain of blocks starting at +first_block+ in the
# allocation table +bat+, and which is resized by re-allocating that chain.
class RangesIOResizeable < RangesIO
  attr_reader :bat
  attr_accessor :first_block

  # +bat+ is the allocation table to use. +params+ must provide :first_block, and may
  # provide :size. +mode+ is optional, and defaults to 'r'.
  #
  # Raises ArgumentError if :first_block is missing.
  def initialize bat, mode='r', params={}
    mode, params = 'r', mode if Hash === mode
    first_block, size = params.values_at :first_block, :size
    raise ArgumentError, 'must specify first_block' unless first_block
    @bat = bat
    self.first_block = first_block
    # we now cache the blocks chain, for faster resizing.
    @blocks = @bat.chain first_block
    super @bat.io, mode, :ranges => @bat.ranges(@blocks, size)
  end

  # Resizes the stream to +size+ bytes, growing the underlying io if the new ranges
  # reach past its end. Returns +size+.
  def truncate size
    # note that old_blocks is != @ranges.length necessarily. i'm planning to write a
    # merge_ranges function that merges sequential ranges into one as an optimization.
    @bat.resize_chain @blocks, size
    @ranges = @bat.ranges @blocks, size
    # keep the position within the new size (@size still holds the old one here)
    @pos = size if @pos > size
    self.first_block = @blocks.empty? ? AllocationTable::EOC : @blocks.first

    # don't know if this is required, but we explicitly request our @io to grow if necessary
    # we never shrink it though. maybe this belongs in allocationtable, where smarter decisions
    # can be made.
    # maybe its ok to just seek out there later??
    max = @ranges.map { |pos, len| pos + len }.max || 0
    @io.truncate max if max > @io.size

    @size = size
  end
end
651
+
652
+ # like RangesIOResizeable, but Ole::Storage::Dirent specific. provides for migration
653
+ # between bats based on size, and updating the dirent.
654
# A resizeable io over the content of a dirent. The dirent's size decides which
# allocation table backs the data, and the dirent's first_block and size are kept
# up to date.
class RangesIOMigrateable < RangesIOResizeable
  attr_reader :dirent

  # Opens the content of +dirent+, using the allocation table for its current size.
  def initialize dirent, mode='r'
    @dirent = dirent
    table = @dirent.ole.bat_for_size(@dirent.size)
    super table, mode, :first_block => @dirent.first_block, :size => @dirent.size
  end

  # Resizes to +size+ bytes. If that size belongs to the other allocation table, the
  # data which is to be kept is carried over to it.
  def truncate size
    target = @dirent.ole.bat_for_size size
    if target.class == @bat.class
      super size
    else
      # bat migration needed! we need to backup some data. the amount of data
      # should be <= @ole.header.threshold, so we can just hold it all in one buffer.
      saved_pos = @pos
      @pos = 0
      keep = read [@size, size].min
      # this does a normal truncate to 0, removing our presence from the old bat, and
      # rewrite the dirent's first_block
      super 0
      # just change the table and underlying io from right under everyone :)
      @bat = target
      @io = target.io
      # important to do this now, before the write. as the below write will always
      # migrate us back to sbat! this will now allocate us +size+ in the new bat.
      super size
      @pos = 0
      write keep
      @pos = saved_pos
    end
    # now just update the file
    @dirent.size = size
  end

  # first_block is not stored here, but forwarded to the dirent
  def first_block
    @dirent.first_block
  end

  def first_block= val
    @dirent.first_block = val
  end
end
699
+
700
+ #
701
+ # A class which wraps an ole directory entry. Can be either a directory
702
+ # (<tt>Dirent#dir?</tt>) or a file (<tt>Dirent#file?</tt>)
703
+ #
704
+ # Most interaction with <tt>Ole::Storage</tt> is through this class.
705
+ # The 2 most important functions are <tt>Dirent#children</tt>, and
706
+ # <tt>Dirent#data</tt>.
707
+ #
708
+ # was considering separate classes for dirs and files. some methods/attrs only
709
+ # applicable to one or the other.
710
+ #
711
+ # As with the other classes, #to_s performs the serialization.
712
+ #
713
+ class Dirent < Struct.new(
714
+ :name_utf16, :name_len, :type_id, :colour, :prev, :next, :child,
715
+ :clsid, :flags, # dirs only
716
+ :create_time_str, :modify_time_str, # files only
717
+ :first_block, :size, :reserved
718
+ )
719
+ include RecursivelyEnumerable
720
+
721
+ PACK = 'a64 v C C V3 a16 V a8 a8 V2 a4'
722
+ SIZE = 128
723
+ TYPE_MAP = {
724
+ # this is temporary
725
+ 0 => :empty,
726
+ 1 => :dir,
727
+ 2 => :file,
728
+ 5 => :root
729
+ }
730
+ # something to do with the fact that the tree is supposed to be red-black
731
+ COLOUR_MAP = {
732
+ 0 => :red,
733
+ 1 => :black
734
+ }
735
+ # used in the next / prev / child stuff to show that the tree ends here.
736
+ # also used for first_block for directory.
737
+ EOT = 0xffffffff
738
+ DEFAULT = [
739
+ 0.chr * 2, 2, 0, # will get overwritten
740
+ 1, EOT, EOT, EOT,
741
+ 0.chr * 16, 0, nil, nil,
742
+ AllocationTable::EOC, 0, 0.chr * 4
743
+ ]
744
+
745
+ # i think its just used by the tree building
746
+ attr_accessor :idx
747
+ # This returns all the children of this +Dirent+. It is filled in
748
+ # when the tree structure is recreated.
749
+ attr_accessor :children
750
+ attr_accessor :name
751
+ attr_reader :ole, :type, :create_time, :modify_time
752
# Creates a dirent belonging to the storage +ole+.
#
# +values+:: the struct values, as an array or as a packed string (see PACK).
#            defaults to DEFAULT. a Hash given here is taken as +params+.
# +params+:: :name and :type, which take precedence over what +values+ provides.
#
# Raises ArgumentError for an unknown :type, and FormatError if there is no :type
# and the type_id in +values+ is unknown.
def initialize ole, values=DEFAULT, params={}
  @ole = ole
  values, params = DEFAULT, values if Hash === values
  values = values.unpack(PACK) if String === values
  super(*values)

  # extra parsing from the actual struct values
  @name = params[:name] || Types::Variant.load(Types::VT_LPWSTR, name_utf16[0...name_len])
  @type = if params[:type]
    unless TYPE_MAP.values.include?(params[:type])
      raise ArgumentError, "unknown type #{params[:type].inspect}"
    end
    params[:type]
  else
    TYPE_MAP[type_id] or raise FormatError, "unknown type_id #{type_id.inspect}"
  end

  # further extra type specific stuff
  if file?
    # the times default to now (or nil if timestamps aren't wanted), and are replaced
    # by the stored values if there are any. each is parsed from its own field.
    default_time = @ole.params[:update_timestamps] ? Time.now : nil
    @create_time ||= default_time
    @modify_time ||= default_time
    @create_time = Types::Variant.load(Types::VT_FILETIME, create_time_str) if create_time_str
    @modify_time = Types::Variant.load(Types::VT_FILETIME, modify_time_str) if modify_time_str
    @children = nil
  else
    @create_time = nil
    @modify_time = nil
    self.size = 0 unless @type == :root
    @children = []
  end

  # to silence warnings. used for tree building at load time
  # only.
  @idx = nil
end
788
+
789
# Opens the content of this file for io. Without a block the io object is returned.
# With a block, it is yielded and closed afterwards, and the value of the block is
# returned. The modes 'r+' and 'w' update the modify time.
#
# Raises Errno::EISDIR unless this is a file.
def open mode='r'
  raise Errno::EISDIR unless file?
  stream = RangesIOMigrateable.new self, mode
  # TODO work on the mode string stuff a bit more.
  # maybe let the io object know about the mode, so it can refuse
  # to work for read/write appropriately. maybe redefine all unusable
  # methods using singleton class to throw errors.
  # as reading/writing isn't enforced, the only thing the mode changes for now is the
  # modify time. 'w' doesn't truncate yet, and other modes are accepted silently.
  @modify_time = Time.now if %w[r+ w].include?(mode)
  return stream unless block_given?
  begin
    yield stream
  ensure
    stream.close
  end
end
816
+
817
# Opens this file, and returns what reading +limit+ bytes from it gives (+limit+
# is passed on as is, nil included).
def read limit=nil
  open do |stream|
    stream.read limit
  end
end
820
+
821
# true if the type of this dirent is :file
def file?
  :file == type
end
824
+
825
# true for anything which isn't a file, in order to count root as a dir.
def dir?
  file? ? false : true
end
829
+
830
+ # maybe need some options regarding case sensitivity.
831
# Looks up a direct child by name and returns the first match, or nil
# when nothing matches. The comparison is <tt>name === child.name</tt>,
# so +name+ may be a String (exact match) or anything else with a
# useful #===, such as a Regexp.
def / name
  children.detect do |entry|
    name === entry.name
  end
end
834
+
835
# Element reference. A String argument is treated as a child name and
# delegated to #/ (this string form is a candidate for deprecation);
# every other argument is passed on unchanged to the inherited #[].
def [] idx
  return super unless String === idx
  self / idx
end
843
+
844
+ # move to ruby-msg. and remove from here
845
# Returns create_time when it is set, otherwise modify_time (which may
# itself be nil). Slated for deprecation here.
def time
  created = create_time
  created ? created : modify_time
end
849
+
850
# Iterates over the @children array, handing each child to the given
# block. Returns whatever @children.each returns.
def each_child(&visitor)
  kids = @children
  kids.each(&visitor)
end
853
+
854
+ # flattens the tree starting from here into +dirents+. note it modifies its argument.
855
# Appends this entry, followed depth-first by all of its descendants,
# to +dirents+ and returns that same array.
#
# As a side effect each visited entry records its position in the
# array in @idx and has its tree links rewritten: files get prev, next
# and child all set to EOT; other entries get child set to the index
# returned by Dirent.flatten_helper for their children.
def flatten dirents=[]
  @idx = dirents.size
  dirents.push self
  if file?
    # leaves have no links at all
    self.child = EOT
    self.next = EOT
    self.prev = EOT
  else
    children.each do |entry|
      entry.flatten dirents
    end
    self.child = Dirent.flatten_helper(children)
  end
  dirents
end
866
+
867
+ # i think making the tree structure optimized is actually more complex than this, and
868
+ # requires some intelligent ordering of the children based on names, but as long as
869
+ # it is valid its ok.
870
+ # actually, i think its ok. gsf for example only outputs a singly-linked-list, where
871
+ # prev is always EOT.
872
# Links the already-indexed +children+ into a binary tree via their
# prev/next fields and returns the idx of the subtree's root (EOT for
# an empty list). The middle element becomes the root, the elements
# before it form the prev subtree and the ones after it the next
# subtree, applied recursively.
def self.flatten_helper children
  return EOT if children.empty?
  middle = children.length / 2
  pivot = children[middle]
  before = flatten_helper(children[0...middle])
  after = flatten_helper(children[(middle + 1)..-1])
  pivot.prev = before
  pivot.next = after
  pivot.idx
end
879
+
880
# Serialises this entry: refreshes the derived fields (name_len,
# name_utf16, type_id, first_block and the time strings) from the
# current state, then packs to_a with PACK and returns the result.
def to_s
  wide_name = Types::Variant.dump(Types::VT_LPWSTR, name)
  # keep at most 62 units of the encoded name, then add the 2-unit
  # terminator and pad the stored field out to 64
  wide_name = wide_name[0, 62] if wide_name.length > 62
  wide_name += 0.chr * 2
  padding = 0.chr * (64 - wide_name.length)
  self.name_len = wide_name.length
  self.name_utf16 = wide_name + padding

  # type_id could perhaps be set in the initializer, as it is read
  # only now.
  type_pair = TYPE_MAP.to_a.find { |_id, kind| @type == kind }
  self.type_id = type_pair.first

  # for files first_block is assumed to have been handled already.
  # deliberately type == :dir rather than dir?, so that root's
  # first_block is not overridden.
  self.first_block = Dirent::EOT if type == :dir

  if !file?
    self.create_time_str = 0.chr * 8
    self.modify_time_str = 0.chr * 8
  elsif @ole.params[:update_timestamps]
    # known wart: this rewrites the time stamps regardless of whether
    # the file was actually touched. ideally any open call with a
    # writeable mode would update the modify time, and the create
    # time would be set in new.
    self.create_time_str = Types::Variant.dump(Types::VT_FILETIME, @create_time)
    self.modify_time_str = Types::Variant.dump(Types::VT_FILETIME, @modify_time)
  end

  to_a.pack PACK
end
905
+
906
# Short debugging representation: always the inspected name; for files
# additionally the size, the modify time (only when set) and a snippet
# of the data. The snippet is the first 5 units of the data followed
# by '...' when at least 9 could be read, otherwise everything that
# was read.
def inspect
  summary = "#<Dirent:#{name.inspect}"
  # the data snippet is of limited use nowadays; dir specific details
  # such as clsid and flags would probably be worth showing instead.
  if file?
    head = read 9
    snippet = if head.length == 9
      head[0, 5] + '...'
    else
      head
    end
    details = " size=#{size}"
    stamp = modify_time
    details += stamp ? ' modify_time=' + stamp.to_s.inspect : ''
    details += " data=#{snippet.inspect}"
    summary << details
  end
  summary + '>'
end
920
+
921
# Removes +child+ from this entry's children, so that it is gone once
# the tree is re-flattened and the dirents are re-created.
#
# If the child is a file its contents are truncated to zero length to
# free its blocks. Non-file children are only unlinked: #open raises
# Errno::EISDIR for them, so attempting to truncate one would fail
# after the child had already been removed. Blocks held by files
# further down a removed directory are not touched here.
#
# Raises ArgumentError when +child+ is not one of this entry's children.
def delete child
  unless @children.delete child
    raise ArgumentError, "#{child.inspect} not a child of #{self.inspect}"
  end
  # free our blocks
  child.open { |io| io.truncate 0 } if child.file?
end
927
+
928
# Copies +src+ onto +dst+: the name always, then either the data (for
# files) or, recursively, freshly created copies of every child (for
# everything else). May recurse deeply for big documents.
#
# Both entries must agree on being a file or not. The check is made
# in both directions and before anything is modified, so a mismatch
# never leaves +dst+ half-updated.
#
# Raises ArgumentError ('differing types') on such a mismatch.
def self.copy src, dst
  raise ArgumentError, 'differing types' if src.file? != dst.file?
  dst.name = src.name
  if src.dir?
    src.children.each do |src_child|
      dst_child = Dirent.new dst.ole, :type => src_child.type
      dst.children << dst_child
      Dirent.copy src_child, dst_child
    end
  else
    src.open do |src_io|
      dst.open { |dst_io| IO.copy src_io, dst_io }
    end
  end
end
945
+ end
946
+ end
947
+ end
948
+