ruby-ole 1.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +60 -0
- data/bin/oletool +35 -0
- data/lib/ole/base.rb +7 -0
- data/lib/ole/file_system.rb +181 -0
- data/lib/ole/io_helpers.rb +184 -0
- data/lib/ole/storage.rb +925 -0
- data/lib/ole/support.rb +51 -0
- data/lib/ole/types.rb +36 -0
- data/test/test_storage.rb +139 -0
- data/test/test_word_6.doc +0 -0
- data/test/test_word_95.doc +0 -0
- data/test/test_word_97.doc +0 -0
- metadata +62 -0
data/lib/ole/storage.rb
ADDED
@@ -0,0 +1,925 @@
|
|
1
|
+
#! /usr/bin/ruby -w
|
2
|
+
|
3
|
+
$: << File.dirname(__FILE__) + '/..'
|
4
|
+
|
5
|
+
require 'stringio'
|
6
|
+
require 'tempfile'
|
7
|
+
|
8
|
+
require 'ole/base'
|
9
|
+
require 'ole/types'
|
10
|
+
# not strictly ole related
|
11
|
+
require 'ole/io_helpers'
|
12
|
+
|
13
|
+
module Ole # :nodoc:
|
14
|
+
#
|
15
|
+
# = Introduction
|
16
|
+
#
|
17
|
+
# <tt>Ole::Storage</tt> is a class intended to abstract away details of the
|
18
|
+
# access to OLE2 structured storage files, such as those produced by
|
19
|
+
# Microsoft Office, eg *.doc, *.msg etc.
|
20
|
+
#
|
21
|
+
# Initially based on chicago's libole, source available at
|
22
|
+
# http://prdownloads.sf.net/chicago/ole.tgz
|
23
|
+
# Later augmented with some corrections by inspecting pole, and (purely
|
24
|
+
# for header definitions) gsf.
|
25
|
+
#
|
26
|
+
# = Usage
|
27
|
+
#
|
28
|
+
# Usage should be fairly straight forward:
|
29
|
+
#
|
30
|
+
# # get the parent ole storage object
|
31
|
+
# ole = Ole::Storage.open 'myfile.msg', 'r+'
|
32
|
+
# # => #<Ole::Storage io=#<File:myfile.msg> root=#<Dirent:"Root Entry">>
|
33
|
+
# # read some data
|
34
|
+
# ole.root[1].read 4
|
35
|
+
# # => "\001\000\376\377"
|
36
|
+
# # get the top level root object and output a tree structure for
|
37
|
+
# # debugging
|
38
|
+
# puts ole.root.to_tree
|
39
|
+
# # =>
|
40
|
+
# - #<Dirent:"Root Entry" size=3840 time="2006-11-03T00:52:53Z">
|
41
|
+
# |- #<Dirent:"__nameid_version1.0" size=0 time="2006-11-03T00:52:53Z">
|
42
|
+
# | |- #<Dirent:"__substg1.0_00020102" size=16 data="CCAGAAAAAADAAA...">
|
43
|
+
# ...
|
44
|
+
# |- #<Dirent:"__substg1.0_8002001E" size=4 data="MTEuMA==">
|
45
|
+
# |- #<Dirent:"__properties_version1.0" size=800 data="AAAAAAAAAAABAA...">
|
46
|
+
# \- #<Dirent:"__recip_version1.0_#00000000" size=0 time="2006-11-03T00:52:53Z">
|
47
|
+
# |- #<Dirent:"__substg1.0_0FF60102" size=4 data="AAAAAA==">
|
48
|
+
# ...
|
49
|
+
# # write some data, and finish up (note that open is 'r+', so this overwrites
|
50
|
+
# # but doesn't truncate)
|
51
|
+
# ole.root["\001CompObj"].open { |f| f.write "blah blah" }
|
52
|
+
# ole.close
|
53
|
+
#
|
54
|
+
# = TODO
|
55
|
+
#
|
56
|
+
# 1. tests. lock down how things work at the moment - mostly good.
|
57
|
+
# create from scratch works now, as does copying in a subtree of another doc, so
|
58
|
+
# ole embedded attachment serialization works now. i can save embedded xls in an msg
|
59
|
+
# into a separate file, and open it. this was a goal. now i would want to implement
|
60
|
+
# to_mime conversion for embedded attachments, that serializes them to ole, but handles
|
61
|
+
# some separately like various meta file types as plain .wmf attachments perhaps. this
|
62
|
+
# will give pretty good .eml's from emails with embedded attachments.
|
63
|
+
# the other todo is .rtf output, with full support for embedded ole objects...
|
64
|
+
# 2. lots of tidying up
|
65
|
+
# - main FIXME's in this regard are:
|
66
|
+
# * the custom header cruft for Header and Dirent needs some love.
|
67
|
+
# * i have a number of classes doing load/save combos: Header, AllocationTable, Dirent,
|
68
|
+
# and, in a manner of speaking, but arguably different, Storage itself.
|
69
|
+
# they have differing api's which would be nice to clean.
|
70
|
+
# AllocationTable::Big must be created aot now, as it is used for all subsequent reads.
|
71
|
+
# * ole types need work, can't serialize datetime at the moment.
|
72
|
+
# 3. need to fix META_BAT support in #flush.
|
73
|
+
#
|
74
|
+
class Storage
|
75
|
+
# gem version - mirrors the ruby-ole release this file ships in.
VERSION = '1.2.1'

# The top of the ole tree structure
attr_reader :root
# The tree structure in its original flattened form. only valid after #load, or #flush.
attr_reader :dirents
# The underlying io object to/from which the ole object is serialized, whether we
# should close it, and whether it is writeable
attr_reader :io, :close_parent, :writeable
# Low level internals, you probably shouldn't need to mess with these
attr_reader :header, :bbat, :sbat, :sb_file
87
|
+
# maybe include an option hash, and allow :close_parent => true, to be more general.
|
88
|
+
# +arg+ should be either a file, or an +IO+ object, and needs to be seekable.
|
89
|
+
# Create a Storage over +arg+, which is either a filename (String) or an
# already opened IO object. The io needs to be seekable.
# +mode+ may only be given together with a filename (defaults to 'rb');
# passing it with an io object raises.
def initialize arg, mode=nil
  # get the io object
  @close_parent, @io = if String === arg
    # we opened the file ourselves, so we are responsible for closing it later
    [true, open(arg, mode || 'rb')]
  else
    raise 'unable to specify mode string with io object' if mode
    [false, arg]
  end
  # do we have this file opened for writing? don't know of a better way to tell
  # (#flush raises IOError on an io opened read-only)
  @writeable = begin
    @io.flush
    true
  rescue IOError
    false
  end
  # silence undefined warning in clear
  @sb_file = nil
  # if the io object has data, we should load it, otherwise start afresh
  @io.size > 0 ? load : clear
end
|
109
|
+
|
110
|
+
# Constructor with optional block form. When a block is given the storage is
# yielded and guaranteed to be closed (flushing if writeable) when the block
# returns; the block's value is returned. Without a block, returns the storage.
def self.new arg, mode=nil
  ole = super
  return ole unless block_given?
  begin
    yield ole
  ensure
    ole.close
  end
end
|
119
|
+
|
120
|
+
class << self
  # encouraged - reads like File.open, and supports the block form
  alias open :new
  # deprecated - kept for backwards compatibility only
  alias load :new
end
|
126
|
+
|
127
|
+
# load document from file.
|
128
|
+
# load document from file. Parses the header, bootstraps the big block
# allocation table from the header + meta bat, deserializes the flat dirent
# list and rebuilds it into a tree rooted at @root, then sets up the small
# block machinery (@sb_file / @sbat).
def load
  # we always read 512 for the header block. if the block size ends up being different,
  # what happens to the 109 fat entries. are there more/less entries?
  @io.rewind
  header_block = @io.read 512
  @header = Header.load header_block

  # create an empty bbat
  @bbat = AllocationTable::Big.new self
  # extra mbat blocks
  mbat_blocks = (0...@header.num_mbat).map { |i| i + @header.mbat_start }
  # the bbat chain is the 109 entries in the header block, plus the meta bat blocks
  bbat_chain = (header_block[Header::SIZE..-1] + @bbat.read(mbat_blocks)).unpack 'L*'
  # am i using num_bat in the right way?
  @bbat.load @bbat.read(bbat_chain[0, @header.num_bat])

  # get block chain for directories, read it, then split it into chunks and load the
  # directory entries. semantics changed - used to cut at first dir where dir.type == 0
  @dirents = @bbat.read(@header.dirent_start).scan(/.{#{Dirent::SIZE}}/mo).
    map { |str| Dirent.load self, str }.reject { |d| d.type_id == 0 }

  # now reorder from flat into a tree
  # links are stored in some kind of balanced binary tree
  # check that everything is visited at least, and at most once
  # similarly with the blocks of the file.
  # was thinking of moving this to Dirent.to_tree instead.
  # (singleton method on just this array - does an in-order walk of the
  # prev/next/child links, marking each dirent with its flat index)
  class << @dirents
    def to_tree idx=0
      return [] if idx == Dirent::EOT
      d = self[idx]
      d.children = to_tree d.child
      raise "directory #{d.inspect} used twice" if d.idx
      d.idx = idx
      to_tree(d.prev) + [d] + to_tree(d.next)
    end
  end

  @root = @dirents.to_tree.first
  Log.warn "root name was #{@root.name.inspect}" unless @root.name == 'Root Entry'
  # any dirent the walk never reached still has a nil idx
  unused = @dirents.reject(&:idx).length
  Log.warn "* #{unused} unused directories" if unused > 0

  # FIXME i don't currently use @header.num_sbat which i should
  # hmm. nor do i write it. it means what exactly again?
  # the small block data lives in the root dirent's own big-block stream
  @sb_file = RangesIOResizeable.new @bbat, @root.first_block, @root.size
  @sbat = AllocationTable::Small.new self
  @sbat.load @bbat.read(@header.sbat_start)
end
|
175
|
+
|
176
|
+
# Finish with the storage: flush pending changes if opened writeable, close
# the small block backing stream, and close the underlying io only if we
# opened it ourselves (see #initialize / @close_parent).
def close
  flush if @writeable
  @sb_file.close
  @io.close if @close_parent
end
|
181
|
+
|
182
|
+
# should have a #open_dirent i think. and use it in load and flush. neater.
|
183
|
+
# also was thinking about Dirent#open_padding. then i can more easily clean up the padding
|
184
|
+
# to be 0.chr
|
185
|
+
=begin
|
186
|
+
thoughts on fixes:
|
187
|
+
1. reterminate any chain not ending in EOC.
|
188
|
+
2. pass through all chain heads looking for collisions, and making sure nothing points to them
|
189
|
+
(ie they are really heads).
|
190
|
+
3. we know the locations of the bbat data, and mbat data. ensure that there are placeholder blocks
|
191
|
+
in the bat for them.
|
192
|
+
this stuff will ensure reliability of input better. otherwise, its actually worth doing a repack
|
193
|
+
directly after read, to ensure the above is probably accounted for, before subsequent writes possibly
|
194
|
+
destroy things.
|
195
|
+
=end
|
196
|
+
# Serialize the in-memory tree back to @io: rewrites the dirent stream, the
# small block allocation table, then the big block allocation table (plus
# its meta-bat bookkeeping), and finally seeks back and rewrites the header.
# Called automatically by #close when the storage is writeable.
def flush
  # recreate dirs from our tree, split into dirs and big and small files
  @root.type = :root
  @root.name = 'Root Entry'
  @root.first_block = @sb_file.first_block
  @root.size = @sb_file.size
  @dirents = @root.flatten

  # maybe i should move the block form up to RangesIO, and get it for free at all levels.
  # Dirent#open gets block form for free then
  io = RangesIOResizeable.new @bbat, @header.dirent_start
  io.truncate 0
  @dirents.each { |dirent| io.write dirent.save }
  # pad the dirent stream out to a whole number of big blocks
  padding = (io.size / @bbat.block_size.to_f).ceil * @bbat.block_size - io.size
  io.write 0.chr * padding
  @header.dirent_start = io.first_block
  io.close

  # similarly for the sbat data.
  io = RangesIOResizeable.new @bbat, @header.sbat_start
  io.truncate 0
  io.write @sbat.save
  @header.sbat_start = io.first_block
  @header.num_sbat = @bbat.chain(@header.sbat_start).length
  io.close

  # what follows will be slightly more complex for the bat fiddling.

  # create RangesIOResizeable hooked up to the bbat. use that to claim bbat blocks using
  # truncate. then when its time to write, convert that chain and some chunk of blocks at
  # the end, into META_BAT blocks. write out the chain, and those meta bat blocks, and its
  # done.
  # first forget where the old bat/meta-bat blocks were - they are reallocated below
  @bbat.table.map! do |b|
    b == AllocationTable::BAT || b == AllocationTable::META_BAT ?
      AllocationTable::AVAIL : b
  end
  io = RangesIOResizeable.new @bbat, AllocationTable::EOC

  # use crappy loop for now:
  # (growing the io can itself grow the bbat, so iterate until the claimed
  # space is big enough to hold the serialized bbat)
  while true
    bbat_data = @bbat.save
    #mbat_data = bbat_data.length / @bbat.block_size * 4
    mbat_chain = @bbat.chain io.first_block
    raise NotImplementedError, "don't handle writing out extra META_BAT blocks yet" if mbat_chain.length > 109
    # so we can ignore meta blocks in this calculation:
    break if io.size >= bbat_data.length # it shouldn't be bigger right?
    # this may grow the bbat, depending on existing available blocks
    io.truncate bbat_data.length
  end

  # now extract the info we want:
  ranges = io.ranges
  mbat_chain = @bbat.chain io.first_block
  io.close
  # mark the blocks we just claimed as BAT blocks, not file data
  mbat_chain.each { |b| @bbat.table[b] = AllocationTable::BAT }
  @header.num_bat = mbat_chain.length
  #p @bbat.truncated_table
  #p ranges
  #p mbat_chain
  # not resizeable! (the bat must not move around while we are writing it out)
  io = RangesIO.new @io, ranges
  io.write @bbat.save
  io.close
  # pad the header's bat list out to its full 109 slots
  mbat_chain += [AllocationTable::AVAIL] * (109 - mbat_chain.length)
  @header.mbat_start = AllocationTable::EOC
  @header.num_mbat = 0

=begin
  # Old save code. remove shortly

  bbat_data = new_bbat.save
  # must exist as linear chain stored in header.
  @header.num_bat = (bbat_data.length / new_bbat.block_size.to_f).ceil
  base = io.pos / new_bbat.block_size - 1
  io.write bbat_data
  # now that spanned a number of blocks:
  mbat = (0...@header.num_bat).map { |i| i + base }
  mbat += [AllocationTable::AVAIL] * (109 - mbat.length) if mbat.length < 109
  header_mbat = mbat[0...109]
  other_mbat_data = mbat[109..-1].pack 'L*'
  @header.mbat_start = base + @header.num_bat
  @header.num_mbat = (other_mbat_data.length / new_bbat.block_size.to_f).ceil
  io.write other_mbat_data
=end

  @root.type = :dir

  # now seek back and write the header out
  @io.seek 0
  @io.write @header.save + mbat_chain.pack('L*')
  @io.flush
end
|
288
|
+
|
289
|
+
# Reset this storage to the equivalent of a freshly created, empty ole
# document: new header, empty allocation tables, and a bare 'Root Entry'
# dirent. Truncates the underlying io.
def clear
  # initialize to equivalent of loading an empty ole document.
  Log.warn 'creating new ole storage object on non-writable io' unless @writeable
  @header = Header.new
  @bbat = AllocationTable::Big.new self
  @root = Dirent.new self, :dir
  @root.name = 'Root Entry'
  @dirents = [@root]
  @root.idx = 0
  @root.children = []
  # size shouldn't display for non-files
  @root.size = 0
  # replace any previous small-block backing stream with an empty one
  @sb_file.close if @sb_file
  @sb_file = RangesIOResizeable.new @bbat, AllocationTable::EOC
  @sbat = AllocationTable::Small.new self
  # throw everything else the hell away
  @io.truncate 0
end
|
307
|
+
|
308
|
+
# could be useful with mis-behaving ole documents. or to just clean them up.
|
309
|
+
# Rewrite the document via a temporary copy, garbage collecting unused
# blocks in the process. could be useful with mis-behaving ole documents,
# or to just clean them up. +temp+ selects the scratch backing: :file
# (a Tempfile) or :mem (a StringIO).
def repack temp=:file
  if temp == :file
    Tempfile.open 'w+', &method(:repack_using_io)
  elsif temp == :mem
    StringIO.open(&method(:repack_using_io))
  else
    raise "unknown temp backing #{temp.inspect}"
  end
end
|
316
|
+
|
317
|
+
# Helper for #repack: copy the current document byte-for-byte into +temp_io+,
# wipe this storage with #clear, then deep-copy the tree back out of the
# temporary copy - effectively defragmenting the file.
def repack_using_io temp_io
  @io.rewind
  # NOTE(review): IO.copy appears to be the helper from ole/io_helpers, not a
  # stdlib method - confirm (stdlib equivalent would be IO.copy_stream)
  IO.copy @io, temp_io
  clear
  Storage.open temp_io do |temp_ole|
    # root is temporarily flagged :dir so Dirent.copy treats it uniformly
    temp_ole.root.type = :dir
    Dirent.copy temp_ole.root, root
  end
end
|
326
|
+
|
327
|
+
# Pick the allocation table appropriate for a stream of +size+ bytes:
# streams at or above the header threshold live in big blocks, smaller
# ones in the small block stream.
def bat_for_size size
  # note >=, not > previously.
  if size >= @header.threshold
    @bbat
  else
    @sbat
  end
end
|
331
|
+
|
332
|
+
def inspect
|
333
|
+
"#<#{self.class} io=#{@io.inspect} root=#{@root.inspect}>"
|
334
|
+
end
|
335
|
+
|
336
|
+
# A class which wraps the ole header
|
337
|
+
# A class which wraps the ole header - the first 0x4c bytes of the file,
# holding the magic signature, block size shifts, and the start blocks /
# counts for the bat, dirent, sbat and meta-bat chains.
class Header < Struct.new(
    :magic, :clsid, :minor_ver, :major_ver, :byte_order, :b_shift, :s_shift,
    :reserved, :csectdir, :num_bat, :dirent_start, :transacting_signature, :threshold,
    :sbat_start, :num_sbat, :mbat_start, :num_mbat
  )
  # pack format matching the MEMBERS above (little endian throughout)
  PACK = 'a8 a16 S2 a2 S2 a6 L3 a4 L5'
  SIZE = 0x4c
  # i have seen it pointed out that the first 4 bytes of hex,
  # 0xd0cf11e0, is supposed to spell out docfile. hmmm :)
  # NOTE(review): on ruby >= 1.9 this literal's encoding must agree with the
  # binary data read from the io for the comparison below - confirm.
  MAGIC = "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" # expected value of Header#magic
  # what you get if creating new header from scratch.
  # AllocationTable::EOC isn't available yet. meh.
  EOC = 0xfffffffe
  # default field values for a brand new header (512 byte big blocks,
  # 64 byte small blocks, 4096 byte big/small threshold)
  DEFAULT = [
    MAGIC, 0.chr * 16, 59, 3, "\xfe\xff", 9, 6,
    0.chr * 6, 0, 1, EOC, 0.chr * 4,
    4096, EOC, 0, EOC, 0
  ]

  # 2 basic initializations, from scratch, or from a data string.
  # from scratch will be geared towards creating a new ole object
  def initialize *values
    super(*(values.empty? ? DEFAULT : values))
    validate!
  end

  # Parse a header from the leading bytes of +str+ (the 512 byte header
  # block - the bytes past SIZE hold the first 109 bat entries instead).
  def self.load str
    Header.new(*str.unpack(PACK))
  end

  # Serialize back to the on-disk SIZE byte representation.
  def save
    to_a.pack PACK
  end

  # Sanity-check the header. Hard problems (bad magic, impossible bat/shift
  # combinations, big endian byte order) raise; soft oddities only warn.
  def validate!
    raise "OLE2 signature is invalid" unless magic == MAGIC
    if num_bat == 0 or # is that valid for a completely empty file?
        # not sure about this one. basically to do max possible bat given size of mbat
        num_bat > 109 && num_bat > 109 + num_mbat * (1 << b_shift - 2) or
        # shouldn't need to use the mbat as there is enough space in the header block
        num_bat < 109 && num_mbat != 0 or
        # given the size of the header is 76, if b_shift <= 6, blocks address the header.
        s_shift > b_shift or b_shift <= 6 or b_shift >= 31 or
        # we only handle little endian
        byte_order != "\xfe\xff"
      raise "not valid OLE2 structured storage file"
    end
    # relaxed this, due to test-msg/qwerty_[1-3]*.msg they all had
    # 3 for this value.
    # transacting_signature != "\x00" * 4 or
    if threshold != 4096 or
        num_mbat == 0 && mbat_start != AllocationTable::EOC or
        reserved != "\x00" * 6
      Log.warn "may not be a valid OLE2 structured storage file"
    end
    true
  end
end
|
395
|
+
|
396
|
+
#
|
397
|
+
# +AllocationTable+'s hold the chains corresponding to files. Given
|
398
|
+
# an initial index, <tt>AllocationTable#chain</tt> follows the chain, returning
|
399
|
+
# the blocks that make up that file.
|
400
|
+
#
|
401
|
+
# There are 2 allocation tables, the bbat, and sbat, for big and small
|
402
|
+
# blocks respectively. The block chain should be loaded using either
|
403
|
+
# <tt>Storage#read_big_blocks</tt> or <tt>Storage#read_small_blocks</tt>
|
404
|
+
# as appropriate.
|
405
|
+
#
|
406
|
+
# Whether or not big or small blocks are used for a file depends on
|
407
|
+
# whether its size is over the <tt>Header#threshold</tt> level.
|
408
|
+
#
|
409
|
+
# An <tt>Ole::Storage</tt> document is serialized as a series of directory objects,
|
410
|
+
# which are stored in blocks throughout the file. The blocks are either
|
411
|
+
# big or small, and are accessed using the <tt>AllocationTable</tt>.
|
412
|
+
#
|
413
|
+
# The bbat allocation table's data is stored in the spare room in the header
|
414
|
+
# block, and in extra blocks throughout the file as referenced by the meta
|
415
|
+
# bat. That chain is linear, as there is no higher level table.
|
416
|
+
#
|
417
|
+
#
# +AllocationTable+'s hold the chains corresponding to files. Given
# an initial index, <tt>AllocationTable#chain</tt> follows the chain, returning
# the blocks that make up that file.
#
# There are 2 allocation tables, the bbat, and sbat, for big and small
# blocks respectively. Whether big or small blocks are used for a file
# depends on whether its size is over the <tt>Header#threshold</tt> level.
#
# The bbat allocation table's data is stored in the spare room in the header
# block, and in extra blocks throughout the file as referenced by the meta
# bat. That chain is linear, as there is no higher level table.
#
class AllocationTable
  # a free block (I don't currently leave any blocks free), although I do pad out
  # the allocation table with AVAIL to the block size.
  AVAIL = 0xffffffff
  EOC = 0xfffffffe # end of a chain
  # these blocks correspond to the bat, and aren't part of a file, nor available.
  # (I don't currently output these)
  BAT = 0xfffffffd
  META_BAT = 0xfffffffc

  attr_reader :ole, :io, :table, :block_size

  # +ole+ is the owning Storage. subclasses (Big / Small) are responsible
  # for filling in @io and @block_size.
  def initialize ole
    @ole = ole
    @table = []
  end

  # replace the table with the little-endian uint32 entries unpacked from +data+.
  def load data
    @table = data.unpack('L*')
  end

  # the table with trailing AVAIL entries stripped - the part that is worth
  # serializing (modulo the padding added back in #save).
  def truncated_table
    # this strips trailing AVAILs. come to think of it, this has the potential to break
    # bogus ole. if you terminate using AVAIL instead of EOC, like I did before. but that is
    # very broken. however, if a chain ends with AVAIL, it should probably be fixed to EOC
    # at load time.
    temp = @table.reverse
    not_avail = temp.find { |b| b != AVAIL } and temp = temp[temp.index(not_avail)..-1]
    temp.reverse
  end

  # serialize the table, padded out with AVAIL to a whole number of big blocks.
  def save
    table = truncated_table
    # pad it out some
    num = @ole.bbat.block_size / 4
    # do you really use AVAIL? they probably extend past end of file, and may shortly
    # be used for the bat. not really good.
    table += [AVAIL] * (num - (table.length % num)) if (table.length % num) != 0
    table.pack 'L*'
  end

  # follow the chain starting at block +start+, returning the array of block
  # indexes it consists of. iterative on purpose - the recursive version
  # blew the stack on a large attachment.
  def chain start
    a = []
    idx = start
    until idx >= META_BAT
      # idx == @table.length is out of bounds too: @table[idx] would be nil,
      # crashing the loop condition with NoMethodError instead of this
      # diagnostic. (fixes an off-by-one - was `idx > @table.length`)
      raise "broken allocationtable chain" if idx < 0 || idx >= @table.length
      a << idx
      idx = @table[idx]
    end
    Log.warn "invalid chain terminator #{idx}" unless idx == EOC
    a
  end

  # convert +chain+ (either a head block index, or an array from #chain)
  # into io ranges, optionally truncated to +size+ bytes.
  def ranges chain, size=nil
    chain = self.chain(chain) unless Array === chain
    blocks_to_ranges chain, size
  end

  # Turn a chain (an array given by +chain+) of blocks, optionally
  # truncated to +size+, into an array of [offset, length] pairs describing
  # the stretches of bytes in the backing io that the chain occupies.
  def blocks_to_ranges chain, size=nil
    # truncate the chain if required
    chain = chain[0...(size.to_f / block_size).ceil] if size
    # convert chain to ranges of the block size
    ranges = chain.map { |i| [block_size * i, block_size] }
    # truncate final range if required
    ranges.last[1] -= (ranges.length * block_size - size) if ranges.last and size
    ranges
  end

  # quick shortcut. chain can be either a head (in which case the table is used to
  # turn it into a chain), or a chain. it is converted to ranges, then to rangesio.
  # its not resizeable or migrateable - see RangesIOResizeable for that.
  def open chain, size=nil
    io = RangesIO.new @io, ranges(chain, size)
    if block_given?
      begin yield io
      ensure; io.close
      end
    else io
    end
  end

  # read the whole chain (or the first +size+ bytes of it) in one go.
  def read chain, size=nil
    open chain, size, &:read
  end

  # ----------------------

  # index of the first AVAIL entry, extending the table by one if there is none.
  def get_free_block
    @table.each_index { |i| return i if @table[i] == AVAIL }
    @table.push AVAIL
    @table.length - 1
  end

  # grow or shrink the chain starting at +first_block+ so it can hold +size+
  # bytes. must return the (possibly new) first_block.
  def resize_chain first_block, size
    new_num_blocks = (size / block_size.to_f).ceil
    blocks = chain first_block
    old_num_blocks = blocks.length
    if new_num_blocks < old_num_blocks
      # de-allocate some of our old blocks. TODO maybe zero them out in the file???
      (new_num_blocks...old_num_blocks).each { |i| @table[blocks[i]] = AVAIL }
      # if we have a chain, terminate it and return head, otherwise return EOC
      if new_num_blocks > 0
        @table[blocks[new_num_blocks-1]] = EOC
        first_block
      else EOC
      end
    elsif new_num_blocks > old_num_blocks
      # need some more blocks.
      last_block = blocks.last
      (new_num_blocks - old_num_blocks).times do
        block = get_free_block
        # connect the chain. handle corner case of blocks being [] initially
        if last_block
          @table[last_block] = block
        else
          first_block = block
        end
        last_block = block
        # this is just to inhibit the problem where it gets picked as being a free block
        # again next time around.
        @table[last_block] = EOC
      end
      first_block
    else first_block
    end
  end

  # big blocks - backed directly by the parent file, sized by Header#b_shift.
  class Big < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.b_shift
      @io = @ole.io
    end

    # Big blocks are kind of -1 based, in order to not clash with the header.
    def blocks_to_ranges blocks, size
      super blocks.map { |b| b + 1 }, size
    end
  end

  # small blocks - backed by the root dirent's stream (@ole.sb_file), sized
  # by Header#s_shift.
  class Small < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.s_shift
      @io = @ole.sb_file
    end
  end
end
|
575
|
+
|
576
|
+
# like normal RangesIO, but Ole::Storage specific. the ranges are backed by an
|
577
|
+
# AllocationTable, and can be resized. used for read/write to 2 streams:
|
578
|
+
# 1. serialized dirent data
|
579
|
+
# 2. sbat table data
|
580
|
+
# 3. all dirents but through RangesIOMigrateable below
|
581
|
+
#
|
582
|
+
# Note that all internal access to first_block is through accessors, as it is sometimes
|
583
|
+
# useful to redirect it.
|
584
|
+
# like normal RangesIO, but Ole::Storage specific. the ranges are backed by an
# AllocationTable, and can be resized. used for read/write to 2 streams:
# 1. serialized dirent data
# 2. sbat table data
# 3. all dirents but through RangesIOMigrateable below
#
# Note that all internal access to first_block is through accessors, as it is sometimes
# useful to redirect it.
class RangesIOResizeable < RangesIO
  attr_reader :bat
  attr_accessor :first_block

  # +bat+ supplies both the backing io and the chain arithmetic for the
  # chain starting at +first_block+ (optionally truncated to +size+ bytes).
  def initialize bat, first_block, size=nil
    @bat = bat
    self.first_block = first_block
    super @bat.io, @bat.ranges(first_block, size)
  end

  # resize the backing chain (and hence our ranges) to +size+ bytes.
  def truncate size
    # note that old_blocks is != @ranges.length necessarily. i'm planning to write a
    # merge_ranges function that merges sequential ranges into one as an optimization.
    self.first_block = @bat.resize_chain first_block, size
    @ranges = @bat.ranges first_block, size
    # clamp the cursor into the new extent when shrinking.
    # (fix: was `@pos = @size if @pos > size`, which reset the cursor to the
    # *old* end of stream rather than the new one)
    @pos = size if @pos > size

    # don't know if this is required, but we explicitly request our @io to grow if necessary
    # we never shrink it though. maybe this belongs in allocationtable, where smarter decisions
    # can be made.
    # maybe its ok to just seek out there later??
    max = @ranges.map { |pos, len| pos + len }.max || 0
    @io.truncate max if max > @io.size

    @size = size
  end
end
|
610
|
+
|
611
|
+
# like RangesIOResizeable, but Ole::Storage::Dirent specific. provides for migration
|
612
|
+
# between bats based on size, and updating the dirent, instead of the ole copy back
|
613
|
+
# on close.
|
614
|
+
# like RangesIOResizeable, but Ole::Storage::Dirent specific. provides for migration
# between bats based on size, and updating the dirent, instead of the ole copy back
# on close.
class RangesIOMigrateable < RangesIOResizeable
  attr_reader :dirent
  # wrap +dirent+'s stream, picking the initial bat (big or small) from the
  # stream's current size.
  def initialize dirent
    @dirent = dirent
    super @dirent.ole.bat_for_size(@dirent.size), @dirent.first_block, @dirent.size
  end

  # resize to +size+ bytes, migrating the data between the big and small
  # bats if the new size crosses the header threshold. keeps @dirent.size
  # up to date.
  def truncate size
    bat = @dirent.ole.bat_for_size size
    if bat != @bat
      # bat migration needed! we need to backup some data. the amount of data
      # should be <= @ole.header.threshold, so we can just hold it all in one buffer.
      # backup this
      pos = @pos
      @pos = 0
      keep = read [@size, size].min
      # this does a normal truncate to 0, removing our presence from the old bat, and
      # rewrite the dirent's first_block
      super 0
      @bat = bat
      # just change the underlying io from right under everyone :)
      @io = bat.io
      # important to do this now, before the write. as the below write will always
      # migrate us back to sbat! this will now allocate us +size+ in the new bat.
      super
      # replay the saved data into the new bat, then restore the cursor
      @pos = 0
      write keep
      @pos = pos
    else
      super
    end
    # now just update the file
    @dirent.size = size
  end

  # forward this to the dirent
  def first_block
    @dirent.first_block
  end

  def first_block= val
    @dirent.first_block = val
  end
end
|
658
|
+
|
659
|
+
#
|
660
|
+
# A class which wraps an ole directory entry. Can be either a directory
|
661
|
+
# (<tt>Dirent#dir?</tt>) or a file (<tt>Dirent#file?</tt>)
|
662
|
+
#
|
663
|
+
# Most interaction with <tt>Ole::Storage</tt> is through this class.
|
664
|
+
# The 2 most important functions are <tt>Dirent#children</tt>, and
|
665
|
+
# <tt>Dirent#data</tt>.
|
666
|
+
#
|
667
|
+
# was considering separate classes for dirs and files. some methods/attrs only
|
668
|
+
# applicable to one or the other.
|
669
|
+
#
|
670
|
+
# Note that Dirent is still using a home grown Struct variant, with explicit
|
671
|
+
# MEMBERS etc. any reason for that still?
|
672
|
+
#
|
673
|
+
class Dirent
|
674
|
+
# the on-disk field layout of one 128 byte directory entry, in PACK order.
MEMBERS = [
  :name_utf16, :name_len, :type_id, :colour, :prev, :next, :child,
  :clsid, :flags, # dirs only
  :create_time_str, :modify_time_str, # files only
  :first_block, :size, :reserved
]
PACK = 'a64 S C C L3 a16 L a8 a8 L2 a4'
# byte length of one serialized dirent
SIZE = 128
# maps the on-disk type_id byte to a symbolic type
TYPE_MAP = {
  # this is temporary
  0 => :empty,
  1 => :dir,
  2 => :file,
  5 => :root
}
# node colour within the on-disk sibling tree (prev/next links)
COLOUR_MAP = {
  0 => :red,
  1 => :black
}
# used in the next / prev / child stuff to show that the tree ends here.
# also used for first_block for directory.
EOT = 0xffffffff
|
696
|
+
|
697
|
+
include Enumerable
|
698
|
+
|
699
|
+
# Dirent's should be created in 1 of 2 ways, either Dirent.new ole, [:dir/:file/:root],
|
700
|
+
# or Dirent.load '... dirent data ...'
|
701
|
+
# its a bit clunky, but thats how it is at the moment. you can assign to type, but
|
702
|
+
# shouldn't.
|
703
|
+
|
704
|
+
# index of this dirent in the flattened on-disk ordering; set by the tree
# walk in Storage#load and by #flatten.
attr_accessor :idx
# This returns all the children of this +Dirent+. It is filled in
# when the tree structure is recreated.
attr_accessor :children
attr_reader :ole, :type, :create_time, :modify_time, :name
# Create a fresh dirent of +type+ (:dir, :file or :root) belonging to +ole+.
def initialize ole, type
  @ole = ole
  # this isn't really good enough. need default values put in there.
  # placeholder serialized values in MEMBERS order; name/type etc get
  # properly rewritten on save.
  @values = [
    0.chr * 2, 2, 0, # will get overwritten
    1, EOT, EOT, EOT,
    0.chr * 16, 0, nil, nil,
    AllocationTable::EOC, 0, 0.chr * 4]
  # maybe check types here.
  @type = type
  @create_time = @modify_time = nil
  @children = []
  # only file streams carry timestamps
  if file?
    @create_time = Time.now
    @modify_time = Time.now
  end
end
726
|
+
|
727
|
+
# Deserialize a Dirent belonging to +ole+ from its 128 byte on-disk form,
# without running the normal constructor.
def self.load ole, str
  # load should function without the need for the initializer.
  dirent = allocate
  dirent.load ole, str
  dirent
end
|
733
|
+
|
734
|
+
# Populate this dirent's fields from the 128 byte serialized form +str+
# (see PACK / MEMBERS), belonging to storage +ole+.
def load ole, str
  @ole = ole
  @values = str.unpack PACK
  # the name is stored UTF-16 on disk; strip the trailing null pair before
  # converting
  @name = Types::FROM_UTF16.iconv name_utf16[0...name_len].sub(/\x00\x00$/, '')
  @type = TYPE_MAP[type_id] or raise "unknown type #{type_id.inspect}"
  # only file streams carry meaningful timestamps
  if file?
    @create_time = Types.load_time create_time_str
    @modify_time = Types.load_time modify_time_str
  end
end
|
744
|
+
|
745
|
+
# only defined for files really. and the above children stuff is only for children.
|
746
|
+
# maybe i should have some sort of File and Dir class, that subclass Dirents? a dirent
|
747
|
+
# is just a data holder.
|
748
|
+
# this can be used for write support if the underlying io object was opened for writing.
|
749
|
+
# maybe take a mode string argument, and do truncation, append etc stuff.
|
750
|
+
# Get an io object (RangesIOMigrateable) over this stream's data, or nil
# for directories. With a block, the io is yielded and always closed when
# the block returns; the block's value is returned.
# this can be used for write support if the underlying io object was opened
# for writing.
def open
  return nil unless file?
  stream = RangesIOMigrateable.new self
  return stream unless block_given?
  begin
    yield stream
  ensure
    stream.close
  end
end
|
760
|
+
|
761
|
+
# Read at most +limit+ bytes (everything when nil) from this stream.
# Returns nil for directories (see #open).
def read limit=nil
  open { |stream| stream.read limit }
end
|
764
|
+
|
765
|
+
# True for anything that is not a file stream - counts :root as a dir too.
def dir?
  :file != type
end
|
769
|
+
|
770
|
+
# True when this dirent represents a stream of data rather than a directory.
def file?
  :file == type
end
|
773
|
+
|
774
|
+
# Returns this dirent's timestamp: creation time if present, otherwise
# modification time. Both are only set for file dirents (see #load), so
# this is nil for directories.
def time
	# NOTE(review): the original comment here ("time is nil for streams")
	# looks stale - #load only parses the time pairs for file dirents.
	#@time ||= file? ? nil : (Dirent.parse_time(secs1, days1) || Dirent.parse_time(secs2, days2))
	create_time || modify_time
end
|
780
|
+
|
781
|
+
# Yields each immediate child dirent in insertion order.
def each(&blk)
	@children.each(&blk)
end
|
784
|
+
|
785
|
+
# Child lookup. An Integer +idx+ is a positional index into #children;
# anything else (String, Regexp, ...) is matched against child names via
# case equality. Returns the first match, or nil.
# NOTE(review): multiple children with the same name are possible; only
# the first hit is returned, silently.
def [] idx
	if idx.is_a? Integer
		children[idx]
	else
		# not sure if '/' is a valid char in a Dirent#name, so no path
		# splitting is done at this level - see #/ and the filesystem add-on.
		children.find { |child| idx === child.name }
	end
end
|
795
|
+
|
796
|
+
# solution for the above '/' thing for now.
|
797
|
+
# Path-separator style lookup, eg <tt>root / 'WordDocument'</tt>.
# Currently just an alias for #[] with a non-Integer key.
def / path
	self[path]
end
|
800
|
+
|
801
|
+
# Renders this dirent and its descendants as an ascii tree, one dirent
# per line, for debugging (see the example in the file header).
def to_tree
	if children and !children.empty?
		str = "- #{inspect}\n"
		children.each_with_index do |child, i|
			# the last child's first line gets a "\" corner; earlier
			# children keep a "|" rail running down their subtree
			last = i == children.length - 1
			child.to_tree.split(/\n/).each_with_index do |line, j|
				str << " #{last ? (j == 0 ? "\\" : ' ') : '|'}#{line}\n"
			end
		end
		str
	else "- #{inspect}\n"
	end
end
|
814
|
+
|
815
|
+
# Generate a reader and a writer for every on-disk field, each backed by
# the corresponding slot of the unpacked @values array.
MEMBERS.each_with_index do |field, slot|
	define_method(field) { @values[slot] }
	define_method("#{field}=") { |val| @values[slot] = val }
end
|
819
|
+
|
820
|
+
# The raw unpacked field values, in PACK order. Note this is the live
# array, not a copy - mutating it mutates the dirent.
def to_a
	@values
end
|
823
|
+
|
824
|
+
# Flattens the tree rooted here into +dirents+, assigning each dirent
# its index in that flat table as it goes. Note it modifies its argument.
def flatten dirents=[]
	# record our own slot in the flat table first (pre-order)
	@idx = dirents.size
	dirents.push self
	children.each do |kid|
		kid.flatten dirents
	end
	# link our children into the balanced lookup tree the format expects
	self.child = Dirent.flatten_helper children
	dirents
end
|
832
|
+
|
833
|
+
# Links +children+ (already assigned their flat #idx values) into a
# balanced binary search tree via their prev/next fields, returning the
# index of the subtree's root, or EOT for an empty list.
# A truly optimal tree would need name-based ordering, but any valid
# shape is accepted by readers - gsf, for instance, only ever emits a
# singly linked list with prev always EOT.
def self.flatten_helper children
	return EOT if children.empty?
	# pick the middle child as the root of this subtree
	mid = children.length / 2
	node = children[mid]
	node.prev = flatten_helper children[0...mid]
	node.next = flatten_helper children[mid + 1..-1]
	node.idx
end
|
845
|
+
|
846
|
+
attr_accessor :name, :type
# Serializes this dirent back into its fixed-width on-disk record,
# updating the packed fields from @name / @type first.
# Raises if @type does not map to a known numeric type id.
def save
	# encode the name as UTF-16, clamp to the 62-byte field limit, and
	# append the 2-byte null terminator; name_len includes the terminator
	tmp = Types::TO_UTF16.iconv(name)
	tmp = tmp[0, 62] if tmp.length > 62
	tmp += 0.chr * 2
	self.name_len = tmp.length
	# pad the name out to its fixed 64 byte field width
	self.name_utf16 = tmp + 0.chr * (64 - tmp.length)
	# map the symbolic type back to its numeric id. explicit nil check
	# instead of the old bare rescue around nil.first.
	pair = TYPE_MAP.to_a.find { |id, name| @type == name }
	raise "unknown type #{type.inspect}" unless pair
	self.type_id = pair.first
	# for the case of files, it is assumed that first_block was handled
	# already. note not dir?, so as not to override root's first_block.
	self.first_block = Dirent::EOT if type == :dir
	# BUG FIX: this used to be guarded by "if 0" with the zeroing in the
	# else branch - but 0 is truthy in Ruby, so the time fields were never
	# written, and freshly-created dirents (whose slots default to nil)
	# would blow up in pack. Zero them unconditionally for now.
	# TODO: serialize @create_time / @modify_time for file dirents.
	self.create_time_str = 0.chr * 8
	self.modify_time_str = 0.chr * 8
	@values.pack PACK
end
|
870
|
+
|
871
|
+
# Debugging summary: always shows the name; for files also shows the
# size, timestamp (when set) and a short preview of the stream data.
def inspect
	repr = "#<Dirent:#{name.inspect}"
	if file?
		# preview the first few bytes, with an ellipsis when truncated
		peek = read 9
		snippet = peek.length == 9 ? peek[0, 5] + '...' : peek
		repr << " size=#{size}" <<
			"#{time ? ' time=' + time.to_s.inspect : nil}" <<
			" data=#{snippet.inspect}"
	else
		# there is some dir specific stuff. like clsid, flags.
	end
	repr << '>'
end
|
885
|
+
|
886
|
+
# --------
|
887
|
+
# and for creation of a dirent. don't like the name. is it a file or a directory?
|
888
|
+
# assign to type later? io will be empty.
|
889
|
+
# Creates a fresh, empty child dirent of the given +type+ under self,
# yields it if a block is given, and returns it.
def new_child type
	dirent = Dirent.new(ole, type)
	children.push dirent
	yield dirent if block_given?
	dirent
end
|
895
|
+
|
896
|
+
# Removes +child+ from this dirent and frees its allocated blocks.
# Raises when +child+ is not actually one of our children.
def delete child
	# drop it from @children so a later reflatten / @dirents rebuild
	# no longer sees it
	unless @children.delete child
		raise "#{child.inspect} not a child of #{self.inspect}"
	end
	# truncating to zero releases all of the child's data blocks
	child.open { |io| io.truncate 0 }
end
|
902
|
+
|
903
|
+
# Copies the contents of +src+ into +dst+, recursively for directories.
# Both must be the same kind of dirent (file vs dir); copying onto root
# is expected to fail. Recursion depth follows the tree depth, which
# may matter for pathologically deep documents.
def self.copy src, dst
	# BUG FIX: the old guard only rejected file -> non-file copies; a
	# dir -> file copy slipped through and misbehaved below. Check the
	# file/dir kind symmetrically (so :root -> :dir still works).
	raise 'differing types' if src.file? != dst.file?
	dst.name = src.name
	if src.dir?
		src.children.each do |src_child|
			dst.new_child(src_child.type) { |dst_child| Dirent.copy src_child, dst_child }
		end
	else
		src.open do |src_io|
			dst.open { |dst_io| IO.copy src_io, dst_io }
		end
	end
end
|
918
|
+
end
|
919
|
+
end
|
920
|
+
end
|
921
|
+
|
922
|
+
# Simple command line usage: dump the directory tree of the ole file
# named by the first argument.
if $0 == __FILE__
	puts Ole::Storage.open(ARGV[0]) { |ole| ole.root.to_tree }
end
|
925
|
+
|