simple_cfb 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1256 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'ostruct'
4
+ require 'date'
5
+ require 'stringio'
6
+ require 'active_support/core_ext/object/blank.rb'
7
+ require 'active_support/core_ext/object/try.rb'
8
+
9
+ # Ported from https://github.com/SheetJS/js-cfb.
10
+ #
11
+ # File data is added with #add then, when finished, the entire blob of CFB
12
+ # data is generated in one go with #write. Progressive creation is impossible
13
+ # as the CFB file requires information on file sizes and directory entries at
14
+ # the start of output, so all of that must be known beforehand.
15
+ #
16
+ # Files can be parsed into a new object with #parse!, then #file_index and
17
+ # #full_paths examined to extract the parsed CFB container components.
18
+ #
19
+ # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-cfb/
20
+ #
21
+ # This Ruby port tries to be equivalent to the JavaScript original, but in so
22
+ # doing there are likely additional bugs and I've omitted anything that wasn't
23
+ # needed for encrypted OOXML writing and reading.
24
+ #
25
+ class SimpleCfb
26
+
27
+ # CFB miscellaneous
28
+ #
29
MSSZ  = 64   # Mini Sector Size = 1<<6
MSCSZ = 4096 # Mini Stream Cutoff Size

# Convenience accessor to binary-encoded NUL byte.
#
NUL = "\x00".b

# 2.1 Compound File Sector Numbers and Types
#
FREESECT   = -1
ENDOFCHAIN = -2
FATSECT    = -3
DIFSECT    = -4
MAXREGSECT = -6

# Compound File Header
#
HEADER_SIGNATURE     = "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1".b
HEADER_CLSID         = ("\x00" * 16).b
HEADER_MINOR_VERSION = "\x3e\x00".b
MAXREGSID            = -6
NOSTREAM             = -1
STREAM               = 2

# 2.6.1 Compound File Directory Entry
#
ENTRY_TYPES = ['unknown', 'storage', 'stream', 'lockbytes', 'property', 'root']

# Initial seed filename
#
SEED_FILENAME = "\u0001Sh33tJ5"
60
+
61
+ # Used internally for parser.
62
+ #
63
class SectorList < Array
  # Extra attributes carried alongside the array contents.
  attr_accessor :fat_addrs, :ssz
end
67
+
68
+ # =========================================================================
69
+ # PUBLIC CLASS INTERFACE
70
+ # =========================================================================
71
+
72
+ # Returns +true+ if the executing computer is little-endian natively,
73
+ # else +false+.
74
+ #
75
def self.host_is_little_endian?
  # A natively packed integer matches its explicit little-endian packing
  # only on a little-endian host.
  [42].pack('l') == [42].pack('l<')
end
78
+
79
+ # Treat an input ASCII-8BIT encoded string as 4 bytes and from this parse and
80
+ # return an unsigned 32-bit little-endian integer.
81
+ #
82
+ # +input+:: ASCII-8BIT encoded string including 4 byte sequence
83
+ # +index+:: Index into +input+ to start reading bytes (default 0)
84
+ #
85
def self.get_uint32le(input, index = 0)
  # 'V' is always little-endian, whatever the host byte order; yields nil
  # when fewer than 4 bytes are available at +index+.
  input.slice(index, 4).unpack1('V')
end
91
+
92
+ # Treat an input ASCII-8BIT encoded string as 4 bytes and from this parse and
93
+ # return a signed 32-bit little-endian integer.
94
+ #
95
+ # +input+:: ASCII-8BIT encoded string including 4 byte sequence
96
+ # +index+:: Index into +input+ to start reading bytes (default 0)
97
+ #
98
def self.get_int32le(input, index = 0)
  # 'l<' is a signed 32-bit little-endian read, whatever the host byte
  # order; yields nil when fewer than 4 bytes are available at +index+.
  input.slice(index, 4).unpack1('l<')
end
104
+
105
+ # Parse a ctime/mtime 8-byte sequence (two 32-bit little-endian words, low word first) into a
106
+ # returned Ruby Time object, or +nil+ if the values are all zero.
107
+ #
108
+ # +data+:: ASCII-8BIT encoded string, 8 bytes long.
109
+ #
110
def self.get_time(data)
  # The 8 bytes hold a 64-bit count, low 32-bit word first. The count is
  # divided by 1e7 to get seconds and offset by 11644473600 seconds to reach
  # the Unix epoch. Returns a UTC Time, or nil when both words are zero.
  low, high = data.unpack('VV')

  return nil if high.zero? && low.zero?

  # Integer/Rational maths keeps every tick exact; the 64-bit count does not
  # fit in a Float mantissa, so float division loses the low digits.
  ticks = (high << 32) | low

  Time.at(Rational(ticks, 10_000_000) - 11_644_473_600).utc
end
121
+
122
+ # =========================================================================
123
+ # PUBLIC INSTANCE INTERFACE
124
+ # =========================================================================
125
+
126
attr_accessor :full_paths
attr_accessor :file_index

# Start with the default container contents set up by #reinit.
def initialize
  reinit
end
131
+
132
+ # Add a file entry. Supports root filenames only. File must not be
133
+ # added already.
134
+ #
135
+ # +name+:: Filename, e.g. "Foo", in your preferred string encoding
136
+ # +content+:: Mandatory ASCII-8BIT encoded string containing file data
137
+ #
138
def add(name, content)
  # Registers +content+ under +name+ beneath the root entry, rebuilds the
  # directory structure and returns the new entry object.
  reinit

  root = full_paths.first
  path =
    if name[0, root.size] == root
      name # Caller already supplied a root-prefixed path
    else
      base = root.end_with?('/') ? root : root + '/'
      (base + name).gsub('//', '/')
    end

  entry = OpenStruct.new(name: filename(name), type: 2, content: content, size: content.bytesize)

  file_index << entry
  full_paths << path

  rebuild(force_gc: true)

  entry
end
159
+
160
+ # Compile and return the CFB file data.
161
+ #
162
def write
  # Builds the whole container and returns it as an ASCII-8BIT String.
  #
  # Output order: header (including the first 109 DIFAT slots), any extra
  # DIFAT sectors, FAT, MiniFAT, directory entries, contents of files of
  # 4096 bytes or more, then contents of smaller files (the mini stream).

  # Commented out for now, because we prefer parity with the JS code for
  # test verification purposes. The overhead seems minimal.
  #
  #   # Get rid of the seed file if it's still present and we seem to have
  #   # more file entries than the root directory and seed entry.
  #   #
  #   seed_leaf  = "/#{SEED_FILENAME}"
  #   seed_index = self.full_paths.find_index do | path |
  #     path.end_with?(seed_leaf)
  #   end
  #
  #   unless seed_index.nil? || self.file_index.size < 3
  #     self.file_index.delete_at(seed_index)
  #     self.full_paths.delete_at(seed_index)
  #   end
  #
  #   self.rebuild(force_gc: true)
  rebuild(force_gc: false)

  # Count the 64-byte mini sectors and 512-byte sectors the contents need.
  mini_size = 0
  fat_size  = 0

  file_index.each do |entry|
    flen = entry&.content&.bytesize
    next if flen.nil? || flen.zero?

    if flen < 0x1000
      mini_size += (flen + 0x3F) >> 6
    else
      fat_size += (flen + 0x01FF) >> 9
    end
  end

  difat_sectors_for = ->(fats) { fats <= 109 ? 0 : ((fats - 109).to_f / 0x7F).ceil }

  dir_cnt   = (full_paths.size + 3) >> 2
  mini_cnt  = (mini_size + 7) >> 3
  mfat_cnt  = (mini_size + 0x7F) >> 7
  fat_base  = mini_cnt + fat_size + dir_cnt + mfat_cnt
  fat_cnt   = (fat_base + 0x7F) >> 7
  difat_cnt = difat_sectors_for.call(fat_cnt)

  # The FAT must also describe the FAT and DIFAT sectors themselves, so grow
  # until the count is self-consistent.
  while ((fat_base + fat_cnt + difat_cnt + 0x7F) >> 7) > fat_cnt
    fat_cnt  += 1
    difat_cnt = difat_sectors_for.call(fat_cnt)
  end

  # Sector counts: header, DIFAT, FAT, MiniFAT, directory, large-file data,
  # mini sectors, and (filled in just below) the grand total.
  el = [1, difat_cnt, fat_cnt, mfat_cnt, dir_cnt, fat_size, mini_size, 0]

  file_index[0].size  = mini_size << 6
  file_index[0].start = el[0] + el[1] + el[2] + el[3] + el[4] + el[5]

  el[7] = el[0] + el[1] + el[2] + el[3] + el[4] + el[5] + ((el[6] + 7) >> 3)

  o = String.new(encoding: 'ASCII-8BIT')

  # --- Header -------------------------------------------------------------

  o << HEADER_SIGNATURE
  o << NUL * 16
  o << write_shift(2, 0x003E)
  o << write_shift(2, 0x0003)
  o << write_shift(2, 0xFFFE)
  o << write_shift(2, 0x0009)
  o << write_shift(2, 0x0006)
  o << NUL * 6

  o << write_shift( 4, 0)
  o << write_shift( 4, el[2])
  o << write_shift( 4, el[0] + el[1] + el[2] + el[3] - 1)
  o << write_shift( 4, 0)
  o << write_shift( 4, 1 << 12)
  o << write_shift( 4, el[3].zero? ? ENDOFCHAIN : el[0] + el[1] + el[2] - 1)
  o << write_shift( 4, el[3])
  o << write_shift(-4, el[1].zero? ? ENDOFCHAIN : el[0] - 1)
  o << write_shift( 4, el[1])

  # --- DIFAT: 109 slots in the header, then 127 per DIFAT sector ----------

  difat_slot = ->(n) { n < el[2] ? el[1] + n : -1 }
  slot       = 0

  109.times do
    o << write_shift(-4, difat_slot.call(slot))
    slot += 1
  end

  el[1].times do |difat_index|
    while slot < 236 + difat_index * 127
      o << write_shift(-4, difat_slot.call(slot))
      slot += 1
    end

    # Last slot of each DIFAT sector links to the next one
    o << write_shift(-4, difat_index == el[1] - 1 ? ENDOFCHAIN : difat_index + 1)
  end

  # --- FAT ----------------------------------------------------------------

  # 'i' is the next sector index to describe and 't' the end of the chain
  # being written; both are shared with (and advanced by) 'chainit'.
  i = 0
  t = el[1]

  # Emit a chain of 'w' consecutive sectors, terminated with ENDOFCHAIN.
  chainit = lambda do |w|
    t += w

    while i < t - 1
      o << write_shift(-4, i + 1)
      i += 1
    end

    unless w.zero?
      i += 1
      o << write_shift(-4, ENDOFCHAIN)
    end
  end

  while i < t
    o << write_shift(-4, DIFSECT)
    i += 1
  end

  t += el[2]

  while i < t
    o << write_shift(-4, FATSECT)
    i += 1
  end

  chainit.call(el[3])
  chainit.call(el[4])

  file_index.each do |file|
    next if file.content.nil?

    flen = file.content.bytesize
    next if flen < 0x1000

    file.start = t
    chainit.call((flen + 0x01FF) >> 9)
  end

  chainit.call((el[6] + 7) >> 3)

  o << write_shift(-4, ENDOFCHAIN) until (o.size & 0x1FF).zero?

  # --- MiniFAT ------------------------------------------------------------

  t = i = 0

  file_index.each do |file|
    next if file.content.nil?

    flen = file.content.bytesize
    next if flen.zero? || flen >= 0x1000

    file.start = t
    chainit.call((flen + 0x3F) >> 6)
  end

  o << write_shift(-4, ENDOFCHAIN) until (o.size & 0x1FF).zero?

  # --- Directory entries, 128 bytes each, 4 per sector --------------------

  (el[4] << 2).times do |index|
    nm = full_paths[index]

    if nm.blank?
      # Unused slot: zeroed, with the three sibling/child IDs set to -1
      17.times { o << write_shift(4, 0) }
      3.times  { o << write_shift(4, -1) }
      12.times { o << write_shift(4, 0) }
      next
    end

    file = file_index[index]

    if index.zero?
      file.start = file.size.nil? || file.size.zero? ? ENDOFCHAIN : file.start - 1
    end

    # The name field is 64 bytes and the recorded length includes the NUL
    # terminator, so at most 31 characters fit ([MS-CFB] 2.6.1).
    u_nm = file.name
    u_nm = u_nm[0...31] if u_nm.size > 31

    flen = 2 * (u_nm.size + 1)

    o << write_shift(64, u_nm, 'utf16le')
    o << write_shift(2, flen)
    o << write_shift(1, file.type)
    o << write_shift(1, file.color)
    o << write_shift(-4, file.L)
    o << write_shift(-4, file.R)
    o << write_shift(-4, file.C)

    if file.clsid.blank?
      4.times { o << write_shift(4, 0) }
    else
      o << file.clsid
    end

    o << write_shift(4, file.state || 0)
    4.times { o << write_shift(4, 0) }
    o << write_shift(4, file.start)
    o << write_shift(4, file.size)
    o << write_shift(4, 0)
  end

  # --- File contents ------------------------------------------------------

  # Append NULs until the output length is a multiple of 'boundary'.
  pad_to = lambda do |boundary|
    remainder = o.size % boundary
    o << NUL * (boundary - remainder) unless remainder.zero?
  end

  contents = file_index.drop(1)

  contents.each do |file|
    next unless file.size && file.size >= 0x1000

    aligned_size = (file.start + 1) << 9
    o << NUL * (aligned_size - o.size) if o.size < aligned_size

    o << file.content
    pad_to.call(512)
  end

  contents.each do |file|
    next unless file.size && file.size > 0 && file.size < 0x1000

    o << file.content
    pad_to.call(64)
  end

  total_size = el[7] << 9
  o << NUL * (total_size - o.size) if o.size < total_size

  o
end # "def write"
420
+
421
+ # Parses an input file into this object, allowing you to extract individual
422
+ # files thereafter via #file_index and #full_paths.
423
+ #
424
+ # +file+:: Source I/O stream. Data is read from the current file pointer,
425
+ # which will therefore have advanced when the method returns.
426
+ #
427
def parse!(file)
  # Reads a complete CFB container from +file+ and replaces #full_paths and
  # #file_index with its contents. Raises RuntimeError for inputs that are
  # too short, are ZIP files, or have unsupported header values. The stream
  # is closed before returning, whether or not parsing succeeded.
  raise "CFB corrupt - file size < 512 bytes" if file.size < 512

  # [MS-CFB] 2.2 Compound File Header - the major version picks sector size
  #
  major, minor = check_get_mver(file)

  ssz =
    if major == 3
      512
    elsif major == 4
      4096
    elsif major == 0 && minor == 0
      raise 'Zip contents are not supported'
    else
      raise "Major version: Only 3 or 4 is supported; #{major} encountered"
    end

  check_shifts(file, major)

  # Number of Directory Sectors
  #
  dir_cnt = read_shift(file, 4, 'i')
  raise "Directory sectors: Expected 0, saw #{dir_cnt}" if major == 3 && dir_cnt != 0

  file.seek(file.pos + 4)                  # Number of FAT Sectors (skipped)
  dir_start = read_shift(file, 4, 'i')     # First Directory Sector Location
  file.seek(file.pos + 4)                  # Transaction Signature (skipped)

  check_field(file, "\x00\x10\x00\x00", 'Mini stream cutoff size')

  minifat_start = read_shift(file, 4, 'i') # First Mini FAT Sector Location
  nmfs          = read_shift(file, 4, 'i') # Number of Mini FAT Sectors
  difat_start   = read_shift(file, 4, 'i') # First DIFAT sector location
  difat_sec_cnt = read_shift(file, 4, 'i') # Number of DIFAT Sectors

  # Grab FAT Sector Locations; the header holds up to 109 of them and the
  # list stops at the first negative entry.
  #
  fat_addrs = []

  109.times do
    q = read_shift(file, 4, 'i')
    break if q < 0

    fat_addrs << q
  end

  # Break the file up into sectors, skipping the file header of 'ssz' size.
  #
  sectors = []
  file.seek(ssz)
  sectors << file.read(ssz) until file.eof?

  sleuth_fat(difat_start, difat_sec_cnt, sectors, ssz, fat_addrs)

  # Chains
  #
  sector_list = make_sector_list(sectors, dir_start, fat_addrs, ssz)
  sector_list[dir_start].name = '!Directory'

  if nmfs > 0 && minifat_start != ENDOFCHAIN
    sector_list[minifat_start].name = '!MiniFAT'
  end

  sector_list[fat_addrs[0]].name = '!FAT'
  sector_list.fat_addrs = fat_addrs
  sector_list.ssz       = ssz

  # [MS-CFB] 2.6.1 Compound File Directory Entry
  #
  files = {}
  paths = []

  self.full_paths = []
  self.file_index = []

  read_directory(dir_start, sector_list, sectors, paths, nmfs, files, minifat_start)
  build_full_paths(paths)
ensure
  file.close unless file.nil?
end # "def parse!"
550
+
551
+ # =========================================================================
552
+ # PRIVATE INSTANCE METHODS
553
+ # =========================================================================
554
+ #
555
+ private
556
+
557
+ # Initialise or reinitialise the internal file data. After being called
558
+ # for the first time, calling here is really only useful to make sure
559
+ # that internal file path and index arrays look consistent.
560
+ #
561
def reinit
  # Ensures both arrays exist and agree in length; when they are empty,
  # seeds them with the root entry and the starting seed file.
  self.full_paths ||= []
  self.file_index ||= []

  raise 'Inconsistent CFB structure' unless full_paths.size == file_index.size
  return unless full_paths.empty?

  root_name = 'Root Entry'
  seed_data = [55, 50, 54, 50].pack('C*')

  full_paths << root_name + '/'
  file_index << OpenStruct.new(name: root_name, type: 5)

  # Add starting seed file
  #
  full_paths << full_paths.first + SEED_FILENAME
  file_index << OpenStruct.new(name: SEED_FILENAME, type: 2, content: seed_data, R: 69, L: 69, C: 69)
end
584
+
585
+ # Strange function that's very much not the same as "File.dirname".
586
+ #
587
def dirname(p)
  # Returns the parent portion of +p+ including its trailing slash. A path
  # with no parent (no slash other than a trailing one) is returned as-is.
  unless p.end_with?('/')
    cut = p.rindex('/')
    return cut ? p[0..cut] : p
  end

  trimmed = p.chomp('/')
  trimmed.include?('/') ? dirname(trimmed) : p
end
597
+
598
+ # Strange function that's very much not the same as "File.basename".
599
+ #
600
def filename(p)
  # Returns the last path component of +p+, ignoring trailing slashes.
  leaf = p
  leaf = leaf.chomp('/') while leaf.end_with?('/')

  cut = leaf.rindex('/')
  cut ? leaf[(cut + 1)..] : leaf
end
609
+
610
+ # Compare file-path-name with some FAT concepts thrown in (L vs R); related
611
+ # to CFB section 2.6.4 (red-black trees).
612
+ #
613
def namecmp(l, r)
  # Orders two slash-separated paths component by component: a shorter
  # component sorts first, equal-length components sort by string
  # comparison, and if all shared components match the path with fewer
  # components sorts first. Returns a negative, zero or positive Integer.
  left_parts  = l.split('/')
  right_parts = r.split('/')
  shared      = [left_parts.size, right_parts.size].min

  shared.times do |index|
    left  = left_parts[index]
    right = right_parts[index]

    by_length = left.size - right.size
    return by_length unless by_length.zero?

    # Compare against the matching component of 'r', not a character of it
    return left < right ? -1 : 1 if left != right
  end

  left_parts.size - right_parts.size
end
630
+
631
+ # CFB internal knowledge would be required to understand this; seems to be
632
+ # recalculating data structures that then theoretically would make life
633
+ # easier during the file output stage.
634
+ #
635
def rebuild(force_gc: false)
  # Recomputes the directory structure: adds any missing parent storages,
  # sorts entries with #namecmp and resets every entry's name, colour,
  # L/R/C links, size, start and type. Does nothing unless +force_gc+ is
  # true or the existing entries look inconsistent.
  reinit

  seen_live = false
  gc        = force_gc

  unless gc == true
    (full_paths.size - 1).downto(0) do |i|
      file = file_index[i]

      case file.type
      when 0
        if seen_live
          gc = true # Unknown-type entry in the middle of the list
        else
          # Trailing unknown-type entries are simply dropped
          file_index.pop
          full_paths.pop
        end

      when 1, 2, 5
        seen_live = true
        gc ||= [file.R, file.L, file.C].any?(&:nil?)
        gc ||= file.R > -1 && file.L > -1 && file.R == file.L

      else
        gc = true
      end
    end
  end

  return unless gc == true

  now = Date.parse('1987-01-19')

  # Track which names exist
  #
  known_paths = {}
  data        = []

  full_paths.each_with_index do |path, i|
    known_paths[path] = true
    next if file_index[i].type == 0

    data << [path, file_index[i]]
  end

  # Make sure every entry's parent storages exist. Only the entries present
  # at this point are visited; storages appended below already have their
  # own parents in place.
  #
  0.upto(data.size - 1) do |i|
    dad = dirname(data[i][0])

    until known_paths[dad]
      # Climb to the topmost missing ancestor so parents are added first.
      # A path that is its own parent ends the climb.
      parent = dirname(dad)

      while parent != dad && !known_paths[parent]
        dad    = parent
        parent = dirname(dad)
      end

      data << [
        dad,
        OpenStruct.new(
          name:    filename(dad).gsub('/', ''),
          type:    1,
          clsid:   HEADER_CLSID,
          ct:      now,
          mt:      now,
          content: nil
        )
      ]

      # Add name to set
      #
      known_paths[dad] = true

      dad = dirname(data[i][0])
    end
  end

  data.sort! { |x, y| namecmp(x[0], y[0]) }

  self.full_paths = data.map { |pair| pair[0] }
  self.file_index = data.map { |pair| pair[1] }

  data.size.times do |i|
    nm  = full_paths[i]
    elt = file_index[i]

    elt.name  = filename(nm).gsub('/', '')
    elt.color = 1
    elt.L     = -1
    elt.R     = -1
    elt.C     = -1
    elt.size  = elt.content.nil? ? 0 : elt.content.bytesize
    elt.start = 0
    elt.clsid = elt.clsid || HEADER_CLSID

    if i == 0
      # Root entry
      elt.C    = data.size > 1 ? 1 : -1
      elt.size = 0
      elt.type = 5

    elsif nm.end_with?('/')
      # Storage: link to its first child and to its next sibling, if any
      later   = (i + 1)...data.size
      child   = later.find { |j| dirname(full_paths[j]) == nm }
      sibling = later.find { |j| dirname(full_paths[j]) == dirname(nm) }

      elt.C    = child || -1
      elt.R    = sibling || -1
      elt.type = 1

    else
      # Stream: link to the next entry when it shares the same parent
      elt.R    = i + 1 if dirname(full_paths[i + 1] || '') == dirname(nm)
      elt.type = 2

    end
  end
end
763
+
764
+ # Returns a chunk of data representing a converted write of the input in
765
+ # the +value+ parameter.
766
+ #
767
+ # The JS code from which this was ported has a very, VERY strange method
768
+ # signature...
769
+ #
770
+ # +size+:: Either a number of bytes to write or a format specifier (see
771
+ # below).
772
+ #
773
+ # +value+:: A value to write; its type is interpreted through both the
774
+ # +size+ and +format+ parameters.
775
+ #
776
+ # +format+:: Either 'hex' or 'utf16le' in which case the value is treated
777
+ # as a hex string (e.g. "deadbeef", high nibble first) or
778
+ # character data in arbitrary Ruby string encoding; written to
779
+ # the output as parsed bytes from the hex data, or little
780
+ # endian UTF-16 byte pairs, respectively. If the input value
781
+ # is longer than +size+ *IN BYTES* then it is truncated, else
782
+ # if need be, padded with zeros - again *IN BYTES*, so the
783
+ # maximum length in characters of a "utf16le" string is half
784
+ # the amount in +size+.
785
+ #
786
+ # If +format+ is something else or omitted, "size" becomes an
787
+ # indication of format (!). The value is treated as an 8-bit
788
+ # byte (+size+ is 1) and masked as such, 16-bit unsigned
789
+ # little-endian value (2), or uint32 (4) - or a signed int32
790
+ # (+size+ is -4 - yes, that's minus 4) - written out as four
791
+ # bytes, little-endian.
792
+ #
793
def write_shift(size, value, format = nil)
  # Returns +value+ converted to an ASCII-8BIT String. With +format+ 'hex'
  # or 'utf16le' the result is truncated or NUL-padded to +size+ bytes;
  # otherwise +size+ selects the integer layout (1, 2, 4, or -4 for signed
  # 32-bit). Returns nil for any other +size+.
  case format
  when 'hex'
    [value].pack('H*').ljust(size, NUL)[0...size]

  when 'utf16le'
    # Size the field after encoding: characters outside the BMP take four
    # bytes each, so counting characters beforehand can overflow the field.
    width = (size / 2) * 2
    bytes = value.encode('UTF-16LE').force_encoding('ASCII-8BIT')
    bytes = bytes.byteslice(0, width).ljust(width, NUL)

    if width >= 2
      # Truncation may have split a surrogate pair; blank a dangling high
      # surrogate so the field stays valid UTF-16.
      last_unit = bytes.byteslice(width - 2, 2).unpack1('v')
      bytes[width - 2, 2] = NUL * 2 if last_unit.between?(0xD800, 0xDBFF)
    end

    bytes

  else
    directive =
      case size
      when 1  then 'C'  # Unsigned 8-bit, bitwise truncated
      when 2  then 'v'  # Unsigned 16-bit little-endian, bitwise truncated
      when 4  then 'V'  # Unsigned 32-bit little-endian, bitwise truncated
      when -4 then 'l<' # Signed 32-bit little-endian
      end

    directive && [value].pack(directive)
  end
end
826
+
827
+ # A method that's a companion to #write_shift and equally strange!
828
+ #
829
+ # Read from file for 'size' bytes if size is 1, 2 or 4, parsing the bytes
830
+ # as an 8-bit unsigned, 16-bit unsigned or 32-bit integer where the value
831
+ # of 't' indicates if the 32-bit integer is signed ('t' is string 'i') or
832
+ # unsigned ('t' is anything else); or if size is 16, just return a string
833
+ # of 16 bytes read as-is.
834
+ #
835
+ # This implementation is slightly cleaner and more appropriate than the
836
+ # one in the original source, by omitting unused conversions.
837
+ #
838
+ # +file+:: Source I/O stream. Data is read from the current file pointer,
839
+ # which will therefore have advanced when the method returns.
840
+ #
841
+ # +size+:: 1, 2, 4 to read 1, 2 or 4 bytes returned as a parsed 8, 16 or
842
+ # 32-bit little-endian integer respectively, or pass 16 to read
843
+ # 16 bytes of raw data returned as an ASCII-8BIT encoded string.
844
+ #
845
+ # +type+:: If +size+ is 4, pass 'i' to read as a signed 32-bit integer,
846
+ # else (omitted, or not 'i') value is read as unsigned.
847
+ #
848
def read_shift(file, size, t = nil)
  # Consumes +size+ bytes from +file+ and decodes them: 1, 2 and 4 give an
  # Integer (4 is signed when +t+ is 'i'), 16 gives the raw bytes. Any other
  # +size+ reads nothing and returns nil.
  case size
  when 1 # Unsigned 8-bit
    file.read(1).getbyte(0)

  when 2 # Unsigned 16-bit little-endian
    file.read(2).unpack1('v')

  when 4 # 32-bit little-endian signed or unsigned
    raw = file.read(4)
    t == 'i' ? self.class.get_int32le(raw) : self.class.get_uint32le(raw)

  when 16
    file.read(16)
  end
end
869
+
870
+ # Read from the file, expecting to see a particular value; if not, throw
871
+ # an exception.
872
+ #
873
+ # +file+:: Source I/O stream. Data is read from the current file
874
+ # pointer, which will therefore have advanced when the
875
+ # method returns.
876
+ #
877
+ # +expected+:: The expected value, as a String that'll be forced to
878
+ # ASCII-8BIT encoding, if not that way already.
879
+ #
880
+ # +field_name+:: The field name to include in the raised exception, just
881
+ # for human diagnostic purposes.
882
+ #
883
def check_field(file, expected, field_name)
  # Reads as many bytes as +expected+ holds and raises RuntimeError, naming
  # +field_name+, unless they match byte-for-byte. Returns nil on a match.
  wanted = expected.b
  actual = file.read(wanted.bytesize)

  return if actual == wanted

  raise "#{field_name}: Expected #{wanted.inspect}, but got #{actual.inspect}"
end
891
+
892
+ # Return a tuple array of major, minor file version, with 0, 0 for ZIP
893
+ # files, else read from the CFB file, checking header in passing. File
894
+ # pointer is assumed to be at zero on entry.
895
+ #
896
+ # +file+:: Source I/O stream. Data is read from the current file pointer,
897
+ # which will therefore have advanced when the method returns.
898
+ #
899
def check_get_mver(file)
  # Returns [major, minor]. Input starting with the two bytes "PK" yields
  # [0, 0]; anything else must begin with the CFB header signature, else
  # RuntimeError is raised by #check_field.
  #
  # #read returns a String, so compare against a String rather than the
  # Integer byte values 0x50 and 0x4b.
  return [0, 0] if file.read(2) == 'PK'

  file.rewind
  check_field(file, HEADER_SIGNATURE, 'Header signature')

  file.seek(file.pos + 16) # Skip all-NUL CLSID, 16 bytes

  # Minor version is stored first, then major
  minor = read_shift(file, 2)
  major = read_shift(file, 2)

  [major, minor]
end
913
+
914
+ # Check sector shifts in the file header.
915
+ #
916
+ # +file+:: Source I/O stream. Data is read from the current file pointer,
917
+ # which will therefore have advanced when the method returns.
918
+ #
919
+ # +major+:: Major version number - must be 3 or 4.
920
+ #
921
def check_shifts(file, major)
  # Validates the sector shift, mini sector shift and reserved header
  # fields, raising RuntimeError on any unexpected value.

  # Skip byte order marker (always indicates little-endian)
  #
  file.seek(file.pos + 2)

  shift = read_shift(file, 2)

  # Each supported sector shift is only valid with one major version
  required_major = { 0x09 => 3, 0x0c => 4 }[shift]

  raise "Sector shift: Unsupported value #{shift}" if required_major.nil?

  if major != required_major
    raise "Sector shift: #{shift} requires major version #{required_major}, saw major version #{major}"
  end

  # Mini Sector Shift
  #
  check_field(file, "\x06\x00", 'Mini sector shift')

  # Reserved
  #
  check_field(file, "\x00\x00\x00\x00\x00\x00", 'Reserved')
end
946
+
947
+ # Chase down the rest of the DIFAT chain to build a comprehensive list of
948
+ # FAT sector addresses. DIFAT sectors chain by storing the next sector number as the last 32 bits.
949
+ #
950
+ # +idx+:: Sector index; usually, start DIFAT sector initially
951
+ # +cnt+:: DIFAT sector count expected
952
+ # +sectors+:: Array of sectors
953
+ # +ssz+:: Size of a sector
954
+ # +fat_addrs+:: Array MODIFIED IN PLACE with sector addresses added
955
+ #
956
def sleuth_fat(idx, cnt, sectors, ssz, fat_addrs)
  # Appends the entries of DIFAT sector +idx+ to +fat_addrs+, then follows
  # the link in the sector's last 4 bytes while +cnt+ allows.
  if idx == ENDOFCHAIN
    raise 'DIFAT chain shorter than expected' if cnt != 0

    return
  end

  return if idx == FREESECT

  sector = sectors[idx]
  return if sector.nil?

  # Every 32-bit slot except the last holds an address
  ((ssz >> 2) - 1).times do |slot|
    q = self.class.get_int32le(sector, slot * 4)
    break if q == ENDOFCHAIN

    fat_addrs << q
  end

  return unless cnt >= 1

  next_idx = self.class.get_int32le(sector, ssz - 4)
  sleuth_fat(next_idx, cnt - 1, sectors, ssz, fat_addrs)
end
987
+
988
+ # Follow the linked list of sectors for a given starting point.
989
+ #
990
+ # Parameters need to be guessed from caller use cases.
991
+ #
992
def get_sector_list(sectors, start, fat_addrs, ssz, chkd = nil)
  # Follows the FAT chain beginning at sector +start+ and returns an
  # OpenStruct with +nodes+ (sector indices visited, in order) and +data+
  # (those sectors' contents joined together).
  #
  # +chkd+ is optional; when given, visited indices are set to true in it.
  chkd  ||= []
  nodes   = []
  chain   = []
  visited = {} # Stops a corrupt, cyclic chain from looping forever
  modulus = ssz - 1
  j       = start

  while j >= 0 && !visited[j]
    visited[j] = true
    chkd[j]    = true

    nodes << j
    chain << sectors[j]

    # Locate the FAT sector, and the offset within it, holding j's link
    addr = fat_addrs[(j * 4) / ssz]
    jj   = (j * 4) & modulus

    raise "FAT boundary crossed: #{j} 4 #{ssz}" if ssz < 4 + jj

    # A missing or negative address means no FAT sector to consult; without
    # the check a nil index raises and a negative one wraps around.
    break if addr.nil? || addr.negative? || sectors[addr].nil?

    j = self.class.get_int32le(sectors[addr], jj)
    break if j.nil? # FAT sector too short to hold the link
  end

  OpenStruct.new(nodes: nodes, data: chain.join)
end
1016
+
1017
+ # Chase down the sector linked lists.
1018
+ #
1019
+ # Parameters need to be guessed from caller use cases.
1020
+ #
1021
def make_sector_list(sectors, dir_start, fat_addrs, ssz)
  # Walks every FAT chain in +sectors+, beginning with the one at
  # +dir_start+, and returns a SectorList in which each chain's first sector
  # index maps to an OpenStruct of +nodes+ (indices) and +data+ (contents).
  sl          = sectors.length
  sector_list = SectorList.new
  chkd        = []
  modulus     = ssz - 1

  sl.times do |i|
    # Visit sectors starting at dir_start, wrapping around to the beginning
    k  = i + dir_start
    k -= sl if k >= sl

    next if chkd[k]

    buf       = []
    buf_chain = []
    seen      = {} # Guards against cycles within this chain
    j         = k

    while j >= 0
      seen[j] = true
      chkd[j] = true

      buf << j
      buf_chain << sectors[j]

      addr = fat_addrs[(j * 4) / ssz]
      jj   = (j * 4) & modulus

      raise "FAT boundary crossed: #{j} 4 #{ssz}" if ssz < 4 + jj

      # A missing or negative address means no FAT sector to consult;
      # without the check a nil index raises and a negative one wraps.
      break if addr.nil? || addr.negative? || sectors[addr].nil?

      j = self.class.get_int32le(sectors[addr], jj)
      break if j.nil? || seen[j]
    end

    sector_list[k] = OpenStruct.new(nodes: buf, data: buf_chain.join)
  end

  sector_list
end
1067
+
1068
+ # [MS-CFB] 2.6.1 Compound File Directory Entry.
1069
+ #
1070
+ # Parameters need to be guessed from caller use cases.
1071
+ #
1072
def read_directory(dir_start, sector_list, sectors, paths, nmfs, files, mini)
  # Decodes each 128-byte directory entry in the chain at +dir_start+. For
  # every entry the name is appended to +paths+, the entry object is stored
  # in +files+ under that name and appended to #file_index.
  minifat_store  = 0
  pl             = paths.any? ? 2 : 0
  sector         = sector_list[dir_start].data
  nul_terminator = String.new("\x00\x00", encoding: 'UTF-16LE')

  0.step(sector.size - 1, 128) do |offset|
    blob = StringIO.new(sector.slice(offset, 128))

    blob.seek(64)
    namelen = read_shift(blob, 2)

    # Clamp so an empty entry never asks #read for a negative length
    blob.seek(0)
    name = blob.read([namelen - pl, 0].max).force_encoding('UTF-16LE')
    name.chomp!(nul_terminator)
    name.encode!('UTF-8')

    paths << name

    blob.seek(66)
    o = OpenStruct.new(
      name:  name,
      type:  read_shift(blob, 1),
      color: read_shift(blob, 1),
      L:     read_shift(blob, 4, 'i'),
      R:     read_shift(blob, 4, 'i'),
      C:     read_shift(blob, 4, 'i'),
      clsid: read_shift(blob, 16),
      state: read_shift(blob, 4, 'i'),
      start: 0,
      size:  0
    )

    o.ct    = self.class.get_time(blob.read(8))
    o.mt    = self.class.get_time(blob.read(8))
    o.start = read_shift(blob, 4, 'i')
    o.size  = read_shift(blob, 4, 'i')

    if o.size < 0 && o.start < 0
      o.size  = o.type = 0
      o.start = ENDOFCHAIN
      o.name  = ''
    end

    if o.type === 5 # Root
      minifat_store = o.start

      if nmfs > 0 && minifat_store != ENDOFCHAIN
        sector_list[minifat_store].name = '!StreamData'
      end
    elsif o.size >= 4096 # MSCSZ
      o.storage = 'fat'

      if sector_list[o.start].nil?
        # Pass the optional 'chkd' argument explicitly; #get_sector_list
        # declares five parameters.
        sector_list[o.start] = get_sector_list(sectors, o.start, sector_list.fat_addrs, sector_list.ssz, nil)
      end

      sector_list[o.start].name = o.name
      o.content = sector_list[o.start].data.slice(0, o.size)
    else
      o.storage = 'minifat'

      if o.size < 0
        o.size = 0
      elsif minifat_store != ENDOFCHAIN && o.start != ENDOFCHAIN && !sector_list[minifat_store].nil?
        o.content = get_mfat_entry(o, sector_list[minifat_store].data, sector_list[mini]&.data)
      end
    end

    files[name] = o
    file_index << o
  end
end
1148
+
1149
+ # [MS-CFB] 2.6.4 Red-Black Tree.
1150
+ #
1151
+ # +paths+:: Array of incomplete paths (often just leafnames) where indices
1152
+ # in the array correspond to "self.file_index" entries; contents
1153
+ # in "self.full_paths" will be overwritten if present.
1154
+ #
1155
def build_full_paths(paths)
  # Fills #full_paths from the leaf names in +paths+ by working out each
  # entry's parent from the L/R/C links in #file_index. The root and every
  # entry whose type is not STREAM get a trailing '/'.
  j  = 0
  el = ar = 0
  pl = paths.length

  # dad[i] is the parent index of entry i (i itself while still unknown);
  # q is the work queue of entry indices.
  dad = []
  q   = []

  pl.times do |i|
    dad[i] = q[i] = i
    full_paths[i] = paths[i]
  end

  while j < q.length
    i  = q[j]
    el = file_index[i].L
    ar = file_index[i].R
    ce = file_index[i].C

    if dad[i] == i
      dad[i] = dad[el] if el != NOSTREAM && dad[el] != el
      dad[i] = dad[ar] if ar != NOSTREAM && dad[ar] != ar
    end

    dad[ce] = i if ce != NOSTREAM

    # Siblings share this entry's parent; requeue them unless they are
    # still waiting in the queue. An index absent from the queue counts as
    # "before j".
    if el != NOSTREAM && i != dad[i]
      dad[el] = dad[i]
      q << el if (q.rindex(el) || -1) < j
    end

    if ar != NOSTREAM && i != dad[i]
      dad[ar] = dad[i]
      q << ar if (q.rindex(ar) || -1) < j
    end

    j += 1
  end

  # NOTE(review): 'el' and 'ar' here are whatever the last queue item left
  # behind, not entry i's own links - kept as in the original; confirm
  # against the JS source.
  1.upto(pl - 1) do |i|
    next unless dad[i] == i

    if ar != NOSTREAM && dad[ar] != ar
      dad[i] = dad[ar]
    elsif el != NOSTREAM && dad[el] != el
      dad[i] = dad[el]
    end
  end

  1.upto(pl - 1) do |i|
    next if file_index[i].type == 0 # (unknown)

    j = i

    if j != dad[j]
      # Prefix each ancestor's path in turn, walking up towards the root
      loop do
        j = dad[j]
        full_paths[i] = full_paths[j] + '/' + full_paths[i]

        break unless j != 0 && NOSTREAM != dad[j] && j != dad[j]
      end
    end

    dad[i] = -1
  end

  # Build new strings with '+=': entries of 'paths' may be the very objects
  # used as entry names, and appending in place would change those too.
  full_paths[0] += '/'

  1.upto(pl - 1) do |i|
    full_paths[i] += '/' if file_index[i].type != STREAM
  end
end
1231
+
1232
+ # Read entry contents. Undocumented in JS code; looks like:
1233
+ #
1234
+ # +entry+:: The internal file structure being compiled; updated on exit
1235
+ # +payload+:: MiniFAT sector data (file contents within)
1236
+ # +mini+:: MiniFAT indices (of file contents in sector data)
1237
+ #
1238
+ # Returns the extracted data as an ASCII-8BIT encoded string.
1239
+ #
1240
def get_mfat_entry(entry, payload, mini)
  # Collects MSSZ-byte chunks of +payload+, starting at chunk index
  # +entry.start+ and following the 32-bit links in +mini+, until
  # +entry.size+ bytes are gathered or the chain ends. Always returns an
  # ASCII-8BIT String (empty when nothing could be read).
  remaining = entry.size
  idx       = entry.start
  out       = String.new(encoding: 'ASCII-8BIT')

  return out if mini.nil? || mini.empty?

  while remaining > 0 && idx >= 0
    chunk = payload.slice(idx * MSSZ, MSSZ)
    break if chunk.nil? # Chain points past the end of the payload

    out << chunk
    remaining -= MSSZ

    idx = self.class.get_int32le(mini, idx * 4)
    break if idx.nil? # Link table too short to hold the next link
  end

  out.slice(0, entry.size)
end
1255
+
1256
+ end # "class SimpleCfb"