simple_cfb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1256 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'ostruct'
4
+ require 'date'
5
+ require 'stringio'
6
+ require 'active_support/core_ext/object/blank.rb'
7
+ require 'active_support/core_ext/object/try.rb'
8
+
9
+ # Ported from https://github.com/SheetJS/js-cfb.
10
+ #
11
+ # File data is added with #add then, when finished, the entire blob of CFB
12
+ # data is generated in one go with #write. Progressive creation is impossible
13
+ # as the CFB file requires information on file sizes and directory entries at
14
+ # the start of output, so all of that must be known beforehand.
15
+ #
16
+ # Files can be parsed into a new object with #parse!, then #file_index and
17
+ # #full_paths examined to extract the parsed CFB container components.
18
+ #
19
+ # https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-cfb/
20
+ #
21
+ # This Ruby port tries to be equivalent to the JavaScript original, but in so
22
+ # doing there are likely additional bugs and I've omitted anything that wasn't
23
+ # needed for encrypted OOXML writing and reading.
24
+ #
25
+ class SimpleCfb
26
+
27
+ # CFB miscellaneous
28
+ #
29
MSSZ  = 64   # Mini Sector Size = 1<<6
MSCSZ = 4096 # Mini Stream Cutoff Size

# Convenience accessor to a binary-encoded NUL byte. Frozen so that the one
# shared instance cannot be altered by an in-place append somewhere else.
#
NUL = String.new("\x00", encoding: 'ASCII-8BIT').freeze

# 2.1 Compound File Sector Numbers and Types. These are the signed 32-bit
# readings of the special sector markers.
#
FREESECT   = -1
ENDOFCHAIN = -2
FATSECT    = -3
DIFSECT    = -4
MAXREGSECT = -6

# Compound File Header. The byte strings are frozen because HEADER_CLSID in
# particular is handed out as the default CLSID of directory entries; an
# accidental in-place mutation of one entry must not leak into all others.
#
HEADER_SIGNATURE     = String.new("\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", encoding: 'ASCII-8BIT').freeze
HEADER_CLSID         = String.new("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", encoding: 'ASCII-8BIT').freeze
HEADER_MINOR_VERSION = String.new("\x3e\x00", encoding: 'ASCII-8BIT').freeze
MAXREGSID            = -6
NOSTREAM             = -1
STREAM               = 2

# 2.6.1 Compound File Directory Entry; the array index is the numeric entry
# type stored in a directory entry.
#
ENTRY_TYPES = ['unknown', 'storage', 'stream', 'lockbytes', 'property', 'root'].freeze

# Initial seed filename
#
SEED_FILENAME = "\u0001Sh33tJ5".freeze
60
+
61
+ # Used internally for parser.
62
+ #
63
class SectorList < Array
  # An Array of sector chains that additionally remembers the FAT sector
  # addresses and the sector size they were built from, so that further
  # chains can be followed later on.
  attr_accessor :fat_addrs, :ssz
end
67
+
68
+ # =========================================================================
69
+ # PUBLIC CLASS INTERFACE
70
+ # =========================================================================
71
+
72
+ # Returns +true+ if the executing computer is little-endian natively,
73
+ # else +false+.
74
+ #
75
def self.host_is_little_endian?
  # A native-order pack only matches the explicitly little-endian pack of
  # the same value when the host itself is little-endian.
  [42].pack('l') == [42].pack('l<')
end
78
+
79
+ # Treat an input ASCII-8BIT encoded string as 4 bytes and from this parse and
80
+ # return an unsigned 32-bit little-endian integer.
81
+ #
82
+ # +input+:: ASCII-8BIT encoded string including 4 byte sequence
83
+ # +index+:: Index into +input+ to start reading bytes (default 0)
84
+ #
85
def self.get_uint32le(input, index = 0)
  # Decodes the four bytes of +input+ starting at byte offset +index+ as an
  # unsigned 32-bit little-endian integer and returns it.
  #
  # The 'V' directive is little-endian on every host, so no host byte order
  # probing or manual reversal is needed. #byteslice is used so that the
  # offset always counts bytes, whatever encoding +input+ is tagged with.
  input.byteslice(index, 4).unpack1('V')
end
91
+
92
+ # Treat an input ASCII-8BIT encoded string as 4 bytes and from this parse and
93
+ # return a signed 32-bit little-endian integer.
94
+ #
95
+ # +input+:: ASCII-8BIT encoded string including 4 byte sequence
96
+ # +index+:: Index into +input+ to start reading bytes (default 0)
97
+ #
98
def self.get_int32le(input, index = 0)
  # Decodes the four bytes of +input+ starting at byte offset +index+ as a
  # signed 32-bit little-endian integer and returns it.
  #
  # 'l<' is explicitly little-endian, which replaces the previous "unpack in
  # native order, reversing first on big-endian hosts" approach. #byteslice
  # makes the offset count bytes regardless of the string's encoding tag.
  input.byteslice(index, 4).unpack1('l<')
end
104
+
105
+ # Parse a ctime/mtime 8-byte sequence (4 16-bit little endian pairs) into a
106
+ # returned Ruby Time object, or +nil+ if the values are all zero.
107
+ #
108
+ # +data+:: ASCII-8BIT encoded string, 8 bytes long.
109
+ #
110
def self.get_time(data)
  # +data+ is read as two unsigned 32-bit little-endian words: the low word
  # at offset 0 and the high word at offset 4. An all-zero value means "no
  # timestamp" and yields nil.
  upper = get_uint32le(data, 4)
  lower = get_uint32le(data, 0)

  return nil if upper.zero? && lower.zero?

  # Each word is scaled by 1e7 (ticks -> seconds) before the high word is
  # shifted up by 2**32; 11644473600 is then subtracted to land on the Unix
  # epoch expected by Time.at.
  seconds = (upper / 1e7) * (2**32) + (lower / 1e7)

  Time.at(seconds - 11644473600).utc
end
121
+
122
+ # =========================================================================
123
+ # PUBLIC INSTANCE INTERFACE
124
+ # =========================================================================
125
+
126
+ attr_accessor :full_paths, :file_index
127
+
128
def initialize
  # Start out with the default root entry and seed file in place.
  reinit
end
131
+
132
+ # Add a file entry. Supports only root filenames only. File must not be
133
+ # added already.
134
+ #
135
+ # +name+:: Filename, e.g. "Foo", in your preferred string encoding
136
+ # +content+:: Mandatory ASCII-8BIT encoded string containing file data
137
+ #
138
def add(name, content)
  # Registers +content+ under +name+ and returns the new entry object.
  # The directory structure is rebuilt straight away.
  reinit

  root = full_paths[0]

  # A name that already starts with the root path is used verbatim; anything
  # else is placed underneath the root, collapsing doubled slashes.
  fpath =
    if name.slice(0, root.size) == root
      name
    else
      prefix = root.end_with?('/') ? root : root + '/'
      (prefix + name).gsub('//', '/')
    end

  file = OpenStruct.new({name: filename(name), type: 2, content: content, size: content.bytesize})

  file_index << file
  full_paths << fpath

  rebuild(force_gc: true)

  file
end
159
+
160
+ # Compile and return the CFB file data.
161
+ #
162
def write
  # Builds the complete CFB byte string from the current entries and returns
  # it as an ASCII-8BIT string.
  #
  # The seed file is deliberately left in the output: the earlier idea of
  # stripping it here was dropped to keep output parity with the JS code
  # for test verification.
  rebuild(force_gc: false)

  # Count the 64-byte mini sectors and 512-byte regular sectors needed by
  # the stream payloads. Empty or absent content takes no space.
  mini_size = 0
  fat_size  = 0

  file_index.each do |entry|
    len = entry&.content&.bytesize
    next if len.nil? || len.zero?

    if len < 0x1000
      mini_size += (len + 0x3F) >> 6
    else
      fat_size += (len + 0x01FF) >> 9
    end
  end

  # Number of DIFAT sectors needed once the 109 header slots are used up.
  difat_sectors_for = lambda do |fat_sectors|
    fat_sectors <= 109 ? 0 : ((fat_sectors - 109).to_f / 0x7F).ceil
  end

  dir_cnt   = (full_paths.size + 3) >> 2
  mini_cnt  = (mini_size + 7) >> 3
  mfat_cnt  = (mini_size + 0x7F) >> 7
  fat_base  = mini_cnt + fat_size + dir_cnt + mfat_cnt
  fat_cnt   = (fat_base + 0x7F) >> 7
  difat_cnt = difat_sectors_for.call(fat_cnt)

  # The FAT has to describe its own sectors and the DIFAT ones too, so grow
  # it until everything fits.
  while ((fat_base + fat_cnt + difat_cnt + 0x7F) >> 7) > fat_cnt
    fat_cnt  += 1
    difat_cnt = difat_sectors_for.call(fat_cnt)
  end

  # Layout, counted in sectors including the header as the first one:
  # header, DIFAT, FAT, MiniFAT, directory, regular streams, mini stream.
  header_cnt       = 1
  ministream_start = header_cnt + difat_cnt + fat_cnt + mfat_cnt + dir_cnt + fat_size
  total_sectors    = ministream_start + ((mini_size + 7) >> 3)

  root       = file_index[0]
  root.size  = mini_size << 6
  root.start = ministream_start

  o = String.new(encoding: 'ASCII-8BIT')

  # --- Header ---------------------------------------------------------------

  o << HEADER_SIGNATURE
  o << NUL * 16
  [0x003E, 0x0003, 0xFFFE, 0x0009, 0x0006].each { |word| o << write_shift(2, word) }
  o << NUL * 6

  o << write_shift( 4, 0)
  o << write_shift( 4, fat_cnt)
  o << write_shift( 4, header_cnt + difat_cnt + fat_cnt + mfat_cnt - 1)
  o << write_shift( 4, 0)
  o << write_shift( 4, 1 << 12)
  o << write_shift( 4, mfat_cnt.zero? ? ENDOFCHAIN : header_cnt + difat_cnt + fat_cnt - 1)
  o << write_shift( 4, mfat_cnt)
  o << write_shift(-4, difat_cnt.zero? ? ENDOFCHAIN : header_cnt - 1)
  o << write_shift( 4, difat_cnt)

  # --- DIFAT: 109 slots in the header, the rest in DIFAT sectors -------------

  fat_slot = ->(n) { n < fat_cnt ? difat_cnt + n : -1 }

  109.times { |n| o << write_shift(-4, fat_slot.call(n)) }

  slot = 109

  difat_cnt.times do |sector_no|
    while slot < 236 + sector_no * 127
      o << write_shift(-4, fat_slot.call(slot))
      slot += 1
    end

    # Last word of a DIFAT sector links to the next one.
    o << write_shift(-4, sector_no == difat_cnt - 1 ? ENDOFCHAIN : sector_no + 1)
  end

  # --- FAT ------------------------------------------------------------------

  # "cursor" is the index of the next allocation entry to emit; "chain_end"
  # is the index just past the chain being emitted. The lambda appends one
  # chain of +length+ entries, each pointing at its successor and the last
  # one holding ENDOFCHAIN.
  cursor    = 0
  chain_end = 0

  append_chain = lambda do |length|
    chain_end += length

    while cursor < chain_end - 1
      o << write_shift(-4, cursor + 1)
      cursor += 1
    end

    unless length.zero?
      cursor += 1
      o << write_shift(-4, ENDOFCHAIN)
    end
  end

  difat_cnt.times { o << write_shift(-4, DIFSECT) }
  fat_cnt.times   { o << write_shift(-4, FATSECT) }
  cursor = chain_end = difat_cnt + fat_cnt

  append_chain.call(mfat_cnt)
  append_chain.call(dir_cnt)

  file_index.each do |entry|
    next if entry.content.nil?

    len = entry.content.bytesize
    next if len < 0x1000

    entry.start = chain_end
    append_chain.call((len + 0x01FF) >> 9)
  end

  append_chain.call((mini_size + 7) >> 3)

  o << write_shift(-4, ENDOFCHAIN) while (o.size & 0x1FF) != 0

  # --- MiniFAT --------------------------------------------------------------

  cursor = chain_end = 0

  file_index.each do |entry|
    next if entry.content.nil?

    len = entry.content.bytesize
    next if len == 0 || len >= 0x1000

    entry.start = chain_end
    append_chain.call((len + 0x3F) >> 6)
  end

  o << write_shift(-4, ENDOFCHAIN) while (o.size & 0x1FF) != 0

  # --- Directory: four 128-byte entries per directory sector ----------------

  (dir_cnt << 2).times do |slot_no|
    path = full_paths[slot_no]

    if path.blank?
      # Unused slot: 17 zero words, 3 words of -1, then 12 zero words.
      17.times { o << write_shift(4, 0) }
      3.times  { o << write_shift(4, -1) }
      12.times { o << write_shift(4, 0) }
      next
    end

    entry = file_index[slot_no]

    if slot_no.zero?
      entry.start = (entry.size.blank? || entry.size.zero?) ? ENDOFCHAIN : entry.start - 1
    end

    label = entry.name
    label = label[0...32] if label.size > 32

    o << write_shift(64, label, 'utf16le')
    o << write_shift(2, 2 * (label.size + 1))
    o << write_shift(1, entry.type)
    o << write_shift(1, entry.color)
    o << write_shift(-4, entry.L)
    o << write_shift(-4, entry.R)
    o << write_shift(-4, entry.C)

    if entry.clsid.blank?
      4.times { o << write_shift(4, 0) }
    else
      o << entry.clsid
    end

    o << write_shift(4, (entry.state.blank? || entry.state.zero?) ? 0 : entry.state)
    4.times { o << write_shift(4, 0) }
    o << write_shift(4, entry.start)
    o << write_shift(4, entry.size)
    o << write_shift(4, 0)
  end

  # --- Stream data: regular streams first, then the mini stream -------------

  streams = file_index.drop(1)

  streams.each do |entry|
    next unless entry.size.present? && entry.size >= 0x1000

    o << NUL while o.size < ((entry.start + 1) << 9)
    o << entry.content
    o << NUL while o.size % 512 != 0
  end

  streams.each do |entry|
    next unless entry.size.present? && entry.size > 0 && entry.size < 0x1000

    o << entry.content
    o << NUL while o.size % 64 != 0
  end

  o << NUL while o.size < (total_sectors << 9)

  o
end # "def write"
420
+
421
+ # Parses an input file into this object, allowing you to extract individual
422
+ # files thereafter via #read.
423
+ #
424
+ # +file+:: Source I/O stream. Data is read from the current file pointer,
425
+ # which will therefore have advanced when the method returns.
426
+ #
427
def parse!(file)
  # Reads a whole CFB container from +file+ and replaces #file_index and
  # #full_paths with what was found. +file+ is closed before returning,
  # whether or not parsing succeeded. Raises RuntimeError on a malformed or
  # unsupported container.
  raise "CFB corrupt - file size < 512 bytes" if file.size < 512

  # [MS-CFB] 2.2 Compound File Header
  # The major version decides the sector size.
  #
  major, minor = check_get_mver(file)

  ssz =
    case major
    when 3 then 512
    when 4 then 4096
    else
      raise 'Zip contents are not supported' if major == 0 && minor == 0

      # Report the version actually read from the file.
      raise "Major version: Only 3 or 4 is supported; #{major} encountered"
    end

  check_shifts(file, major)

  # Number of Directory Sectors
  #
  dir_cnt = read_shift(file, 4, 'i')
  raise "Directory sectors: Expected 0, saw #{dir_cnt}" if major == 3 && dir_cnt != 0

  # Number of FAT Sectors (skipped)
  #
  file.seek(file.pos + 4)

  # First Directory Sector Location
  #
  dir_start = read_shift(file, 4, 'i')

  # Transaction Signature (skipped)
  #
  file.seek(file.pos + 4)

  # Mini Stream Cutoff Size
  #
  check_field(file, "\x00\x10\x00\x00", 'Mini stream cutoff size')

  # First Mini FAT Sector Location, Number of Mini FAT Sectors,
  # First DIFAT sector location, Number of DIFAT Sectors
  #
  minifat_start = read_shift(file, 4, 'i')
  nmfs          = read_shift(file, 4, 'i')
  difat_start   = read_shift(file, 4, 'i')
  difat_sec_cnt = read_shift(file, 4, 'i')

  # Grab FAT Sector Locations from the header's 109 slots, stopping at the
  # first negative (special marker) value.
  #
  fat_addrs = []

  109.times do
    q = read_shift(file, 4, 'i')
    break if q < 0

    fat_addrs << q
  end

  # Break the file up into sectors, skipping the file header of 'ssz' size.
  #
  file.seek(ssz)

  sectors = []
  sectors << file.read(ssz) until file.eof?

  sleuth_fat(difat_start, difat_sec_cnt, sectors, ssz, fat_addrs)

  # Chains
  #
  sector_list = make_sector_list(sectors, dir_start, fat_addrs, ssz)
  sector_list[dir_start].name = '!Directory'

  if nmfs > 0 && minifat_start != ENDOFCHAIN
    sector_list[minifat_start].name = '!MiniFAT'
  end

  sector_list[fat_addrs[0]].name = '!FAT'
  sector_list.fat_addrs = fat_addrs
  sector_list.ssz = ssz

  # [MS-CFB] 2.6.1 Compound File Directory Entry
  #
  files = {}
  paths = []

  self.full_paths = []
  self.file_index = []

  read_directory(
    dir_start,
    sector_list,
    sectors,
    paths,
    nmfs,
    files,
    minifat_start
  )

  build_full_paths(paths)
ensure
  file.close() unless file.nil?
end # "def parse!"
550
+
551
+ # =========================================================================
552
+ # PRIVATE INSTANCE METHODS
553
+ # =========================================================================
554
+ #
555
+ private
556
+
557
+ # Initialise or reinitialise the internal file data. After being called
558
+ # for the first time, calling here is really only useful to make sure
559
+ # that internal file path and index arrays look consistent.
560
+ #
561
def reinit
  # Makes sure both bookkeeping arrays exist and agree in length; when they
  # are empty, installs the root entry plus the initial seed file.
  self.full_paths ||= []
  self.file_index ||= []

  raise 'Inconsistent CFB structure' unless full_paths.size == file_index.size
  return unless full_paths.empty?

  root = 'Root Entry'

  full_paths << root + '/'
  file_index << OpenStruct.new({name: root, type: 5})

  # Add starting seed file
  #
  seed_payload = [55, 50, 54, 50].pack('C*')

  full_paths << full_paths[0] + SEED_FILENAME
  file_index << OpenStruct.new({name: SEED_FILENAME, type: 2, content: seed_payload, R: 69, L: 69, C: 69})
end
584
+
585
+ # Strange function that's very much not the same as "File.dirname".
586
+ #
587
def dirname(p)
  # Returns the containing directory of +p+, always with a trailing slash
  # when one can be found. A path without any parent is returned unchanged.
  unless p.end_with?('/')
    cut = p.rindex('/')
    return cut.nil? ? p : p[0..cut]
  end

  # Directory paths: drop the trailing slash and look again, unless that
  # was the only slash, in which case +p+ is its own parent.
  trimmed = p.chomp('/')
  trimmed.include?('/') ? dirname(trimmed) : p
end
597
+
598
+ # Strange function that's very much not the same as "File.basename".
599
+ #
600
def filename(p)
  # Returns the last path component of +p+, ignoring any trailing slashes.
  leaf = p
  leaf = leaf.chomp('/') while leaf.end_with?('/')

  slash = leaf.rindex('/')
  slash.nil? ? leaf : leaf[(slash + 1)..-1]
end
609
+
610
+ # Compare file-path-name with some FAT concepts thrown in (L vs R); related
611
+ # to CFB section 2.6.4 (red-black trees).
612
+ #
613
def namecmp(l, r)
  # Orders two slash-separated paths component by component: a shorter
  # component sorts first, equal-length components are compared as strings,
  # and when one path is a prefix of the other the one with fewer components
  # sorts first. Returns a negative number, zero or a positive number.
  el = l.split('/')
  ar = r.split('/')

  [el.size, ar.size].min.times do |i|
    by_length = el[i].size - ar[i].size
    return by_length unless by_length.zero?

    # Compare against the matching *component* of the right-hand path; the
    # raw string +r+ indexed by +i+ would be a single unrelated character.
    return el[i] < ar[i] ? -1 : 1 if el[i] != ar[i]
  end

  el.size - ar.size
end
630
+
631
+ # CFB internal knowledge would be required to understand this; seems to be
632
+ # recalculating data structures that then theoretically would make life
633
+ # easier during the file output stage.
634
+ #
635
def rebuild(force_gc: false)
  # Recomputes the directory bookkeeping (names, sibling/child links, sizes,
  # types) for all entries and re-sorts them with #namecmp.
  #
  # +force_gc+:: When true the rebuild always happens. Otherwise the entries
  #              are scanned first and the rebuild only happens if something
  #              looks out of date (unknown type, missing or suspicious
  #              L/R/C links).
  reinit

  s  = false
  gc = force_gc

  unless gc == true
    (full_paths.size - 1).downto(0) do |i|
      file = file_index[i]

      case file.type
      when 0
        if s == true
          gc = true
        else
          # Trailing unknown-type entries are simply dropped.
          file_index.pop
          full_paths.pop
        end

      when 1, 2, 5
        s = true

        # Any link that was never set means the tree must be rebuilt...
        gc ||= [file.R, file.L, file.C].any?(&:nil?)
        # ...as do identical, non-negative left and right links.
        gc ||= file.R > -1 && file.L > -1 && file.R == file.L

      else
        gc = true
      end
    end
  end

  return unless gc == true

  now = Date.parse('1987-01-19')

  # Track which names exist
  #
  track_full_paths = {}
  data = []

  full_paths.each_with_index do |path, i|
    track_full_paths[path] = true
    next if file_index[i].type == 0

    data.push([path, file_index[i]])
  end

  # Make sure every entry has all of its parent storages present, creating
  # the missing ones from the outermost inwards. Only the entries present
  # before this pass need checking; created parents have their own parents
  # created by the same walk.
  0.upto(data.size - 1) do |i|
    dad = dirname(data[i][0])

    while track_full_paths[dad].blank?
      # Climb to the highest ancestor that is still missing. The equality
      # check stops the climb at a path that is its own parent.
      loop do
        parent = dirname(dad)
        break if parent == dad || parent.blank? || track_full_paths[parent].present?

        dad = parent
      end

      data.push([
        dad,
        OpenStruct.new({
          name:    filename(dad).gsub('/', ''),
          type:    1,
          clsid:   HEADER_CLSID,
          ct:      now,
          mt:      now,
          content: nil
        })
      ])

      # Add name to set
      #
      track_full_paths[dad] = true

      dad = dirname(data[i][0])
    end
  end

  data.sort! { |x, y| namecmp(x[0], y[0]) }

  self.full_paths = data.map { |pair| pair[0] }
  self.file_index = data.map { |pair| pair[1] }

  0.upto(data.size - 1) do |i|
    nm  = full_paths[i]
    elt = file_index[i]

    elt.name  = filename(nm).gsub('/', '')
    elt.color = 1
    elt.L     = -1
    elt.R     = -1
    elt.C     = -1
    elt.size  = elt.content.nil? ? 0 : elt.content.bytesize
    elt.start = 0
    elt.clsid = elt.clsid || HEADER_CLSID

    if i == 0
      # Root entry: its child is simply the next entry, if there is one.
      elt.C    = data.size > 1 ? 1 : -1
      elt.size = 0
      elt.type = 5

    elsif nm.end_with?('/')
      # Storage: first later entry inside it becomes the child, first later
      # entry sharing its parent becomes the right sibling.
      j = i + 1
      while j < data.size do
        break if dirname(full_paths[j]) == nm
        j += 1
      end

      elt.C = j >= data.size ? -1 : j

      j = i + 1
      while j < data.size do
        break if dirname(full_paths[j]) == dirname(nm)
        j += 1
      end

      elt.R    = j >= data.size ? -1 : j
      elt.type = 1

    else
      # Stream: chained to the next entry when that one shares its parent.
      elt.R    = i + 1 if dirname(full_paths[i + 1] || '') == dirname(nm)
      elt.type = 2

    end
  end
end
763
+
764
+ # Returns a chunk of data representing a converted write of the input in
765
+ # the +value+ parameter.
766
+ #
767
+ # The JS code from which this was ported has a very, VERY strange method
768
+ # signature...
769
+ #
770
+ # +size+:: Either a number of bytes to write or a format specifier (see
771
+ # below).
772
+ #
773
+ # +value+:: A value to write; its type is interpreted through both the
774
+ # +size+ and +format+ parameters.
775
+ #
776
+ # +format+:: Either 'hex' or 'utf16le' in which case the value is treated
777
+ # as a hex string (e.g. "deadbeef", high nibble first) or
778
+ # character data in arbitrary Ruby string encoding; written to
779
+ # the output as parsed bytes from the hex data, or little
780
+ # endian UTF-16 byte pairs, respectively. If the input value
781
+ # is longer than +size+ *IN BYTES* then it is truncated, else
782
+ # if need be, padded with zeros - again *IN BYTES*, so the
783
+ # maximum length in characters of a "utf16le" string is half
784
+ # the amount in +size+.
785
+ #
786
+ # If +format+ is something else or omitted, "size" becomes an
787
+ # indication of format (!). The value is treated as an 8-bit
788
+ # byte (+size+ is 1) and masked as such, 16-bit unsigned
789
+ # little-endian value (2), or uint32 (4) - or a signed int32
790
+ # (+size+ is -4 - yes, that's minus 4) - written out as four
791
+ # bytes, little-endian.
792
+ #
793
def write_shift(size, value, format = nil)
  # Converts +value+ into bytes and returns them as a string; nothing is
  # written anywhere by this method itself.
  #
  # With +format+ 'hex' or 'utf16le', +size+ is the output length in bytes
  # and the converted value is NUL-padded or truncated to fit. Otherwise
  # +size+ selects an integer layout: 1 (uint8), 2 (uint16 LE), 4 (uint32
  # LE) or -4 (int32 LE). Any other +size+ yields nil.
  case format
  when 'hex'
    [value].pack('H*').ljust(size, NUL)[0...size]

  when 'utf16le'
    # Pad/truncate in characters first; each then becomes two bytes.
    width = size / 2
    value.ljust(width, NUL)[0...width].encode('UTF-16LE').force_encoding('ASCII-8BIT')

  else
    case size
    when 1
      [value].pack('C')  # Unsigned 8-bit, bitwise truncated
    when 2
      [value].pack('v')  # Unsigned 16-bit little-endian, bitwise truncated
    when 4
      [value].pack('V')  # Unsigned 32-bit little-endian, bitwise truncated
    when -4
      [value].pack('l<') # Signed 32-bit, explicitly little-endian on any host
    end
  end
end
826
+
827
+ # A method that's a companion to #write_shift and equally strange!
828
+ #
829
+ # Read from file for 'size' bytes if size is 1, 2 or 4, parsing the bytes
830
+ # as an 8-bit unsigned, 16-bit unsigned or 32-bit integer where the value
831
+ # of 't' indicates if the 32-bit integer is signed ('t' is string 'i') or
832
+ # unsigned ('t' is anything else); or if size is 16, just return a string
833
+ # of 16 bytes read as-is.
834
+ #
835
+ # This implementation is slightly cleaner and more appropriate than the
836
+ # one in the original source, by omitting unused conversions.
837
+ #
838
+ # +file+:: Source I/O stream. Data is read from the current file pointer,
839
+ # which will therefore have advanced when the method returns.
840
+ #
841
+ # +size+:: 1, 2, 4 to read 1, 2 or 4 bytes returned as a parsed 8, 16 or
842
+ # 32-bit little-endian integer respectively, or pass 16 to read
843
+ # 16 bytes of raw data returned as an ASCII-8BIT encoded string.
844
+ #
845
+ # +type+:: If +size+ is 4, pass 'i' to read as a signed 32-bit integer,
846
+ # else (omitted, or not 'i') value is read as unsigned.
847
+ #
848
def read_shift(file, size, t = nil)
  # Consumes +size+ bytes from +file+ and decodes them according to +size+:
  # 1 -> unsigned byte, 2 -> unsigned 16-bit LE, 4 -> 32-bit LE (signed only
  # when +t+ is 'i'), 16 -> the raw 16 bytes. Any other +size+ reads nothing
  # and returns nil.
  if size == 1
    file.read(1).getbyte(0)
  elsif size == 2
    file.read(2).unpack1('v')
  elsif size == 4
    raw = file.read(4)
    t == 'i' ? self.class.get_int32le(raw) : self.class.get_uint32le(raw)
  elsif size == 16
    file.read(16)
  end
end
869
+
870
+ # Read from the file, expecting to see a particular value; if not, throw
871
+ # an exception.
872
+ #
873
+ # +file+:: Source I/O stream. Data is read from the current file
874
+ # pointer, which will therefore have advanced when the
875
+ # method returns.
876
+ #
877
+ # +expected+:: The expected value, as a String that'll be forced to
878
+ # ASCII-8BIT encoding, if not that way already.
879
+ #
880
+ # +field_name+:: The field name to include in the raised exception, just
881
+ # for human diagnostic purposes.
882
+ #
883
def check_field(file, expected, field_name)
  # Reads as many bytes from +file+ as +expected+ holds and raises a
  # RuntimeError naming +field_name+ unless they match byte for byte.
  wanted = expected.dup.force_encoding('ASCII-8BIT')
  actual = file.read(wanted.bytesize)

  return if actual == wanted

  raise "#{field_name}: Expected #{wanted.inspect}, but got #{actual.inspect}"
end
891
+
892
+ # Return a tuple array of major, minor file version, with 0, 0 for ZIP
893
+ # files, else read from the CFB file, checking header in passing. File
894
+ # pointer is assumed to be at zero on entry.
895
+ #
896
+ # +file+:: Source I/O stream. Data is read from the current file pointer,
897
+ # which will therefore have advanced when the method returns.
898
+ #
899
def check_get_mver(file)
  # Returns [major, minor] as read from the CFB header, or [0, 0] when the
  # data starts with the two bytes "PK" and so looks like a ZIP file. Raises
  # RuntimeError when the CFB header signature is wrong.
  #
  # IO#read returns a String, so the leading bytes are compared with a
  # String; a comparison with Integer byte values can never be true.
  return [0, 0] if file.read(2) == 'PK'

  file.rewind
  check_field(file, HEADER_SIGNATURE, 'Header signature')

  file.seek(file.pos + 16) # Skip all-NUL CLSID, 16 bytes

  # The minor version is stored first, then the major version.
  minor = read_shift(file, 2)
  major = read_shift(file, 2)

  [major, minor]
end
913
+
914
+ # Check sector shifts in the file header.
915
+ #
916
+ # +file+:: Source I/O stream. Data is read from the current file pointer,
917
+ # which will therefore have advanced when the method returns.
918
+ #
919
+ # +major+:: Major version number - must be 3 or 4.
920
+ #
921
def check_shifts(file, major)
  # Validates the sector shift, mini sector shift and reserved bytes of the
  # header, raising RuntimeError on the first field that is not acceptable.
  # Reads from the current position of +file+.

  # Skip byte order marker (always indicates little-endian)
  #
  file.seek(file.pos + 2)

  shift = read_shift(file, 2)

  raise "Sector shift: Unsupported value #{shift}" unless [0x09, 0x0c].include?(shift)

  # The sector shift must be the one belonging to the major version: 9 for
  # version 3, 12 for version 4. The message states the value that was
  # expected for this major version, not the value that was just read.
  expected = { 3 => 0x09, 4 => 0x0c }.fetch(major) do
    raise "Sector shift: Unsupported major version #{major}"
  end

  raise "Sector shift: Expected #{expected}, saw #{shift}" if shift != expected

  # Mini Sector Shift
  #
  check_field(file, "\x06\x00", 'Mini sector shift')

  # Reserved
  #
  check_field(file, "\x00\x00\x00\x00\x00\x00", 'Reserved')
end
946
+
947
+ # Chase down the rest of the DIFAT chain to build a comprehensive list
948
+ # DIFAT chains by storing the next sector number as the last 32 bits.
949
+ #
950
+ # +idx+:: Sector index; usually, start DIFAT sector initially
951
+ # +cnt+:: DIFAT sector count expected
952
+ # +sectors+:: Array of sectors
953
+ # +ssz+:: Size of a sector
954
+ # +fat_addrs+:: Array MODIFIED IN PLACE with sector addresses added
955
+ #
956
def sleuth_fat(idx, cnt, sectors, ssz, fat_addrs)
  # Walks the chain of DIFAT sectors starting at sector +idx+, appending the
  # FAT sector addresses found to +fat_addrs+ (modified in place). At most
  # +cnt+ links are followed after the first sector. Raises when the chain
  # ends while +cnt+ says more sectors should follow. Always returns nil.
  entries_per_sector = (ssz >> 2) - 1

  loop do
    if idx == ENDOFCHAIN
      raise 'DIFAT chain shorter than expected' if cnt != 0
      return
    end

    return if idx == FREESECT

    sector = sectors[idx]
    return if sector.nil?

    # All words but the last are FAT sector addresses; an ENDOFCHAIN word
    # ends the list early.
    entries_per_sector.times do |slot|
      q = self.class.get_int32le(sector, slot * 4)
      break if q == ENDOFCHAIN

      fat_addrs << q
    end

    return unless cnt >= 1

    # The last word of the sector holds the number of the next DIFAT sector.
    idx  = self.class.get_int32le(sector, ssz - 4)
    cnt -= 1
  end
end
987
+
988
+ # Follow the linked list of sectors for a given starting point.
989
+ #
990
+ # Parameters need to be guessed from caller use cases.
991
+ #
992
def get_sector_list(sectors, start, fat_addrs, ssz, chkd = nil)
  # Follows the FAT chain that begins at sector +start+ and returns an
  # OpenStruct with +nodes+ (the sector numbers visited, in order) and
  # +data+ (the joined contents of those sectors).
  #
  # +sectors+::   Array of sector contents
  # +start+::     First sector number of the chain; negative means "none"
  # +fat_addrs+:: Array of the sector numbers that hold the FAT
  # +ssz+::       Sector size in bytes
  # +chkd+::      Optional array in which every visited sector number is
  #               flagged +true+. It may be omitted; the directory reader
  #               calls this method without it.
  #
  # Raises RuntimeError if a FAT entry would straddle a sector boundary.
  chkd ||= []

  visited = {} # Guards against chains that loop back on themselves
  nodes   = []
  chunks  = []
  modulus = ssz - 1
  j       = start

  while j >= 0 && !visited[j]
    visited[j] = true
    chkd[j]    = true

    nodes  << j
    chunks << sectors[j]

    # Locate the FAT sector holding entry "j" and the offset of the entry
    # within that sector.
    addr = fat_addrs[(j * 4) / ssz]
    jj   = (j * 4) & modulus

    raise "FAT boundary crossed: #{j} 4 #{ssz}" if ssz < 4 + jj

    # A missing FAT sector ends the chain. Negative addresses are special
    # markers and must not be allowed to index from the end of the array.
    break if addr.nil? || addr.negative? || sectors[addr].nil?

    following = self.class.get_int32le(sectors[addr], jj)
    break if following.nil? # FAT sector too short to hold this entry

    j = following
  end

  OpenStruct.new(nodes: nodes, data: chunks.join)
end
1016
+
1017
+ # Chase down the sector linked lists.
1018
+ #
1019
+ # Parameters need to be guessed from caller use cases.
1020
+ #
1021
def make_sector_list(sectors, dir_start, fat_addrs, ssz)
  # Builds a SectorList in which the slot of every chain's first sector
  # holds an OpenStruct with +nodes+ (sector numbers of the chain) and
  # +data+ (their joined contents). Sectors are visited starting at
  # +dir_start+ and wrapping around, and a sector already seen as part of an
  # earlier chain does not start a chain of its own.
  #
  # Raises RuntimeError if a FAT entry would straddle a sector boundary.
  sl          = sectors.length
  sector_list = SectorList.new
  modulus     = ssz - 1

  # Hashes rather than arrays: a corrupt file can name arbitrarily large
  # sector numbers, and flagging those must not allocate a huge array.
  chkd = {}

  0.upto(sl - 1) do |offset|
    k  = offset + dir_start
    k -= sl if k >= sl

    next if chkd[k]

    buf       = []
    buf_chain = []
    seen      = {}
    j         = k

    while j >= 0
      seen[j] = true
      chkd[j] = true

      buf       << j
      buf_chain << sectors[j]

      # Locate the FAT sector holding entry "j" and the offset of the entry
      # within that sector.
      addr = fat_addrs[(j * 4) / ssz]
      jj   = (j * 4) & modulus

      raise "FAT boundary crossed: #{j} 4 #{ssz}" if ssz < 4 + jj

      # A missing FAT sector ends the chain. Negative addresses are special
      # markers and must not be allowed to index from the end of the array.
      break if addr.nil? || addr.negative? || sectors[addr].nil?

      following = self.class.get_int32le(sectors[addr], jj)
      break if following.nil? || seen[following]

      j = following
    end

    sector_list[k] = OpenStruct.new(nodes: buf, data: buf_chain.join)
  end

  sector_list
end
1067
+
1068
+ # [MS-CFB] 2.6.1 Compound File Directory Entry.
1069
+ #
1070
+ # Parameters need to be guessed from caller use cases.
1071
+ #
1072
def read_directory(dir_start, sector_list, sectors, paths, nmfs, files, mini)
  # Decodes the 128-byte directory entries found in the chain that starts at
  # sector +dir_start+. For every entry the name is appended to +paths+, the
  # entry object is stored in +files+ (keyed by name) and appended to
  # #file_index. Stream contents are attached as +content+ where they can be
  # located.
  #
  # +nmfs+ is the number of MiniFAT sectors and +mini+ the first MiniFAT
  # sector number, both as read from the header by the caller.
  minifat_store  = 0
  pl             = paths.any? ? 2 : 0
  sector         = sector_list[dir_start].data
  nul_terminator = String.new("\x00\x00", encoding: 'UTF-16LE')
  i              = 0

  while i < sector.size
    blob = StringIO.new(sector.slice(i, 128))

    # The name length (in bytes) lives after the 64-byte name field.
    blob.seek(64)
    namelen = read_shift(blob, 2)

    blob.seek(0)
    name = blob.read(namelen - pl).force_encoding('UTF-16LE')
    name.chomp!(nul_terminator)
    name.encode!('UTF-8')

    paths << name

    blob.seek(66)
    o = OpenStruct.new({
      name:  name,
      type:  read_shift(blob, 1),
      color: read_shift(blob, 1),
      L:     read_shift(blob, 4, 'i'),
      R:     read_shift(blob, 4, 'i'),
      C:     read_shift(blob, 4, 'i'),
      clsid: read_shift(blob, 16),
      state: read_shift(blob, 4, 'i'),
      start: 0,
      size:  0
    })

    o.ct    = self.class.get_time(blob.read(8))
    o.mt    = self.class.get_time(blob.read(8))
    o.start = read_shift(blob, 4, 'i')
    o.size  = read_shift(blob, 4, 'i')

    # Negative size and start together mark an unused entry.
    if o.size < 0 && o.start < 0
      o.size  = o.type = 0
      o.start = ENDOFCHAIN
      o.name  = ''
    end

    if o.type == 5 # Root
      minifat_store = o.start

      if nmfs > 0 && minifat_store != ENDOFCHAIN
        sector_list[minifat_store].name = '!StreamData'
      end
    elsif o.size >= MSCSZ
      o.storage = 'fat'

      if sector_list[o.start].nil?
        # No "already checked" list is tracked here, so nil is passed for
        # that parameter explicitly; the helper's signature declares five
        # parameters and a four-argument call is rejected.
        sector_list[o.start] = get_sector_list(sectors, o.start, sector_list.fat_addrs, sector_list.ssz, nil)
      end

      sector_list[o.start].name = o.name
      o.content = sector_list[o.start].data.slice(0, o.size)
    else
      o.storage = 'minifat'

      if o.size < 0
        o.size = 0
      elsif minifat_store != ENDOFCHAIN && o.start != ENDOFCHAIN && !sector_list[minifat_store].nil?
        o.content = get_mfat_entry(o, sector_list[minifat_store].data, sector_list[mini]&.data)
      end
    end

    files[name] = o
    file_index << o

    i += 128
  end
end
1148
+
1149
+ # [MS-CFB] 2.6.4 Red-Black Tree.
1150
+ #
1151
+ # +paths+:: Array of incomplete paths (often just leafnames) where indices
1152
+ # in the array correspond to "self.file_index" entries; contents
1153
+ # in "self.full_paths" will be overwritten if present.
1154
+ #
1155
def build_full_paths(paths)
  # Turns the leaf names in +paths+ into full paths stored in #full_paths,
  # using the L/R/C links of the matching #file_index entries to work out
  # each entry's parent. Non-stream entries get a trailing slash.
  pl  = paths.length
  dad = [] # dad[n] is the index of the parent of entry n, or n if unknown
  q   = [] # work queue of entry indices
  el  = ar = ce = 0

  pl.times do |n|
    dad[n] = q[n] = n
    full_paths[n] = paths[n]
  end

  j = 0

  while j < q.length
    i    = q[j]
    node = file_index[i]
    el   = node.L
    ar   = node.R
    ce   = node.C

    # An entry without a known parent inherits one from a sibling that has.
    if dad[i] == i
      dad[i] = dad[el] if el != NOSTREAM && dad[el] != el
      dad[i] = dad[ar] if ar != NOSTREAM && dad[ar] != ar
    end

    dad[ce] = i if ce != NOSTREAM

    # Siblings share this entry's parent; revisit any that were already
    # processed earlier in the queue.
    [el, ar].each do |sibling|
      next unless sibling != NOSTREAM && i != dad[i]

      dad[sibling] = dad[i]
      q << sibling if q.rindex(sibling) < j
    end

    j += 1
  end

  # NOTE: "el" and "ar" still hold the links of the last entry processed
  # above; this pass uses those values for every index.
  (1...pl).each do |n|
    next unless dad[n] == n

    if ar != NOSTREAM && dad[ar] != ar
      dad[n] = dad[ar]
    elsif el != NOSTREAM && dad[el] != el
      dad[n] = dad[el]
    end
  end

  (1...pl).each do |n|
    next if file_index[n].type == 0 # (unknown)

    k = n

    if k != dad[k]
      # Prefix the names of all ancestors, walking towards the root.
      loop do
        k = dad[k]
        full_paths[n] = full_paths[k] + '/' + full_paths[n]

        break if k == 0 || dad[k] == NOSTREAM || k == dad[k]
      end
    end

    dad[n] = -1
  end

  full_paths[0] << '/'

  1.upto(pl - 1) do |n|
    full_paths[n] << '/' unless file_index[n].type == STREAM
  end
end
1231
+
1232
+ # Read entry contents. Undocumented in JS code; looks like:
1233
+ #
1234
+ # +entry+:: The internal file structure being compiled; updated on exit
1235
+ # +payload+:: MiniFAT sector data (file contents within)
1236
+ # +mini+:: MiniFAT indices (of file contents in sector data)
1237
+ #
1238
+ # Returns the extracted data as an ASCII-8BIT encoded string.
1239
+ #
1240
def get_mfat_entry(entry, payload, mini)
  # Collects the mini-sector chain of +entry+ out of +payload+, following
  # the next-sector links stored in +mini+, and returns the first
  # +entry.size+ bytes of it. Returns an empty string when nothing could be
  # collected (for instance when +mini+ is blank).
  remaining = entry.size
  cursor    = entry.start
  collected = String.new(encoding: 'ASCII-8BIT')

  if mini.present?
    while remaining > 0 && cursor >= 0
      collected << payload.slice(cursor * MSSZ, MSSZ)
      remaining -= MSSZ
      cursor = self.class.get_int32le(mini, cursor * 4)
    end
  end

  collected.empty? ? '' : collected.slice(0, entry.size)
end
1255
+
1256
+ end # "class SimpleCfb"