@cj-tech-master/excelts 6.1.1 → 6.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -345,13 +345,36 @@ export function deflateRawStore(data) {
345
345
  return output.subarray(0, outPos);
346
346
  }
347
347
  // ============================================================================
348
- // LZ77 + Huffman Compression (Basic implementation)
348
+ // LZ77 + Huffman Compression
349
349
  // ============================================================================
350
// Hash table size must be a power of 2 so that HASH_MASK can be used as a
// bit mask. 32768 entries (2^15) keeps memory reasonable while providing a
// good distribution for the 3-byte hash.
const HASH_SIZE = 32768;
const HASH_MASK = HASH_SIZE - 1;
// Maximum hash chain length to walk per position. Longer chains find better
// matches at the cost of speed. 64 is a good balance (~zlib level 5-6).
const MAX_CHAIN_LEN = 64;
// Minimum match length for LZ77 (RFC 1951 minimum).
const MIN_MATCH = 3;
// Maximum match length (RFC 1951 maximum).
const MAX_MATCH = 258;
// Maximum back-reference distance (RFC 1951 / 32 KB sliding window).
const MAX_DIST = 32768;
/**
 * Hash function for 3-byte sequences.
 *
 * Packs the three bytes into a 24-bit key, multiplies it by 0x1e35a7bd with
 * 32-bit wrap-around and keeps the top 15 bits of the product, which is
 * exactly the width of HASH_MASK. A multiplicative hash spreads similar
 * keys better than the naive shift-or key used on its own.
 *
 * `Math.imul` is used for the multiply because it is the explicit 32-bit
 * integer product. A plain `*` goes through double arithmetic and only
 * yields the same low 32 bits while the exact product stays below 2^53,
 * which holds for byte inputs but is an accident of the constant's size.
 *
 * @param a - First byte (0-255)
 * @param b - Second byte (0-255)
 * @param c - Third byte (0-255)
 * @returns Hash bucket index in the range [0, HASH_SIZE)
 */
function hash3(a, b, c) {
  const key = (a << 16) | (b << 8) | c;
  // `>>> 17` reinterprets the signed imul result as unsigned and drops the
  // low 17 bits; the mask is kept as a guard should HASH_SIZE ever change.
  return (Math.imul(key, 0x1e35a7bd) >>> 17) & HASH_MASK;
}
372
+ /**
373
+ * Compress data using DEFLATE with fixed Huffman codes.
352
374
  *
353
- * This provides real compression using LZ77 + fixed Huffman codes.
354
- * Not as efficient as full DEFLATE but much better than STORE mode.
375
+ * Uses LZ77 with hash chains and lazy matching for significantly better
376
+ * compression than a single-entry hash table. The algorithm is modelled
377
+ * after zlib's "fast" and "slow" deflate strategies.
355
378
  *
356
379
  * @param data - Data to compress
357
380
  * @returns Compressed data in deflate-raw format
@@ -369,43 +392,106 @@ export function deflateRawCompressed(data) {
369
392
  // Write final block header with fixed Huffman (BFINAL=1, BTYPE=01)
370
393
  output.writeBits(1, 1); // BFINAL
371
394
  output.writeBits(1, 2); // BTYPE = 01 (fixed Huffman)
372
- // LZ77 compression with hash table
373
- const hashTable = new Map();
395
+ // --- Hash chain tables (typed arrays for performance) ---
396
+ // head[h]: most recent position with hash h (0 = unused, positions are 1-based internally)
397
+ // prev[pos & (MAX_DIST-1)]: previous position in the chain for the same hash
398
+ const head = new Int32Array(HASH_SIZE); // filled with 0 (no match)
399
+ const prev = new Int32Array(MAX_DIST);
374
400
  let pos = 0;
401
+ // State for lazy matching:
402
+ // When we find a match at position N, we check position N+1 too.
403
+ // If N+1 has a longer match we emit a literal for N and use the N+1 match.
404
+ let prevMatchLen = 0;
405
+ let prevMatchDist = 0;
406
+ let prevLiteral = 0;
407
+ let hasPrevMatch = false;
375
408
  while (pos < data.length) {
376
- // Try to find a match
377
409
  let bestLen = 0;
378
410
  let bestDist = 0;
379
411
  if (pos + 2 < data.length) {
380
- const hash = (data[pos] << 16) | (data[pos + 1] << 8) | data[pos + 2];
381
- const matchPos = hashTable.get(hash);
382
- if (matchPos !== undefined && pos - matchPos <= 32768) {
383
- const dist = pos - matchPos;
412
+ const h = hash3(data[pos], data[pos + 1], data[pos + 2]);
413
+ // Walk the hash chain to find the best (longest) match
414
+ let chainLen = MAX_CHAIN_LEN;
415
+ let matchHead = head[h];
416
+ while (matchHead > 0 && chainLen-- > 0) {
417
+ const mPos = matchHead - 1; // convert from 1-based to 0-based
418
+ const dist = pos - mPos;
419
+ if (dist > MAX_DIST || dist <= 0) {
420
+ break;
421
+ }
422
+ // Quick check: compare the byte just beyond current best length first
423
+ // to skip obviously shorter matches early.
424
+ if (bestLen >= MIN_MATCH && data[mPos + bestLen] !== data[pos + bestLen]) {
425
+ matchHead = prev[mPos & (MAX_DIST - 1)];
426
+ continue;
427
+ }
428
+ // Full scan
384
429
  let len = 0;
385
- const maxLen = Math.min(258, data.length - pos);
386
- while (len < maxLen && data[matchPos + len] === data[pos + len]) {
430
+ const maxLen = Math.min(MAX_MATCH, data.length - pos);
431
+ while (len < maxLen && data[mPos + len] === data[pos + len]) {
387
432
  len++;
388
433
  }
389
- if (len >= 3) {
434
+ if (len > bestLen) {
390
435
  bestLen = len;
391
436
  bestDist = dist;
437
+ if (len >= MAX_MATCH) {
438
+ break; // can't do better
439
+ }
440
+ }
441
+ matchHead = prev[mPos & (MAX_DIST - 1)];
442
+ }
443
+ // Insert current position into the hash chain
444
+ prev[pos & (MAX_DIST - 1)] = head[h];
445
+ head[h] = pos + 1; // 1-based
446
+ }
447
+ // --- Lazy matching logic ---
448
+ if (hasPrevMatch) {
449
+ if (bestLen > prevMatchLen) {
450
+ // Current position has a better match; emit previous as literal
451
+ writeLiteralCode(output, prevLiteral);
452
+ // Now adopt current match as the pending one
453
+ prevMatchLen = bestLen;
454
+ prevMatchDist = bestDist;
455
+ prevLiteral = data[pos];
456
+ pos++;
457
+ }
458
+ else {
459
+ // Previous match is at least as good; emit it
460
+ writeLengthCode(output, prevMatchLen);
461
+ writeDistanceCode(output, prevMatchDist);
462
+ // Insert hash entries for the skipped bytes (positions inside the match)
463
+ // so future matches can find them. We already inserted pos-1 (the match
464
+ // start); now insert pos through pos + prevMatchLen - 2.
465
+ const matchEnd = pos - 1 + prevMatchLen;
466
+ for (let i = pos; i < matchEnd && i + 2 < data.length; i++) {
467
+ const h = hash3(data[i], data[i + 1], data[i + 2]);
468
+ prev[i & (MAX_DIST - 1)] = head[h];
469
+ head[h] = i + 1;
392
470
  }
471
+ pos = matchEnd;
472
+ hasPrevMatch = false;
473
+ prevMatchLen = 0;
393
474
  }
394
- // Update hash table
395
- hashTable.set(hash, pos);
396
475
  }
397
- if (bestLen >= 3) {
398
- // Write length/distance pair
399
- writeLengthCode(output, bestLen);
400
- writeDistanceCode(output, bestDist);
401
- pos += bestLen;
476
+ else if (bestLen >= MIN_MATCH) {
477
+ // We have a match; hold it and try the next position (lazy evaluation)
478
+ hasPrevMatch = true;
479
+ prevMatchLen = bestLen;
480
+ prevMatchDist = bestDist;
481
+ prevLiteral = data[pos];
482
+ pos++;
402
483
  }
403
484
  else {
404
- // Write literal
485
+ // No match — emit literal
405
486
  writeLiteralCode(output, data[pos]);
406
487
  pos++;
407
488
  }
408
489
  }
490
+ // Flush any pending lazy match
491
+ if (hasPrevMatch) {
492
+ writeLengthCode(output, prevMatchLen);
493
+ writeDistanceCode(output, prevMatchDist);
494
+ }
409
495
  // Write end-of-block symbol (256)
410
496
  writeLiteralCode(output, 256);
411
497
  return output.finish();
@@ -600,7 +686,10 @@ const WINDOW_SIZE = 32768;
600
686
  * maintains state across multiple `write()` calls:
601
687
  *
602
688
  * - **LZ77 sliding window**: back-references can span across chunks.
603
- * - **Hash table**: match positions persist across chunks.
689
+ * - **Hash chains**: match positions persist across chunks with typed-array
690
+ * hash tables for fast lookup.
691
+ * - **Lazy matching**: each match is compared with the next position's match
692
+ * to pick the longer one.
604
693
  * - **Bit writer**: bit position is preserved, so consecutive blocks form
605
694
  * a single valid DEFLATE bit-stream without alignment issues.
606
695
  *
@@ -614,13 +703,20 @@ const WINDOW_SIZE = 32768;
614
703
  export class SyncDeflater {
615
704
  constructor() {
616
705
  this._output = new BitWriter();
617
- this._hashTable = new Map();
706
+ // Hash chain tables — shared across chunks for cross-chunk matching.
707
+ this._head = new Int32Array(HASH_SIZE);
708
+ this._prev = new Int32Array(MAX_DIST);
618
709
  /** Sliding window: the last WINDOW_SIZE bytes of uncompressed data. */
619
710
  this._window = new Uint8Array(WINDOW_SIZE);
620
711
  /** Number of valid bytes currently in the window. */
621
712
  this._windowLen = 0;
622
713
  /** Total bytes written so far (monotonically increasing; used for hash offsets). */
623
714
  this._totalIn = 0;
715
+ // Lazy matching state that may span across chunks.
716
+ this._hasPrevMatch = false;
717
+ this._prevMatchLen = 0;
718
+ this._prevMatchDist = 0;
719
+ this._prevLiteral = 0;
624
720
  }
625
721
  /**
626
722
  * Compress a chunk and return the compressed bytes produced so far.
@@ -636,57 +732,163 @@ export class SyncDeflater {
636
732
  out.writeBits(1, 2); // BTYPE = 01 (fixed Huffman)
637
733
  const window = this._window;
638
734
  let wLen = this._windowLen;
639
- const hashTable = this._hashTable;
735
+ const head = this._head;
736
+ const prevArr = this._prev;
640
737
  const totalIn = this._totalIn;
641
- for (let pos = 0; pos < data.length;) {
738
+ let hasPrevMatch = this._hasPrevMatch;
739
+ let prevMatchLen = this._prevMatchLen;
740
+ let prevMatchDist = this._prevMatchDist;
741
+ let prevLiteral = this._prevLiteral;
742
+ /**
743
+ * Insert a global position into the hash chain and the sliding window.
744
+ */
745
+ const insertHash = (localPos) => {
746
+ if (localPos + 2 >= data.length) {
747
+ return;
748
+ }
749
+ const h = hash3(data[localPos], data[localPos + 1], data[localPos + 2]);
750
+ const globalPos = totalIn + localPos;
751
+ prevArr[globalPos & (MAX_DIST - 1)] = head[h];
752
+ head[h] = globalPos + 1; // 1-based
753
+ };
754
+ const insertWindow = (localPos, count) => {
755
+ for (let i = 0; i < count; i++) {
756
+ window[(wLen + i) & (WINDOW_SIZE - 1)] = data[localPos + i];
757
+ }
758
+ wLen += count;
759
+ };
760
+ let pos = 0;
761
+ for (; pos < data.length;) {
642
762
  let bestLen = 0;
643
763
  let bestDist = 0;
644
764
  if (pos + 2 < data.length) {
645
- const h = (data[pos] << 16) | (data[pos + 1] << 8) | data[pos + 2];
646
- const matchGlobalPos = hashTable.get(h);
647
- if (matchGlobalPos !== undefined) {
648
- const dist = totalIn + pos - matchGlobalPos;
649
- if (dist > 0 && dist <= WINDOW_SIZE) {
650
- // Match candidate scan in the sliding window
651
- const wStart = (((wLen - dist) % WINDOW_SIZE) + WINDOW_SIZE) % WINDOW_SIZE;
652
- const maxLen = Math.min(258, data.length - pos);
653
- let len = 0;
654
- while (len < maxLen) {
655
- const wByte = window[(wStart + len) % WINDOW_SIZE];
656
- if (wByte !== data[pos + len]) {
657
- break;
658
- }
659
- len++;
765
+ const h = hash3(data[pos], data[pos + 1], data[pos + 2]);
766
+ const globalPos = totalIn + pos;
767
+ // Walk the hash chain
768
+ let chainLen = MAX_CHAIN_LEN;
769
+ let matchHead = head[h];
770
+ while (matchHead > 0 && chainLen-- > 0) {
771
+ const mGlobalPos = matchHead - 1;
772
+ const dist = globalPos - mGlobalPos;
773
+ if (dist > MAX_DIST || dist <= 0) {
774
+ break;
775
+ }
776
+ // Compare bytes through the sliding window + current chunk
777
+ const maxLen = Math.min(MAX_MATCH, data.length - pos);
778
+ let len = 0;
779
+ // Quick reject on the byte beyond current bestLen
780
+ if (bestLen >= MIN_MATCH) {
781
+ const checkOffset = mGlobalPos + bestLen;
782
+ // Determine the byte at checkOffset
783
+ let checkByte;
784
+ const checkLocal = checkOffset - totalIn;
785
+ if (checkLocal >= 0 && checkLocal < data.length) {
786
+ checkByte = data[checkLocal];
787
+ }
788
+ else {
789
+ checkByte = window[checkOffset & (WINDOW_SIZE - 1)];
790
+ }
791
+ if (checkByte !== data[pos + bestLen]) {
792
+ matchHead = prevArr[mGlobalPos & (MAX_DIST - 1)];
793
+ continue;
794
+ }
795
+ }
796
+ while (len < maxLen) {
797
+ const matchOffset = mGlobalPos + len;
798
+ // Get byte from window or current data
799
+ let matchByte;
800
+ const matchLocal = matchOffset - totalIn;
801
+ if (matchLocal >= 0 && matchLocal < data.length) {
802
+ matchByte = data[matchLocal];
803
+ }
804
+ else {
805
+ matchByte = window[matchOffset & (WINDOW_SIZE - 1)];
806
+ }
807
+ if (matchByte !== data[pos + len]) {
808
+ break;
660
809
  }
661
- if (len >= 3) {
662
- bestLen = len;
663
- bestDist = dist;
810
+ len++;
811
+ }
812
+ if (len > bestLen) {
813
+ bestLen = len;
814
+ bestDist = dist;
815
+ if (len >= MAX_MATCH) {
816
+ break;
664
817
  }
665
818
  }
819
+ matchHead = prevArr[mGlobalPos & (MAX_DIST - 1)];
666
820
  }
667
- hashTable.set(h, totalIn + pos);
821
+ // Insert current position into hash chain
822
+ prevArr[globalPos & (MAX_DIST - 1)] = head[h];
823
+ head[h] = globalPos + 1;
668
824
  }
669
- if (bestLen >= 3) {
670
- writeLengthCode(out, bestLen);
671
- writeDistanceCode(out, bestDist);
672
- // Advance window
673
- for (let i = 0; i < bestLen; i++) {
674
- window[wLen % WINDOW_SIZE] = data[pos + i];
675
- wLen++;
825
+ // --- Lazy matching logic ---
826
+ if (hasPrevMatch) {
827
+ if (bestLen > prevMatchLen) {
828
+ // Current position wins — emit previous as literal
829
+ writeLiteralCode(out, prevLiteral);
830
+ prevMatchLen = bestLen;
831
+ prevMatchDist = bestDist;
832
+ prevLiteral = data[pos];
833
+ insertWindow(pos, 1);
834
+ pos++;
835
+ }
836
+ else {
837
+ // Previous match wins — emit it
838
+ writeLengthCode(out, prevMatchLen);
839
+ writeDistanceCode(out, prevMatchDist);
840
+ // Insert hash entries for skipped positions inside the match
841
+ const matchEnd = pos - 1 + prevMatchLen;
842
+ const insertEnd = Math.min(matchEnd, data.length);
843
+ for (let i = pos; i < insertEnd; i++) {
844
+ insertHash(i);
845
+ }
846
+ insertWindow(pos, insertEnd - pos);
847
+ pos = insertEnd;
848
+ hasPrevMatch = false;
849
+ prevMatchLen = 0;
676
850
  }
677
- pos += bestLen;
851
+ }
852
+ else if (bestLen >= MIN_MATCH) {
853
+ hasPrevMatch = true;
854
+ prevMatchLen = bestLen;
855
+ prevMatchDist = bestDist;
856
+ prevLiteral = data[pos];
857
+ insertWindow(pos, 1);
858
+ pos++;
678
859
  }
679
860
  else {
680
861
  writeLiteralCode(out, data[pos]);
681
- window[wLen % WINDOW_SIZE] = data[pos];
682
- wLen++;
862
+ insertWindow(pos, 1);
683
863
  pos++;
684
864
  }
685
865
  }
866
+ // If there's a pending lazy match and we're at chunk boundary,
867
+ // flush it now (the next chunk will start fresh for lazy matching).
868
+ if (hasPrevMatch) {
869
+ writeLengthCode(out, prevMatchLen);
870
+ writeDistanceCode(out, prevMatchDist);
871
+ // The pending match started at pos-1 and covers prevMatchLen bytes.
872
+ // pos-1 was already hashed/windowed when it was first encountered;
873
+ // now insert the remaining positions (pos .. pos-1+prevMatchLen-1)
874
+ // into hash chains and the sliding window so the next chunk can
875
+ // reference them.
876
+ const matchEnd = Math.min(pos - 1 + prevMatchLen, data.length);
877
+ for (let i = pos; i < matchEnd; i++) {
878
+ insertHash(i);
879
+ }
880
+ insertWindow(pos, matchEnd - pos);
881
+ hasPrevMatch = false;
882
+ prevMatchLen = 0;
883
+ }
686
884
  // End-of-block symbol
687
885
  writeLiteralCode(out, 256);
688
886
  this._windowLen = wLen;
689
887
  this._totalIn = totalIn + data.length;
888
+ this._hasPrevMatch = hasPrevMatch;
889
+ this._prevMatchLen = prevMatchLen;
890
+ this._prevMatchDist = prevMatchDist;
891
+ this._prevLiteral = prevLiteral;
690
892
  // Flush completed bytes from the bit writer
691
893
  return out.flushBytes();
692
894
  }
@@ -88,36 +88,81 @@ export function createUnzlibStream(_options = {}) {
88
88
  // Synchronous stateful deflater (Node.js — native zlib)
89
89
  // =============================================================================
90
90
  /**
91
- * Node.js synchronous deflater using `deflateRawSync` with `Z_SYNC_FLUSH`.
91
+ * Minimum batch size before flushing to the native zlib compressor.
92
92
  *
93
- * Each `write()` compresses the chunk independently (no cross-chunk dictionary)
94
- * but uses `Z_SYNC_FLUSH` so the output is byte-aligned and can be concatenated
95
- * into a single valid DEFLATE stream. The final `finish()` emits a proper
96
- * BFINAL=1 block.
93
+ * Small chunks (e.g. one spreadsheet row 200-400 bytes) compress very
94
+ * poorly when each is given its own zlib context because the LZ77 dictionary
95
+ * starts empty every time. Batching into 64 KB mega-chunks gives zlib
96
+ * enough history to find good matches, bringing compression ratios within
97
+ * ~1% of single-shot compression.
97
98
  *
98
- * This is fast (native C zlib) and produces valid output on all Node.js versions
99
- * (20+). The trade-off is ~2% worse compression ratio vs a stateful context,
100
- * which is acceptable for streaming where memory is the priority.
99
+ * 64 KB is chosen as a sweet spot: large enough for good compression,
100
+ * small enough to keep memory bounded and latency low.
101
+ */
102
+ const SYNC_DEFLATE_BATCH_SIZE = 65536;
103
+ /**
104
+ * Node.js synchronous deflater that batches small writes for better
105
+ * compression.
106
+ *
107
+ * Previous implementation compressed each `write()` call independently
108
+ * with `deflateRawSync()`, creating a fresh zlib context every time.
109
+ * For streaming workloads that push many small chunks (e.g. WorkbookWriter
110
+ * writing one row at a time), this destroyed the LZ77 dictionary between
111
+ * chunks and caused compression ratios to drop from ~82% to ~58%.
112
+ *
113
+ * The new implementation accumulates incoming data into an internal buffer
114
+ * and only calls `deflateRawSync()` when the buffer reaches 64 KB (or on
115
+ * `finish()`). Each batch is still compressed independently, but 64 KB
116
+ * is enough for zlib to build a good dictionary — the compression ratio
117
+ * is within ~1% of a single-shot compression of the entire input.
118
+ *
119
+ * The trade-off is slightly higher latency (compressed output is not
120
+ * returned byte-for-byte immediately), but this is acceptable because
121
+ * the ZIP writer buffers output anyway and the streaming contract only
122
+ * requires data to flow *eventually*, not after every single write.
101
123
  */
102
124
  export class SyncDeflater {
103
125
  constructor(level = DEFAULT_COMPRESS_LEVEL) {
126
+ this._pending = [];
127
+ this._pendingSize = 0;
104
128
  this._level = level;
105
129
  }
106
130
  write(data) {
107
131
  if (data.length === 0) {
108
132
  return new Uint8Array(0);
109
133
  }
110
- const result = deflateRawSync(Buffer.from(data), {
134
+ this._pending.push(data);
135
+ this._pendingSize += data.length;
136
+ if (this._pendingSize >= SYNC_DEFLATE_BATCH_SIZE) {
137
+ return this._flushBatch(false);
138
+ }
139
+ return new Uint8Array(0);
140
+ }
141
+ finish() {
142
+ return this._flushBatch(true);
143
+ }
144
+ _flushBatch(final) {
145
+ let input;
146
+ if (this._pending.length === 0) {
147
+ input = Buffer.alloc(0);
148
+ }
149
+ else if (this._pending.length === 1) {
150
+ input = Buffer.from(this._pending[0]);
151
+ }
152
+ else {
153
+ input = Buffer.concat(this._pending);
154
+ }
155
+ this._pending.length = 0;
156
+ this._pendingSize = 0;
157
+ if (input.length === 0 && !final) {
158
+ return new Uint8Array(0);
159
+ }
160
+ const result = deflateRawSync(input, {
111
161
  level: this._level,
112
- finishFlush: constants.Z_SYNC_FLUSH
162
+ finishFlush: final ? constants.Z_FINISH : constants.Z_SYNC_FLUSH
113
163
  });
114
164
  // deflateRawSync returns a Buffer sharing a 16 KB slab ArrayBuffer.
115
165
  // Copy to a tight Uint8Array so the slab can be reclaimed.
116
166
  return new Uint8Array(result);
117
167
  }
118
- finish() {
119
- // Emit a final empty DEFLATE block (BFINAL=1, BTYPE=01, EOB).
120
- // This terminates the concatenated DEFLATE stream.
121
- return new Uint8Array(deflateRawSync(Buffer.alloc(0), { level: this._level }));
122
- }
123
168
  }
@@ -447,7 +447,7 @@ export class ZipDeflateFile {
447
447
  // Stateful synchronous compression — maintains LZ77 window and bit position
448
448
  // across chunks so the output is a single valid DEFLATE stream.
449
449
  if (!this._syncDeflater) {
450
- this._syncDeflater = new SyncDeflater();
450
+ this._syncDeflater = new SyncDeflater(this.level);
451
451
  }
452
452
  if (data.length > 0) {
453
453
  const compressed = this._syncDeflater.write(data);