oodle-kraken-ruby 0.9.0

@@ -0,0 +1,4153 @@
1
+ /*
2
+ Copyright (C) 2016, Powzix
3
+ Copyright (C) 2019, rarten
4
+ Copyright (C) 2022, Kerilk
5
+
6
+ This program is free software: you can redistribute it and/or modify
7
+ it under the terms of the GNU General Public License as published by
8
+ the Free Software Foundation, either version 3 of the License, or
9
+ (at your option) any later version.
10
+
11
+ This program is distributed in the hope that it will be useful,
12
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
+ GNU General Public License for more details.
15
+
16
+ You should have received a copy of the GNU General Public License
17
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
18
+ */
19
+
20
+ #include "stdafx.h"
21
+ #include "kraken.h"
22
+
23
+ // Header in front of each 256k block
24
+ typedef struct KrakenHeader {
25
+ // Type of decoder used, 6 means kraken
26
+ int decoder_type;
27
+
28
+ // Whether to restart the decoder
29
+ bool restart_decoder;
30
+
31
+ // Whether this block is uncompressed
32
+ bool uncompressed;
33
+
34
+ // Whether this block uses checksums.
35
+ bool use_checksums;
36
+ } KrakenHeader;
37
+
38
+ // Additional header in front of each 256k block ("quantum").
39
+ typedef struct KrakenQuantumHeader {
40
+ // The compressed size of this quantum. If this value is 0 it means
41
+ // the quantum is a special quantum such as memset.
42
+ uint32 compressed_size;
43
+ // If checksums are enabled, holds the checksum.
44
+ uint32 checksum;
45
+ // Two flags
46
+ uint8 flag1;
47
+ uint8 flag2;
48
+ // Whether the whole block matched a previous block
49
+ uint32 whole_match_distance;
50
+ } KrakenQuantumHeader;
51
+
52
+ // Kraken decompression happens in two phases: the first decodes
53
+ // all the literals and copy lengths using Huffman coding, and the second
54
+ // phase runs the copy loop. This holds the tables needed by stage 2.
55
+ typedef struct KrakenLzTable {
56
+ // Stream of (literal, match) pairs. The flag byte contains
57
+ // the length of the match, the length of the literal and whether
58
+ // to use a recent offset.
59
+ byte *cmd_stream;
60
+ int cmd_stream_size;
61
+
62
+ // Holds the actual distances in case we're not using a recent
63
+ // offset.
64
+ int *offs_stream;
65
+ int offs_stream_size;
66
+
67
+ // Holds the sequence of literals. All literal copying happens from
68
+ // here.
69
+ byte *lit_stream;
70
+ int lit_stream_size;
71
+
72
+ // Holds the lengths that do not fit in the flag stream. Both literal
73
+ // lengths and match length are stored in the same array.
74
+ int *len_stream;
75
+ int len_stream_size;
76
+ } KrakenLzTable;
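/* Taken together, these streams drive a phase-2 loop of roughly this
   shape (illustrative pseudocode only; the shipping loop also tracks
   recent offsets and several escape cases):

     for each flag byte f in cmd_stream:
       decode literal length, match length and a "use recent offset"
       flag from f; escaped long lengths come from len_stream
       copy the literals from lit_stream
       offset = a recent offset, or the next value from offs_stream
       copy the match from dst - offset
*/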
77
+
78
+
79
+ // Mermaid/Selkie decompression also happens in two phases, just like in Kraken,
80
+ // but the match copier works differently.
81
+ // Both Mermaid and Selkie use the same on-disk format, only the compressor
82
+ // differs.
83
+ typedef struct MermaidLzTable {
84
+ // Flag stream. Format of flags:
85
+ // Read flagbyte from |cmd_stream|
86
+ // If flagbyte >= 24:
87
+ // flagbyte & 0x80 == 0 : Read from |off16_stream| into |recent_offs|.
88
+ // != 0 : Don't read offset.
89
+ // flagbyte & 7 = Number of literals to copy first from |lit_stream|.
90
+ // (flagbyte >> 3) & 0xF = Number of bytes to copy from |recent_offs|.
91
+ //
92
+ // If flagbyte == 0 :
93
+ // Read byte L from |length_stream|
94
+ // If L > 251: L += 4 * Read word from |length_stream|
95
+ // L += 64
96
+ // Copy L bytes from |lit_stream|.
97
+ //
98
+ // If flagbyte == 1 :
99
+ // Read byte L from |length_stream|
100
+ // If L > 251: L += 4 * Read word from |length_stream|
101
+ // L += 91
102
+ // Copy L bytes from match pointed by next offset from |off16_stream|
103
+ //
104
+ // If flagbyte == 2 :
105
+ // Read byte L from |length_stream|
106
+ // If L > 251: L += 4 * Read word from |length_stream|
107
+ // L += 29
108
+ // Copy L bytes from match pointed by next offset from |off32_stream|,
109
+ // relative to start of block.
110
+ // Then prefetch |off32_stream[3]|
111
+ //
112
+ // If flagbyte > 2:
113
+ // L = flagbyte + 5
114
+ // Copy L bytes from match pointed by next offset from |off32_stream|,
115
+ // relative to start of block.
116
+ // Then prefetch |off32_stream[3]|
117
+ const byte *cmd_stream, *cmd_stream_end;
118
+
119
+ // Length stream
120
+ const byte *length_stream;
121
+
122
+ // Literal stream
123
+ const byte *lit_stream, *lit_stream_end;
124
+
125
+ // Near offsets
126
+ const uint16 *off16_stream, *off16_stream_end;
127
+
128
+ // Far offsets for current chunk
129
+ uint32 *off32_stream, *off32_stream_end;
130
+
131
+ // Holds the offsets for the two chunks
132
+ uint32 *off32_stream_1, *off32_stream_2;
133
+ uint32 off32_size_1, off32_size_2;
134
+
135
+ // Flag offsets for next 64k chunk.
136
+ uint32 cmd_stream_2_offs, cmd_stream_2_offs_end;
137
+ } MermaidLzTable;
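The comment above fully specifies the command-byte format. The sketch below merely restates that dispatch in C; the copy loops are elided and |recent_offs| is a stand-in for the decoder's last-offset state, so this is illustrative rather than the shipping implementation.

  static void Mermaid_DispatchSketch(MermaidLzTable *mtl, uint32 *recent_offs) {
    int flag = *mtl->cmd_stream++;
    if (flag >= 24) {
      int litlen   = flag & 7;             /* literals copied from lit_stream   */
      int matchlen = (flag >> 3) & 0xF;    /* match bytes copied at recent_offs */
      if (!(flag & 0x80))
        *recent_offs = *mtl->off16_stream++;
      /* ...copy litlen literals, then matchlen match bytes... */
    } else if (flag <= 2) {
      int L = *mtl->length_stream++;
      if (L > 251) {                       /* long form: add 4 * the next word  */
        L += 4 * *(const uint16 *)mtl->length_stream;
        mtl->length_stream += 2;
      }
      L += (flag == 0) ? 64 : (flag == 1) ? 91 : 29;
      /* flag 0: literal run, flag 1: match via off16, flag 2: match via off32 */
    } else {
      int L = flag + 5;                    /* short match via off32_stream      */
      /* copy L bytes, then prefetch off32_stream[3] */
    }
  }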
138
+
139
+
140
+ typedef struct KrakenDecoder {
141
+ // Updated after the |*_DecodeStep| function completes to hold
142
+ // the number of bytes read and written.
143
+ int src_used, dst_used;
144
+
145
+ // Pointer to a 256k buffer that holds the intermediate state
146
+ // in between decode phase 1 and 2.
147
+ byte *scratch;
148
+ size_t scratch_size;
149
+
150
+ KrakenHeader hdr;
151
+ } KrakenDecoder;
152
+
153
+ typedef struct BitReader {
154
+ // |p| holds the current byte and |p_end| the end of the buffer.
155
+ const byte *p, *p_end;
156
+ // Bits accumulated so far
157
+ uint32 bits;
158
+ // Next byte will end up in the |bitpos| position in |bits|.
159
+ int bitpos;
160
+ } BitReader;
161
+
162
+ struct HuffRevLut {
163
+ uint8 bits2len[2048];
164
+ uint8 bits2sym[2048];
165
+ };
166
+
167
+ typedef struct HuffReader {
168
+ // Array to hold the output of the huffman read array operation
169
+ byte *output, *output_end;
170
+ // We decode three parallel streams: two forwards, |src| and |src_mid|,
171
+ // while |src_end| is decoded backwards.
172
+ const byte *src, *src_mid, *src_end, *src_mid_org;
173
+ int src_bitpos, src_mid_bitpos, src_end_bitpos;
174
+ uint32 src_bits, src_mid_bits, src_end_bits;
175
+ } HuffReader;
176
+
177
+ inline size_t Max(size_t a, size_t b) { return a > b ? a : b; }
178
+ inline size_t Min(size_t a, size_t b) { return a < b ? a : b; }
179
+
180
+ #define ALIGN_POINTER(p, align) ((uint8*)(((uintptr_t)(p) + (align - 1)) & ~(align - 1)))
181
+
182
+ struct HuffRange;
183
+
184
+ int Kraken_DecodeBytes(byte **output, const byte *src, const byte *src_end, int *decoded_size, size_t output_size, bool force_memmove, uint8 *scratch, uint8 *scratch_end);
185
+ int Kraken_GetBlockSize(const uint8 *src, const uint8 *src_end, int *dest_size, int dest_capacity);
186
+ int Huff_ConvertToRanges(HuffRange *range, int num_symbols, int P, const uint8 *symlen, BitReader *bits);
187
+
188
+ // Allocate memory with a specific alignment
189
+ void *MallocAligned(size_t size, size_t alignment) {
190
+ void *x = malloc(size + (alignment - 1) + sizeof(void*)), *x_org = x;
191
+ if (x) {
192
+ x = (void*)(((intptr_t)x + alignment - 1 + sizeof(void*)) & ~(alignment - 1));
193
+ ((void**)x)[-1] = x_org;
194
+ }
195
+ return x;
196
+ }
197
+
198
+ // Free memory allocated through |MallocAligned|
199
+ void FreeAligned(void *p) {
200
+ free(((void**)p)[-1]);
201
+ }
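/* Example: MallocAligned(100, 16) allocates 100 + 15 + sizeof(void*)
   bytes, bumps the raw pointer past a sizeof(void*) back-pointer slot,
   rounds up to the next 16-byte boundary, and stores the raw pointer
   at aligned[-1] so FreeAligned() can recover it.                     */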
202
+
203
+ uint32 BSR(uint32 x) {
204
+ unsigned long index;
205
+ _BitScanReverse(&index, x);
206
+ return index;
207
+ }
208
+
209
+ uint32 BSF(uint32 x) {
210
+ unsigned long index;
211
+ _BitScanForward(&index, x);
212
+ return index;
213
+ }
214
+
215
+ // Read more bytes to make sure we always have at least 24 bits in |bits|.
216
+ void BitReader_Refill(BitReader *bits) {
217
+ assert(bits->bitpos <= 24);
218
+ while (bits->bitpos > 0) {
219
+ bits->bits |= (bits->p < bits->p_end ? *bits->p : 0) << bits->bitpos;
220
+ bits->bitpos -= 8;
221
+ bits->p++;
222
+ }
223
+ }
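For reference, a minimal setup sketch matching how readers are initialized later in this file; |src|/|src_end| are whatever buffer the caller is decoding.

  static void BitReader_InitExample(BitReader *br, const byte *src, const byte *src_end) {
    br->p      = src;
    br->p_end  = src_end;
    br->bits   = 0;
    br->bitpos = 24;        /* 24 means the accumulator is empty          */
    BitReader_Refill(br);   /* afterwards bitpos <= 0 and the top 24+ bits
                               of |bits| are valid                        */
  }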
224
+
225
+ // Read more bytes to make sure we always have at least 24 bits in |bits|,
226
+ // used when reading backwards.
227
+ void BitReader_RefillBackwards(BitReader *bits) {
228
+ assert(bits->bitpos <= 24);
229
+ while (bits->bitpos > 0) {
230
+ bits->p--;
231
+ bits->bits |= (bits->p >= bits->p_end ? *bits->p : 0) << bits->bitpos;
232
+ bits->bitpos -= 8;
233
+ }
234
+ }
235
+
236
+ // Refill bits then read a single bit.
237
+ int BitReader_ReadBit(BitReader *bits) {
238
+ int r;
239
+ BitReader_Refill(bits);
240
+ r = bits->bits >> 31;
241
+ bits->bits <<= 1;
242
+ bits->bitpos += 1;
243
+ return r;
244
+ }
245
+
246
+ int BitReader_ReadBitNoRefill(BitReader *bits) {
247
+ int r;
248
+ r = bits->bits >> 31;
249
+ bits->bits <<= 1;
250
+ bits->bitpos += 1;
251
+ return r;
252
+ }
253
+
254
+
255
+ // Read |n| bits without refilling.
256
+ int BitReader_ReadBitsNoRefill(BitReader *bits, int n) {
257
+ int r = (bits->bits >> (32 - n));
258
+ bits->bits <<= n;
259
+ bits->bitpos += n;
260
+ return r;
261
+ }
262
+
263
+ // Read |n| bits without refilling, n may be zero.
264
+ int BitReader_ReadBitsNoRefillZero(BitReader *bits, int n) {
265
+ int r = (bits->bits >> 1 >> (31 - n));
266
+ bits->bits <<= n;
267
+ bits->bitpos += n;
268
+ return r;
269
+ }
270
+
271
+ uint32 BitReader_ReadMoreThan24Bits(BitReader *bits, int n) {
272
+ uint32 rv;
273
+ if (n <= 24) {
274
+ rv = BitReader_ReadBitsNoRefillZero(bits, n);
275
+ } else {
276
+ rv = BitReader_ReadBitsNoRefill(bits, 24) << (n - 24);
277
+ BitReader_Refill(bits);
278
+ rv += BitReader_ReadBitsNoRefill(bits, n - 24);
279
+ }
280
+ BitReader_Refill(bits);
281
+ return rv;
282
+ }
283
+
284
+ uint32 BitReader_ReadMoreThan24BitsB(BitReader *bits, int n) {
285
+ uint32 rv;
286
+ if (n <= 24) {
287
+ rv = BitReader_ReadBitsNoRefillZero(bits, n);
288
+ } else {
289
+ rv = BitReader_ReadBitsNoRefill(bits, 24) << (n - 24);
290
+ BitReader_RefillBackwards(bits);
291
+ rv += BitReader_ReadBitsNoRefill(bits, n - 24);
292
+ }
293
+ BitReader_RefillBackwards(bits);
294
+ return rv;
295
+ }
296
+
297
+ // Reads a gamma value.
298
+ // Assumes bitreader is already filled with at least 23 bits
299
+ int BitReader_ReadGamma(BitReader *bits) {
300
+ unsigned long bitresult;
301
+ int n;
302
+ int r;
303
+ if (bits->bits != 0) {
304
+ _BitScanReverse(&bitresult, bits->bits);
305
+ n = 31 - bitresult;
306
+ } else {
307
+ n = 32;
308
+ }
309
+ n = 2 * n + 2;
310
+ assert(n < 24);
311
+ bits->bitpos += n;
312
+ r = bits->bits >> (32 - n);
313
+ bits->bits <<= n;
314
+ return r - 2;
315
+ }
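/* Worked example of the gamma code above (derived from the code):
     1 b       -> 2 bits read, values 0..1
     01 bb     -> 4 bits read, values 2..5
     001 bbb   -> 6 bits read, values 6..13
   i.e. lz leading zeros select a (2*lz + 2)-bit field whose value,
   minus 2, is the result.                                           */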
316
+
317
+ int CountLeadingZeros(uint32 bits) {
318
+ unsigned long x;
319
+ _BitScanReverse(&x, bits);
320
+ return 31 - x;
321
+ }
322
+
323
+ // Reads a gamma value with |forced| extra low bits appended.
324
+ int BitReader_ReadGammaX(BitReader *bits, int forced) {
325
+ unsigned long bitresult;
326
+ int r;
327
+ if (bits->bits != 0) {
328
+ _BitScanReverse(&bitresult, bits->bits);
329
+ int lz = 31 - bitresult;
330
+ assert(lz < 24);
331
+ r = (bits->bits >> (31 - lz - forced)) + ((lz - 1) << forced);
332
+ bits->bits <<= lz + forced + 1;
333
+ bits->bitpos += lz + forced + 1;
334
+ return r;
335
+ }
336
+ return 0;
337
+ }
338
+
339
+ // Reads an offset code parameterized by |v|.
340
+ uint32 BitReader_ReadDistance(BitReader *bits, uint32 v) {
341
+ uint32 w, m, n, rv;
342
+ if (v < 0xF0) {
343
+ n = (v >> 4) + 4;
344
+ w = _rotl(bits->bits | 1, n);
345
+ bits->bitpos += n;
346
+ m = (2 << n) - 1;
347
+ bits->bits = w & ~m;
348
+ rv = ((w & m) << 4) + (v & 0xF) - 248;
349
+ } else {
350
+ n = v - 0xF0 + 4;
351
+ w = _rotl(bits->bits | 1, n);
352
+ bits->bitpos += n;
353
+ m = (2 << n) - 1;
354
+ bits->bits = w & ~m;
355
+ rv = 8322816 + ((w & m) << 12);
356
+ BitReader_Refill(bits);
357
+ rv += (bits->bits >> 20);
358
+ bits->bitpos += 12;
359
+ bits->bits <<= 12;
360
+ }
361
+ BitReader_Refill(bits);
362
+ return rv;
363
+ }
364
+
365
+
366
+ // Reads an offset code parameterized by |v|, backwards.
367
+ uint32 BitReader_ReadDistanceB(BitReader *bits, uint32 v) {
368
+ uint32 w, m, n, rv;
369
+ if (v < 0xF0) {
370
+ n = (v >> 4) + 4;
371
+ w = _rotl(bits->bits | 1, n);
372
+ bits->bitpos += n;
373
+ m = (2 << n) - 1;
374
+ bits->bits = w & ~m;
375
+ rv = ((w & m) << 4) + (v & 0xF) - 248;
376
+ } else {
377
+ n = v - 0xF0 + 4;
378
+ w = _rotl(bits->bits | 1, n);
379
+ bits->bitpos += n;
380
+ m = (2 << n) - 1;
381
+ bits->bits = w & ~m;
382
+ rv = 8322816 + ((w & m) << 12);
383
+ BitReader_RefillBackwards(bits);
384
+ rv += (bits->bits >> (32 - 12));
385
+ bits->bitpos += 12;
386
+ bits->bits <<= 12;
387
+ }
388
+ BitReader_RefillBackwards(bits);
389
+ return rv;
390
+ }
391
+
392
+ // Reads a length code.
393
+ bool BitReader_ReadLength(BitReader *bits, uint32 *v) {
394
+ unsigned long bitresult;
395
+ int n;
396
+ uint32 rv;
397
+ _BitScanReverse(&bitresult, bits->bits);
398
+ n = 31 - bitresult;
399
+ if (n > 12) return false;
400
+ bits->bitpos += n;
401
+ bits->bits <<= n;
402
+ BitReader_Refill(bits);
403
+ n += 7;
404
+ bits->bitpos += n;
405
+ rv = (bits->bits >> (32 - n)) - 64;
406
+ bits->bits <<= n;
407
+ *v = rv;
408
+ BitReader_Refill(bits);
409
+ return true;
410
+ }
411
+
412
+ // Reads a length code, backwards.
413
+ bool BitReader_ReadLengthB(BitReader *bits, uint32 *v) {
414
+ unsigned long bitresult;
415
+ int n;
416
+ uint32 rv;
417
+ _BitScanReverse(&bitresult, bits->bits);
418
+ n = 31 - bitresult;
419
+ if (n > 12) return false;
420
+ bits->bitpos += n;
421
+ bits->bits <<= n;
422
+ BitReader_RefillBackwards(bits);
423
+ n += 7;
424
+ bits->bitpos += n;
425
+ rv = (bits->bits >> (32 - n)) - 64;
426
+ bits->bits <<= n;
427
+ *v = rv;
428
+ BitReader_RefillBackwards(bits);
429
+ return true;
430
+ }
431
+
432
+ int Log2RoundUp(uint32 v) {
433
+ if (v > 1) {
434
+ unsigned long idx;
435
+ _BitScanReverse(&idx, v - 1);
436
+ return idx + 1;
437
+ } else {
438
+ return 0;
439
+ }
440
+ }
441
+
442
+ #define ALIGN_16(x) (((x)+15)&~15)
443
+ #define COPY_64(d, s) {*(uint64*)(d) = *(uint64*)(s); }
444
+ #define COPY_64_BYTES(d, s) { \
445
+ _mm_storeu_si128((__m128i*)d + 0, _mm_loadu_si128((__m128i*)s + 0)); \
446
+ _mm_storeu_si128((__m128i*)d + 1, _mm_loadu_si128((__m128i*)s + 1)); \
447
+ _mm_storeu_si128((__m128i*)d + 2, _mm_loadu_si128((__m128i*)s + 2)); \
448
+ _mm_storeu_si128((__m128i*)d + 3, _mm_loadu_si128((__m128i*)s + 3)); \
449
+ }
450
+
451
+ #define COPY_64_ADD(d, s, t) _mm_storel_epi64((__m128i *)(d), _mm_add_epi8(_mm_loadl_epi64((__m128i *)(s)), _mm_loadl_epi64((__m128i *)(t))))
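/* COPY_64_ADD stores d[i] = s[i] + t[i] bytewise for 8 lanes at once;
   presumably this is what the delta-coded literal modes later in the
   file use to apply literals on top of match bytes.                  */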
452
+
453
+ KrakenDecoder *Kraken_Create() {
454
+ size_t scratch_size = 0x6C000;
455
+ size_t memory_needed = sizeof(KrakenDecoder) + scratch_size;
456
+ KrakenDecoder *dec = (KrakenDecoder*)MallocAligned(memory_needed, 16);
457
+ if (!dec)
+ return NULL;
+ memset(dec, 0, sizeof(KrakenDecoder));
458
+ dec->scratch_size = scratch_size;
459
+ dec->scratch = (byte*)(dec + 1);
460
+ return dec;
461
+ }
462
+
463
+ void Kraken_Destroy(KrakenDecoder *kraken) {
464
+ FreeAligned(kraken);
465
+ }
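A minimal lifetime sketch, assuming nothing beyond the two functions above: the decoder and its 0x6C000-byte scratch area live in a single aligned allocation and are freed together.

  static void Kraken_LifetimeExample(void) {
    KrakenDecoder *dec = Kraken_Create();
    if (!dec)
      return;
    /* ...per-block decode steps use dec->scratch as working memory... */
    Kraken_Destroy(dec);
  }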
466
+
467
+ const byte *Kraken_ParseHeader(KrakenHeader *hdr, const byte *p) {
468
+ int b = p[0];
469
+ if ((b & 0xF) == 0xC) {
470
+ if (((b >> 4) & 3) != 0) return NULL;
471
+ hdr->restart_decoder = (b >> 7) & 1;
472
+ hdr->uncompressed = (b >> 6) & 1;
473
+ b = p[1];
474
+ hdr->decoder_type = b & 0x7F;
475
+ hdr->use_checksums = !!(b >> 7);
476
+ if (hdr->decoder_type != 6 && hdr->decoder_type != 10 && hdr->decoder_type != 5 && hdr->decoder_type != 11 && hdr->decoder_type != 12)
477
+ return NULL;
478
+ return p + 2;
479
+ }
480
+
481
+ return NULL;
482
+ }
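/* Worked example of the 2-byte block header parsed above:
     p[0] = 0x8C -> low nibble 0xC (magic), bits 4-5 zero,
                    bit 6 = 0 (compressed), bit 7 = 1 (restart decoder)
     p[1] = 0x06 -> decoder_type 6 (Kraken), bit 7 = 0 (no checksums)  */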
483
+
484
+ const byte *Kraken_ParseQuantumHeader(KrakenQuantumHeader *hdr, const byte *p, bool use_checksum) {
485
+ uint32 v = (p[0] << 16) | (p[1] << 8) | p[2];
486
+ uint32 size = v & 0x3FFFF;
487
+ if (size != 0x3ffff) {
488
+ hdr->compressed_size = size + 1;
489
+ hdr->flag1 = (v >> 18) & 1;
490
+ hdr->flag2 = (v >> 19) & 1;
491
+ if (use_checksum) {
492
+ hdr->checksum = (p[3] << 16) | (p[4] << 8) | p[5];
493
+ return p + 6;
494
+ } else {
495
+ return p + 3;
496
+ }
497
+ }
498
+ v >>= 18;
499
+ if (v == 1) {
500
+ // memset
501
+ hdr->checksum = p[3];
502
+ hdr->compressed_size = 0;
503
+ hdr->whole_match_distance = 0;
504
+ return p + 4;
505
+ }
506
+ return NULL;
507
+
508
+ }
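/* Worked example of the 3-byte quantum header above:
     p[] = { 0x04, 0x00, 0x0F } -> v = 0x04000F
     size  = v & 0x3FFFF = 0x0F -> compressed_size = 16
     flag1 = bit 18 = 1, flag2 = bit 19 = 0
   A size field of 0x3FFFF instead selects the special quanta (memset). */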
509
+
510
+ const byte *LZNA_ParseWholeMatchInfo(const byte *p, uint32 *dist) {
511
+ uint32 v = _byteswap_ushort(*(uint16*)p);
512
+
513
+ if (v < 0x8000) {
514
+ uint32 x = 0, b, pos = 0;
515
+ for (;;) {
516
+ b = p[2];
517
+ p += 1;
518
+ if (b & 0x80)
519
+ break;
520
+ x += (b + 0x80) << pos;
521
+ pos += 7;
522
+
523
+ }
524
+ x += (b - 128) << pos;
525
+ *dist = 0x8000 + v + (x << 15) + 1;
526
+ return p + 2;
527
+ } else {
528
+ *dist = v - 0x8000 + 1;
529
+ return p + 2;
530
+ }
531
+ }
532
+
533
+ const byte *LZNA_ParseQuantumHeader(KrakenQuantumHeader *hdr, const byte *p, bool use_checksum, int raw_len) {
534
+ uint32 v = (p[0] << 8) | p[1];
535
+ uint32 size = v & 0x3FFF;
536
+ if (size != 0x3fff) {
537
+ hdr->compressed_size = size + 1;
538
+ hdr->flag1 = (v >> 14) & 1;
539
+ hdr->flag2 = (v >> 15) & 1;
540
+ if (use_checksum) {
541
+ hdr->checksum = (p[2] << 16) | (p[3] << 8) | p[4];
542
+ return p + 5;
543
+ } else {
544
+ return p + 2;
545
+ }
546
+ }
547
+ v >>= 14;
548
+ if (v == 0) {
549
+ p = LZNA_ParseWholeMatchInfo(p + 2, &hdr->whole_match_distance);
550
+ hdr->compressed_size = 0;
551
+ return p;
552
+ }
553
+ if (v == 1) {
554
+ // memset
555
+ hdr->checksum = p[2];
556
+ hdr->compressed_size = 0;
557
+ hdr->whole_match_distance = 0;
558
+ return p + 3;
559
+ }
560
+ if (v == 2) {
561
+ // uncompressed
562
+ hdr->compressed_size = raw_len;
563
+ return p + 2;
564
+ }
565
+ return NULL;
566
+ }
567
+
568
+
569
+ uint32 Kraken_GetCrc(const byte *p, size_t p_size) {
570
+ // TODO: implement
571
+ return 0;
572
+ }
573
+
574
+ // Rearranges elements in the input array so that the bits of the index
575
+ // are reversed (a bit-reversal permutation).
576
+ static void ReverseBitsArray2048(const byte *input, byte *output) {
577
+ static const uint8 offsets[32] = {
578
+ 0, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
579
+ 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8
580
+ };
581
+ __m128i t0, t1, t2, t3, s0, s1, s2, s3;
582
+ int i, j;
583
+ for(i = 0; i != 32; i++) {
584
+ j = offsets[i];
585
+ t0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&input[j]),
586
+ _mm_loadl_epi64((const __m128i *)&input[j + 256]));
587
+ t1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&input[j + 512]),
588
+ _mm_loadl_epi64((const __m128i *)&input[j + 768]));
589
+ t2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&input[j + 1024]),
590
+ _mm_loadl_epi64((const __m128i *)&input[j + 1280]));
591
+ t3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&input[j + 1536]),
592
+ _mm_loadl_epi64((const __m128i *)&input[j + 1792]));
593
+
594
+ s0 = _mm_unpacklo_epi8(t0, t1);
595
+ s1 = _mm_unpacklo_epi8(t2, t3);
596
+ s2 = _mm_unpackhi_epi8(t0, t1);
597
+ s3 = _mm_unpackhi_epi8(t2, t3);
598
+
599
+ t0 = _mm_unpacklo_epi8(s0, s1);
600
+ t1 = _mm_unpacklo_epi8(s2, s3);
601
+ t2 = _mm_unpackhi_epi8(s0, s1);
602
+ t3 = _mm_unpackhi_epi8(s2, s3);
603
+
604
+ _mm_storel_epi64((__m128i *)&output[0], t0);
605
+ _mm_storeh_pi((__m64*)&output[1024], _mm_castsi128_ps(t0));
606
+ _mm_storel_epi64((__m128i *)&output[256], t1);
607
+ _mm_storeh_pi((__m64*)&output[1280], _mm_castsi128_ps(t1));
608
+ _mm_storel_epi64((__m128i *)&output[512], t2);
609
+ _mm_storeh_pi((__m64*)&output[1536], _mm_castsi128_ps(t2));
610
+ _mm_storel_epi64((__m128i *)&output[768], t3);
611
+ _mm_storeh_pi((__m64*)&output[1792], _mm_castsi128_ps(t3));
612
+ output += 8;
613
+ }
614
+ }
615
+
616
+ bool Kraken_DecodeBytesCore(HuffReader *hr, HuffRevLut *lut) {
617
+ const byte *src = hr->src;
618
+ uint32 src_bits = hr->src_bits;
619
+ int src_bitpos = hr->src_bitpos;
620
+
621
+ const byte *src_mid = hr->src_mid;
622
+ uint32 src_mid_bits = hr->src_mid_bits;
623
+ int src_mid_bitpos = hr->src_mid_bitpos;
624
+
625
+ const byte *src_end = hr->src_end;
626
+ uint32 src_end_bits = hr->src_end_bits;
627
+ int src_end_bitpos = hr->src_end_bitpos;
628
+
629
+ int k, n;
630
+
631
+ byte *dst = hr->output;
632
+ byte *dst_end = hr->output_end;
633
+
634
+ if (src > src_mid)
635
+ return false;
636
+
637
+ if (hr->src_end - src_mid >= 4 && dst_end - dst >= 6) {
638
+ dst_end -= 5;
639
+ src_end -= 4;
640
+
641
+ while (dst < dst_end && src <= src_mid && src_mid <= src_end) {
642
+ src_bits |= *(uint32*)src << src_bitpos;
643
+ src += (31 - src_bitpos) >> 3;
644
+
645
+ src_end_bits |= _byteswap_ulong(*(uint32*)src_end) << src_end_bitpos;
646
+ src_end -= (31 - src_end_bitpos) >> 3;
647
+
648
+ src_mid_bits |= *(uint32*)src_mid << src_mid_bitpos;
649
+ src_mid += (31 - src_mid_bitpos) >> 3;
650
+
651
+ src_bitpos |= 0x18;
652
+ src_end_bitpos |= 0x18;
653
+ src_mid_bitpos |= 0x18;
654
+
655
+ k = src_bits & 0x7FF;
656
+ n = lut->bits2len[k];
657
+ src_bits >>= n;
658
+ src_bitpos -= n;
659
+ dst[0] = lut->bits2sym[k];
660
+
661
+ k = src_end_bits & 0x7FF;
662
+ n = lut->bits2len[k];
663
+ src_end_bits >>= n;
664
+ src_end_bitpos -= n;
665
+ dst[1] = lut->bits2sym[k];
666
+
667
+ k = src_mid_bits & 0x7FF;
668
+ n = lut->bits2len[k];
669
+ src_mid_bits >>= n;
670
+ src_mid_bitpos -= n;
671
+ dst[2] = lut->bits2sym[k];
672
+
673
+ k = src_bits & 0x7FF;
674
+ n = lut->bits2len[k];
675
+ src_bits >>= n;
676
+ src_bitpos -= n;
677
+ dst[3] = lut->bits2sym[k];
678
+
679
+ k = src_end_bits & 0x7FF;
680
+ n = lut->bits2len[k];
681
+ src_end_bits >>= n;
682
+ src_end_bitpos -= n;
683
+ dst[4] = lut->bits2sym[k];
684
+
685
+ k = src_mid_bits & 0x7FF;
686
+ n = lut->bits2len[k];
687
+ src_mid_bits >>= n;
688
+ src_mid_bitpos -= n;
689
+ dst[5] = lut->bits2sym[k];
690
+ dst += 6;
691
+ }
692
+ dst_end += 5;
693
+
694
+ src -= src_bitpos >> 3;
695
+ src_bitpos &= 7;
696
+
697
+ src_end += 4 + (src_end_bitpos >> 3);
698
+ src_end_bitpos &= 7;
699
+
700
+ src_mid -= src_mid_bitpos >> 3;
701
+ src_mid_bitpos &= 7;
702
+ }
703
+ for(;;) {
704
+ if (dst >= dst_end)
705
+ break;
706
+
707
+ if (src_mid - src <= 1) {
708
+ if (src_mid - src == 1)
709
+ src_bits |= *src << src_bitpos;
710
+ } else {
711
+ src_bits |= *(uint16 *)src << src_bitpos;
712
+ }
713
+ k = src_bits & 0x7FF;
714
+ n = lut->bits2len[k];
715
+ src_bitpos -= n;
716
+ src_bits >>= n;
717
+ *dst++ = lut->bits2sym[k];
718
+ src += (7 - src_bitpos) >> 3;
719
+ src_bitpos &= 7;
720
+
721
+ if (dst < dst_end) {
722
+ if (src_end - src_mid <= 1) {
723
+ if (src_end - src_mid == 1) {
724
+ src_end_bits |= *src_mid << src_end_bitpos;
725
+ src_mid_bits |= *src_mid << src_mid_bitpos;
726
+ }
727
+ } else {
728
+ unsigned int v = *(uint16*)(src_end - 2);
729
+ src_end_bits |= (((v >> 8) | (v << 8)) & 0xffff) << src_end_bitpos;
730
+ src_mid_bits |= *(uint16*)src_mid << src_mid_bitpos;
731
+ }
732
+ n = lut->bits2len[src_end_bits & 0x7FF];
733
+ *dst++ = lut->bits2sym[src_end_bits & 0x7FF];
734
+ src_end_bitpos -= n;
735
+ src_end_bits >>= n;
736
+ src_end -= (7 - src_end_bitpos) >> 3;
737
+ src_end_bitpos &= 7;
738
+ if (dst < dst_end) {
739
+ n = lut->bits2len[src_mid_bits & 0x7FF];
740
+ *dst++ = lut->bits2sym[src_mid_bits & 0x7FF];
741
+ src_mid_bitpos -= n;
742
+ src_mid_bits >>= n;
743
+ src_mid += (7 - src_mid_bitpos) >> 3;
744
+ src_mid_bitpos &= 7;
745
+ }
746
+ }
747
+ if (src > src_mid || src_mid > src_end)
748
+ return false;
749
+ }
750
+ if (src != hr->src_mid_org || src_end != src_mid)
751
+ return false;
752
+ return true;
753
+ }
754
+
755
+ int Huff_ReadCodeLengthsOld(BitReader *bits, uint8 *syms, uint32 *code_prefix) {
756
+ if (BitReader_ReadBitNoRefill(bits)) {
757
+ int n, sym = 0, codelen, num_symbols = 0;
758
+ int avg_bits_x4 = 32;
759
+ int forced_bits = BitReader_ReadBitsNoRefill(bits, 2);
760
+
761
+ uint32 thres_for_valid_gamma_bits = 1 << (31 - (20u >> forced_bits));
762
+ if (BitReader_ReadBit(bits))
763
+ goto SKIP_INITIAL_ZEROS;
764
+ do {
765
+ // Run of zeros
766
+ if (!(bits->bits & 0xff000000))
767
+ return -1;
768
+ sym += BitReader_ReadBitsNoRefill(bits, 2 * (CountLeadingZeros(bits->bits) + 1)) - 2 + 1;
769
+ if (sym >= 256)
770
+ break;
771
+ SKIP_INITIAL_ZEROS:
772
+ BitReader_Refill(bits);
773
+ // Read out the gamma value for the # of symbols
774
+ if (!(bits->bits & 0xff000000))
775
+ return -1;
776
+ n = BitReader_ReadBitsNoRefill(bits, 2 * (CountLeadingZeros(bits->bits) + 1)) - 2 + 1;
777
+ // Overflow?
778
+ if (sym + n > 256)
779
+ return -1;
780
+ BitReader_Refill(bits);
781
+ num_symbols += n;
782
+ do {
783
+ if (bits->bits < thres_for_valid_gamma_bits)
784
+ return -1; // too big gamma value?
785
+
786
+ int lz = CountLeadingZeros(bits->bits);
787
+ int v = BitReader_ReadBitsNoRefill(bits, lz + forced_bits + 1) + ((lz - 1) << forced_bits);
788
+ codelen = (-(int)(v & 1) ^ (v >> 1)) + ((avg_bits_x4 + 2) >> 2);
789
+ if (codelen < 1 || codelen > 11)
790
+ return -1;
791
+ avg_bits_x4 = codelen + ((3 * avg_bits_x4 + 2) >> 2);
792
+ BitReader_Refill(bits);
793
+ syms[code_prefix[codelen]++] = sym++;
794
+ } while (--n);
795
+ } while (sym != 256);
796
+ return (sym == 256) && (num_symbols >= 2) ? num_symbols : -1;
797
+ } else {
798
+ // Sparse symbol encoding
799
+ int num_symbols = BitReader_ReadBitsNoRefill(bits, 8);
800
+ if (num_symbols == 0)
801
+ return -1;
802
+ if (num_symbols == 1) {
803
+ syms[0] = BitReader_ReadBitsNoRefill(bits, 8);
804
+ } else {
805
+ int codelen_bits = BitReader_ReadBitsNoRefill(bits, 3);
806
+ if (codelen_bits > 4)
807
+ return -1;
808
+ for (int i = 0; i < num_symbols; i++) {
809
+ BitReader_Refill(bits);
810
+ int sym = BitReader_ReadBitsNoRefill(bits, 8);
811
+ int codelen = BitReader_ReadBitsNoRefillZero(bits, codelen_bits) + 1;
812
+ if (codelen > 11)
813
+ return -1;
814
+ syms[code_prefix[codelen]++] = sym;
815
+ }
816
+ }
817
+ return num_symbols;
818
+ }
819
+ }
820
+
821
+ int BitReader_ReadFluff(BitReader *bits, int num_symbols) {
822
+ unsigned long y;
823
+
824
+ if (num_symbols == 256)
825
+ return 0;
826
+
827
+ int x = 257 - num_symbols;
828
+ if (x > num_symbols)
829
+ x = num_symbols;
830
+
831
+ x *= 2;
832
+
833
+ _BitScanReverse(&y, x - 1);
834
+ y += 1;
835
+
836
+ uint32 v = bits->bits >> (32 - y);
837
+ uint32 z = (1 << y) - x;
838
+
839
+ if ((v >> 1) >= z) {
840
+ bits->bits <<= y;
841
+ bits->bitpos += y;
842
+ return v - z;
843
+ } else {
844
+ bits->bits <<= (y - 1);
845
+ bits->bitpos += (y - 1);
846
+ return (v >> 1);
847
+ }
848
+ }
849
+
850
+ struct BitReader2 {
851
+ const uint8 *p, *p_end;
852
+ uint32 bitpos;
853
+ };
854
+
855
+ static const uint32 kRiceCodeBits2Value[256] = {
856
+ 0x80000000, 0x00000007, 0x10000006, 0x00000006, 0x20000005, 0x00000105, 0x10000005, 0x00000005,
857
+ 0x30000004, 0x00000204, 0x10000104, 0x00000104, 0x20000004, 0x00010004, 0x10000004, 0x00000004,
858
+ 0x40000003, 0x00000303, 0x10000203, 0x00000203, 0x20000103, 0x00010103, 0x10000103, 0x00000103,
859
+ 0x30000003, 0x00020003, 0x10010003, 0x00010003, 0x20000003, 0x01000003, 0x10000003, 0x00000003,
860
+ 0x50000002, 0x00000402, 0x10000302, 0x00000302, 0x20000202, 0x00010202, 0x10000202, 0x00000202,
861
+ 0x30000102, 0x00020102, 0x10010102, 0x00010102, 0x20000102, 0x01000102, 0x10000102, 0x00000102,
862
+ 0x40000002, 0x00030002, 0x10020002, 0x00020002, 0x20010002, 0x01010002, 0x10010002, 0x00010002,
863
+ 0x30000002, 0x02000002, 0x11000002, 0x01000002, 0x20000002, 0x00000012, 0x10000002, 0x00000002,
864
+ 0x60000001, 0x00000501, 0x10000401, 0x00000401, 0x20000301, 0x00010301, 0x10000301, 0x00000301,
865
+ 0x30000201, 0x00020201, 0x10010201, 0x00010201, 0x20000201, 0x01000201, 0x10000201, 0x00000201,
866
+ 0x40000101, 0x00030101, 0x10020101, 0x00020101, 0x20010101, 0x01010101, 0x10010101, 0x00010101,
867
+ 0x30000101, 0x02000101, 0x11000101, 0x01000101, 0x20000101, 0x00000111, 0x10000101, 0x00000101,
868
+ 0x50000001, 0x00040001, 0x10030001, 0x00030001, 0x20020001, 0x01020001, 0x10020001, 0x00020001,
869
+ 0x30010001, 0x02010001, 0x11010001, 0x01010001, 0x20010001, 0x00010011, 0x10010001, 0x00010001,
870
+ 0x40000001, 0x03000001, 0x12000001, 0x02000001, 0x21000001, 0x01000011, 0x11000001, 0x01000001,
871
+ 0x30000001, 0x00000021, 0x10000011, 0x00000011, 0x20000001, 0x00001001, 0x10000001, 0x00000001,
872
+ 0x70000000, 0x00000600, 0x10000500, 0x00000500, 0x20000400, 0x00010400, 0x10000400, 0x00000400,
873
+ 0x30000300, 0x00020300, 0x10010300, 0x00010300, 0x20000300, 0x01000300, 0x10000300, 0x00000300,
874
+ 0x40000200, 0x00030200, 0x10020200, 0x00020200, 0x20010200, 0x01010200, 0x10010200, 0x00010200,
875
+ 0x30000200, 0x02000200, 0x11000200, 0x01000200, 0x20000200, 0x00000210, 0x10000200, 0x00000200,
876
+ 0x50000100, 0x00040100, 0x10030100, 0x00030100, 0x20020100, 0x01020100, 0x10020100, 0x00020100,
877
+ 0x30010100, 0x02010100, 0x11010100, 0x01010100, 0x20010100, 0x00010110, 0x10010100, 0x00010100,
878
+ 0x40000100, 0x03000100, 0x12000100, 0x02000100, 0x21000100, 0x01000110, 0x11000100, 0x01000100,
879
+ 0x30000100, 0x00000120, 0x10000110, 0x00000110, 0x20000100, 0x00001100, 0x10000100, 0x00000100,
880
+ 0x60000000, 0x00050000, 0x10040000, 0x00040000, 0x20030000, 0x01030000, 0x10030000, 0x00030000,
881
+ 0x30020000, 0x02020000, 0x11020000, 0x01020000, 0x20020000, 0x00020010, 0x10020000, 0x00020000,
882
+ 0x40010000, 0x03010000, 0x12010000, 0x02010000, 0x21010000, 0x01010010, 0x11010000, 0x01010000,
883
+ 0x30010000, 0x00010020, 0x10010010, 0x00010010, 0x20010000, 0x00011000, 0x10010000, 0x00010000,
884
+ 0x50000000, 0x04000000, 0x13000000, 0x03000000, 0x22000000, 0x02000010, 0x12000000, 0x02000000,
885
+ 0x31000000, 0x01000020, 0x11000010, 0x01000010, 0x21000000, 0x01001000, 0x11000000, 0x01000000,
886
+ 0x40000000, 0x00000030, 0x10000020, 0x00000020, 0x20000010, 0x00001010, 0x10000010, 0x00000010,
887
+ 0x30000000, 0x00002000, 0x10001000, 0x00001000, 0x20000000, 0x00100000, 0x10000000, 0x00000000,
888
+ };
889
+
890
+ static const uint8 kRiceCodeBits2Len[256] = {
891
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
892
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
893
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
894
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
895
+ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
896
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
897
+ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
898
+ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
899
+ };
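/* How the two tables above drive the unary (Golomb-Rice) decode below:
   for a non-zero byte v, consumed MSB-first, kRiceCodeBits2Len[v] is the
   number of completed codes in that byte, kRiceCodeBits2Value[v] packs
   their run lengths into nibbles, and its top nibble is the count of
   trailing zero bits carried into the next byte.
   Example: v = 0x48 = 01001000b decodes to run lengths 1 and 2 with a
   carry of 3; accordingly Len[0x48] = 2 and Value[0x48] = 0x30000201. */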
900
+
901
+
902
+ bool DecodeGolombRiceLengths(uint8 *dst, size_t size, BitReader2 *br) {
903
+ const uint8 *p = br->p, *p_end = br->p_end;
904
+ uint8 *dst_end = dst + size;
905
+ if (p >= p_end)
906
+ return false;
907
+
908
+ int count = -(int)br->bitpos;
909
+ uint32 v = *p++ & (255 >> br->bitpos);
910
+ for (;;) {
911
+ if (v == 0) {
912
+ count += 8;
913
+ } else {
914
+ uint32 x = kRiceCodeBits2Value[v];
915
+ *(uint32*)&dst[0] = count + (x & 0x0f0f0f0f);
916
+ *(uint32*)&dst[4] = (x >> 4) & 0x0f0f0f0f;
917
+ dst += kRiceCodeBits2Len[v];
918
+ if (dst >= dst_end)
919
+ break;
920
+ count = x >> 28;
921
+ }
922
+ if (p >= p_end)
923
+ return false;
924
+ v = *p++;
925
+ }
926
+ // went too far, step back
927
+ if (dst > dst_end) {
928
+ int n = dst - dst_end;
929
+ do v &= (v - 1); while (--n);
930
+ }
931
+ // step back if byte not finished
932
+ int bitpos = 0;
933
+ if (!(v & 1)) {
934
+ p--;
935
+ unsigned long q;
936
+ _BitScanForward(&q, v);
937
+ bitpos = 8 - q;
938
+ }
939
+ br->p = p;
940
+ br->bitpos = bitpos;
941
+ return true;
942
+ }
943
+
944
+ bool DecodeGolombRiceBits(uint8 *dst, uint size, uint bitcount, BitReader2 *br) {
945
+ if (bitcount == 0)
946
+ return true;
947
+ uint8 *dst_end = dst + size;
948
+ const uint8 *p = br->p;
949
+ int bitpos = br->bitpos;
950
+
951
+ uint bits_required = bitpos + bitcount * size;
952
+ uint bytes_required = (bits_required + 7) >> 3;
953
+ if (bytes_required > br->p_end - p)
954
+ return false;
955
+
956
+ br->p = p + (bits_required >> 3);
957
+ br->bitpos = bits_required & 7;
958
+
959
+ // todo. handle r/w outside of range
960
+ uint64 bak = *(uint64*)dst_end;
961
+
962
+ if (bitcount < 2) {
963
+ assert(bitcount == 1);
964
+ do {
965
+ // Read the next byte
966
+ uint64 bits = (uint8)(_byteswap_ulong(*(uint32*)p) >> (24 - bitpos));
967
+ p += 1;
968
+ // Expand each bit into each byte of the uint64.
969
+ bits = (bits | (bits << 28)) & 0xF0000000Full;
970
+ bits = (bits | (bits << 14)) & 0x3000300030003ull;
971
+ bits = (bits | (bits << 7)) & 0x0101010101010101ull;
972
+ *(uint64*)dst = *(uint64*)dst * 2 + _byteswap_uint64(bits);
973
+ dst += 8;
974
+ } while (dst < dst_end);
975
+ } else if (bitcount == 2) {
976
+ do {
977
+ // Read the next 2 bytes
978
+ uint64 bits = (uint16)(_byteswap_ulong(*(uint32*)p) >> (16 - bitpos));
979
+ p += 2;
980
+ // Expand each bit into each byte of the uint64.
981
+ bits = (bits | (bits << 24)) & 0xFF000000FFull;
982
+ bits = (bits | (bits << 12)) & 0xF000F000F000Full;
983
+ bits = (bits | (bits << 6)) & 0x0303030303030303ull;
984
+ *(uint64*)dst = *(uint64*)dst * 4 + _byteswap_uint64(bits);
985
+ dst += 8;
986
+ } while (dst < dst_end);
987
+
988
+ } else {
989
+ assert(bitcount == 3);
990
+ do {
991
+ // Read the next 3 bytes
992
+ uint64 bits = (_byteswap_ulong(*(uint32*)p) >> (8 - bitpos)) & 0xffffff;
993
+ p += 3;
994
+ // Expand each bit into each byte of the uint64.
995
+ bits = (bits | (bits << 20)) & 0xFFF00000FFFull;
996
+ bits = (bits | (bits << 10)) & 0x3F003F003F003Full;
997
+ bits = (bits | (bits << 5)) & 0x0707070707070707ull;
998
+ *(uint64*)dst = *(uint64*)dst * 8 + _byteswap_uint64(bits);
999
+ dst += 8;
1000
+ } while (dst < dst_end);
1001
+ }
1002
+ *(uint64*)dst_end = bak;
1003
+ return true;
1004
+ }
1005
+
1006
+ struct HuffRange {
1007
+ uint16 symbol;
1008
+ uint16 num;
1009
+ };
1010
+
1011
+ int Huff_ConvertToRanges(HuffRange *range, int num_symbols, int P, const uint8 *symlen, BitReader *bits) {
1012
+ int num_ranges = P >> 1, v, sym_idx = 0;
1013
+
1014
+ // Start with space?
1015
+ if (P & 1) {
1016
+ BitReader_Refill(bits);
1017
+ v = *symlen++;
1018
+ if (v >= 8)
1019
+ return -1;
1020
+ sym_idx = BitReader_ReadBitsNoRefill(bits, v + 1) + (1 << (v + 1)) - 1;
1021
+ }
1022
+ int syms_used = 0;
1023
+
1024
+ for (int i = 0; i < num_ranges; i++) {
1025
+ BitReader_Refill(bits);
1026
+ v = symlen[0];
1027
+ if (v >= 9)
1028
+ return -1;
1029
+ int num = BitReader_ReadBitsNoRefillZero(bits, v) + (1 << v);
1030
+ v = symlen[1];
1031
+ if (v >= 8)
1032
+ return -1;
1033
+ int space = BitReader_ReadBitsNoRefill(bits, v + 1) + (1 << (v + 1)) - 1;
1034
+ range[i].symbol = sym_idx;
1035
+ range[i].num = num;
1036
+ syms_used += num;
1037
+ sym_idx += num + space;
1038
+ symlen += 2;
1039
+ }
1040
+
1041
+ if (sym_idx >= 256 || syms_used >= num_symbols || sym_idx + num_symbols - syms_used > 256)
1042
+ return -1;
1043
+
1044
+ range[num_ranges].symbol = sym_idx;
1045
+ range[num_ranges].num = num_symbols - syms_used;
1046
+
1047
+ return num_ranges + 1;
1048
+ }
1049
+
1050
+ int Huff_ReadCodeLengthsNew(BitReader *bits, uint8 *syms, uint32 *code_prefix) {
1051
+ int forced_bits = BitReader_ReadBitsNoRefill(bits, 2);
1052
+
1053
+ int num_symbols = BitReader_ReadBitsNoRefill(bits, 8) + 1;
1054
+
1055
+ int fluff = BitReader_ReadFluff(bits, num_symbols);
1056
+
1057
+ uint8 code_len[512];
1058
+ BitReader2 br2;
1059
+ br2.bitpos = (bits->bitpos - 24) & 7;
1060
+ br2.p_end = bits->p_end;
1061
+ br2.p = bits->p - (unsigned)((24 - bits->bitpos + 7) >> 3);
1062
+
1063
+ if (!DecodeGolombRiceLengths(code_len, num_symbols + fluff, &br2))
1064
+ return -1;
1065
+ memset(code_len + (num_symbols + fluff), 0, 16);
1066
+ if (!DecodeGolombRiceBits(code_len, num_symbols, forced_bits, &br2))
1067
+ return -1;
1068
+
1069
+ // Reset the bits decoder.
1070
+ bits->bitpos = 24;
1071
+ bits->p = br2.p;
1072
+ bits->bits = 0;
1073
+ BitReader_Refill(bits);
1074
+ bits->bits <<= br2.bitpos;
1075
+ bits->bitpos += br2.bitpos;
1076
+
1077
+ if (1) {
1078
+ uint running_sum = 0x1e;
1079
+ int maxlen = 11;
1080
+ for (int i = 0; i < num_symbols; i++) {
1081
+ int v = code_len[i];
1082
+ v = -(int)(v & 1) ^ (v >> 1);
1083
+ code_len[i] = v + (running_sum >> 2) + 1;
1084
+ if (code_len[i] < 1 || code_len[i] > 11)
1085
+ return -1;
1086
+ running_sum += v;
1087
+ }
1088
+
1089
+ } else {
1090
+ // Ensure we don't read unknown data that could contaminate
1091
+ // max_codeword_len.
1092
+ __m128i bak = _mm_loadu_si128((__m128i*)&code_len[num_symbols]);
1093
+ _mm_storeu_si128((__m128i*)&code_len[num_symbols], _mm_set1_epi32(0));
1094
+ // apply a filter
1095
+ __m128i avg = _mm_set1_epi8(0x1e);
1096
+ __m128i ones = _mm_set1_epi8(1);
1097
+ __m128i max_codeword_len = _mm_set1_epi8(10);
1098
+ for (uint i = 0; i < num_symbols; i += 16) {
1099
+ __m128i v = _mm_loadu_si128((__m128i*)&code_len[i]), t;
1100
+ // avg[0..15] = avg[15]
1101
+ avg = _mm_unpackhi_epi8(avg, avg);
1102
+ avg = _mm_unpackhi_epi8(avg, avg);
1103
+ avg = _mm_shuffle_epi32(avg, 255);
1104
+ // v = -(int)(v & 1) ^ (v >> 1)
1105
+ v = _mm_xor_si128(_mm_sub_epi8(_mm_set1_epi8(0), _mm_and_si128(v, ones)),
1106
+ _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(0x7f)));
1107
+ // create all the sums. v[n] = v[0] + ... + v[n]
1108
+ t = _mm_add_epi8(_mm_slli_si128(v, 1), v);
1109
+ t = _mm_add_epi8(_mm_slli_si128(t, 2), t);
1110
+ t = _mm_add_epi8(_mm_slli_si128(t, 4), t);
1111
+ t = _mm_add_epi8(_mm_slli_si128(t, 8), t);
1112
+ // u[x] = (avg + t[x-1]) >> 2
1113
+ __m128i u = _mm_and_si128(_mm_srli_epi16(_mm_add_epi8(_mm_slli_si128(t, 1), avg), 2u), _mm_set1_epi8(0x3f));
1114
+ // v += u
1115
+ v = _mm_add_epi8(v, u);
1116
+ // avg += t
1117
+ avg = _mm_add_epi8(avg, t);
1118
+ // max_codeword_len = max(max_codeword_len, v)
1119
+ max_codeword_len = _mm_max_epu8(max_codeword_len, v);
1120
+ // mem[] = v+1
1121
+ _mm_storeu_si128((__m128i*)&code_len[i], _mm_add_epi8(v, _mm_set1_epi8(1)));
1122
+ }
1123
+ _mm_storeu_si128((__m128i*)&code_len[num_symbols], bak);
1124
+ if (_mm_movemask_epi8(_mm_cmpeq_epi8(max_codeword_len, _mm_set1_epi8(10))) != 0xffff)
1125
+ return -1; // codeword too big?
1126
+ }
1127
+
1128
+ HuffRange range[128];
1129
+ int ranges = Huff_ConvertToRanges(range, num_symbols, fluff, &code_len[num_symbols], bits);
1130
+ if (ranges <= 0)
1131
+ return -1;
1132
+
1133
+ uint8 *cp = code_len;
1134
+ for (int i = 0; i < ranges; i++) {
1135
+ int sym = range[i].symbol;
1136
+ int n = range[i].num;
1137
+ do {
1138
+ syms[code_prefix[*cp++]++] = sym++;
1139
+ } while (--n);
1140
+ }
1141
+
1142
+ return num_symbols;
1143
+ }
1144
+
1145
+ struct NewHuffLut {
1146
+ // Mapping that maps a bit pattern to a code length.
1147
+ uint8 bits2len[2048 + 16];
1148
+ // Mapping that maps a bit pattern to a symbol.
1149
+ uint8 bits2sym[2048 + 16];
1150
+ };
1151
+
1152
+ // May overflow 16 bytes past the end
1153
+ void FillByteOverflow16(uint8 *dst, uint8 v, size_t n) {
1154
+ memset(dst, v, n);
1155
+ }
1156
+
1157
+ bool Huff_MakeLut(const uint32 *prefix_org, const uint32 *prefix_cur, NewHuffLut *hufflut, uint8 *syms) {
1158
+ uint32 currslot = 0;
1159
+ for(uint32 i = 1; i < 11; i++) {
1160
+ uint32 start = prefix_org[i];
1161
+ uint32 count = prefix_cur[i] - start;
1162
+ if (count) {
1163
+ uint32 stepsize = 1 << (11 - i);
1164
+ uint32 num_to_set = count << (11 - i);
1165
+ if (currslot + num_to_set > 2048)
1166
+ return false;
1167
+ FillByteOverflow16(&hufflut->bits2len[currslot], i, num_to_set);
1168
+
1169
+ uint8 *p = &hufflut->bits2sym[currslot];
1170
+ for (uint32 j = 0; j != count; j++, p += stepsize)
1171
+ FillByteOverflow16(p, syms[start + j], stepsize);
1172
+ currslot += num_to_set;
1173
+ }
1174
+ }
1175
+ if (prefix_cur[11] - prefix_org[11] != 0) {
1176
+ uint32 num_to_set = prefix_cur[11] - prefix_org[11];
1177
+ if (currslot + num_to_set > 2048)
1178
+ return false;
1179
+ FillByteOverflow16(&hufflut->bits2len[currslot], 11, num_to_set);
1180
+ memcpy(&hufflut->bits2sym[currslot], &syms[prefix_org[11]], num_to_set);
1181
+ currslot += num_to_set;
1182
+ }
1183
+ return currslot == 2048;
1184
+ }
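/* In the LUT built above, a symbol with code length i occupies
   2^(11 - i) consecutive slots, so a complete canonical code fills
   exactly 2048 slots; the final check is effectively a Kraft-sum
   test on the supplied code lengths.                               */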
1185
+
1186
+ int Kraken_DecodeBytes_Type12(const byte *src, size_t src_size, byte *output, int output_size, int type) {
1187
+ BitReader bits;
1188
+ int half_output_size;
1189
+ uint32 split_left, split_mid, split_right;
1190
+ const byte *src_mid;
1191
+ NewHuffLut huff_lut;
1192
+ HuffReader hr;
1193
+ HuffRevLut rev_lut;
1194
+ const uint8 *src_end = src + src_size;
1195
+
1196
+ bits.bitpos = 24;
1197
+ bits.bits = 0;
1198
+ bits.p = src;
1199
+ bits.p_end = src_end;
1200
+ BitReader_Refill(&bits);
1201
+
1202
+ static const uint32 code_prefix_org[12] = { 0x0, 0x0, 0x2, 0x6, 0xE, 0x1E, 0x3E, 0x7E, 0xFE, 0x1FE, 0x2FE, 0x3FE };
1203
+ uint32 code_prefix[12] = { 0x0, 0x0, 0x2, 0x6, 0xE, 0x1E, 0x3E, 0x7E, 0xFE, 0x1FE, 0x2FE, 0x3FE };
1204
+ uint8 syms[1280];
1205
+ int num_syms;
1206
+ if (!BitReader_ReadBitNoRefill(&bits)) {
1207
+ num_syms = Huff_ReadCodeLengthsOld(&bits, syms, code_prefix);
1208
+ } else if (!BitReader_ReadBitNoRefill(&bits)) {
1209
+ num_syms = Huff_ReadCodeLengthsNew(&bits, syms, code_prefix);
1210
+ } else {
1211
+ return -1;
1212
+ }
1213
+
1214
+ if (num_syms < 1)
1215
+ return -1;
1216
+ src = bits.p - ((24 - bits.bitpos) / 8);
1217
+
1218
+ if (num_syms == 1) {
1219
+ memset(output, syms[0], output_size);
1220
+ return src - src_end;
1221
+ }
1222
+
1223
+ if (!Huff_MakeLut(code_prefix_org, code_prefix, &huff_lut, syms))
1224
+ return -1;
1225
+
1226
+ ReverseBitsArray2048(huff_lut.bits2len, rev_lut.bits2len);
1227
+ ReverseBitsArray2048(huff_lut.bits2sym, rev_lut.bits2sym);
1228
+
1229
+ if (type == 1) {
1230
+ if (src + 3 > src_end)
1231
+ return -1;
1232
+ split_mid = *(uint16*)src;
1233
+ src += 2;
1234
+ hr.output = output;
1235
+ hr.output_end = output + output_size;
1236
+ hr.src = src;
1237
+ hr.src_end = src_end;
1238
+ hr.src_mid_org = hr.src_mid = src + split_mid;
1239
+ hr.src_bitpos = 0;
1240
+ hr.src_bits = 0;
1241
+ hr.src_mid_bitpos = 0;
1242
+ hr.src_mid_bits = 0;
1243
+ hr.src_end_bitpos = 0;
1244
+ hr.src_end_bits = 0;
1245
+ if (!Kraken_DecodeBytesCore(&hr, &rev_lut))
1246
+ return -1;
1247
+ } else {
1248
+ if (src + 6 > src_end)
1249
+ return -1;
1250
+
1251
+ half_output_size = (output_size + 1) >> 1;
1252
+ split_mid = *(uint32*)src & 0xFFFFFF;
1253
+ src += 3;
1254
+ if (split_mid > (src_end - src))
1255
+ return -1;
1256
+ src_mid = src + split_mid;
1257
+ split_left = *(uint16*)src;
1258
+ src += 2;
1259
+ if (src_mid - src < split_left + 2 || src_end - src_mid < 3)
1260
+ return -1;
1261
+ split_right = *(uint16*)src_mid;
1262
+ if (src_end - (src_mid + 2) < split_right + 2)
1263
+ return -1;
1264
+
1265
+ hr.output = output;
1266
+ hr.output_end = output + half_output_size;
1267
+ hr.src = src;
1268
+ hr.src_end = src_mid;
1269
+ hr.src_mid_org = hr.src_mid = src + split_left;
1270
+ hr.src_bitpos = 0;
1271
+ hr.src_bits = 0;
1272
+ hr.src_mid_bitpos = 0;
1273
+ hr.src_mid_bits = 0;
1274
+ hr.src_end_bitpos = 0;
1275
+ hr.src_end_bits = 0;
1276
+ if (!Kraken_DecodeBytesCore(&hr, &rev_lut))
1277
+ return -1;
1278
+
1279
+ hr.output = output + half_output_size;
1280
+ hr.output_end = output + output_size;
1281
+ hr.src = src_mid + 2;
1282
+ hr.src_end = src_end;
1283
+ hr.src_mid_org = hr.src_mid = src_mid + 2 + split_right;
1284
+ hr.src_bitpos = 0;
1285
+ hr.src_bits = 0;
1286
+ hr.src_mid_bitpos = 0;
1287
+ hr.src_mid_bits = 0;
1288
+ hr.src_end_bitpos = 0;
1289
+ hr.src_end_bits = 0;
1290
+ if (!Kraken_DecodeBytesCore(&hr, &rev_lut))
1291
+ return -1;
1292
+ }
1293
+ return (int)src_size;
1294
+ }
1295
+
1296
+ static uint32 bitmasks[32] = {
1297
+ 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff,
1298
+ 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff,
1299
+ 0x1ffff, 0x3ffff, 0x7ffff, 0xfffff, 0x1fffff, 0x3fffff, 0x7fffff,
1300
+ 0xffffff, 0x1ffffff, 0x3ffffff, 0x7ffffff, 0xfffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff
1301
+ };
1302
+
1303
+ int Kraken_DecodeMultiArray(const uint8 *src, const uint8 *src_end,
1304
+ uint8 *dst, uint8 *dst_end,
1305
+ uint8 **array_data, int *array_lens, int array_count,
1306
+ int *total_size_out, bool force_memmove, uint8 *scratch, uint8 *scratch_end) {
1307
+ const uint8 *src_org = src;
1308
+
1309
+ if (src_end - src < 4)
1310
+ return -1;
1311
+
1312
+ int decoded_size;
1313
+ int num_arrays_in_file = *src++;
1314
+ if (!(num_arrays_in_file & 0x80))
1315
+ return -1;
1316
+ num_arrays_in_file &= 0x3f;
1317
+
1318
+ if (dst == scratch) {
1319
+ // todo: ensure scratch space first?
1320
+ scratch += (scratch_end - scratch - 0xc000) >> 1;
1321
+ dst_end = scratch;
1322
+ }
1323
+
1324
+ int total_size = 0;
1325
+
1326
+ if (num_arrays_in_file == 0) {
1327
+ for (int i = 0; i < array_count; i++) {
1328
+ uint8 *chunk_dst = dst;
1329
+ int dec = Kraken_DecodeBytes(&chunk_dst, src, src_end, &decoded_size, dst_end - dst, force_memmove, scratch, scratch_end);
1330
+ if (dec < 0)
1331
+ return -1;
1332
+ dst += decoded_size;
1333
+ array_lens[i] = decoded_size;
1334
+ array_data[i] = chunk_dst;
1335
+ src += dec;
1336
+ total_size += decoded_size;
1337
+ }
1338
+ *total_size_out = total_size;
1339
+ return src - src_org; // not supported yet
1340
+ }
1341
+
1342
+ uint8 *entropy_array_data[32];
1343
+ uint32 entropy_array_size[32];
1344
+
1345
+ // First loop just decodes everything to scratch
1346
+ uint8 *scratch_cur = scratch;
1347
+
1348
+ for(int i = 0; i < num_arrays_in_file; i++) {
1349
+ uint8 *chunk_dst = scratch_cur;
1350
+ int dec = Kraken_DecodeBytes(&chunk_dst, src, src_end, &decoded_size, scratch_end - scratch_cur, force_memmove, scratch_cur, scratch_end);
1351
+ if (dec < 0)
1352
+ return -1;
1353
+ entropy_array_data[i] = chunk_dst;
1354
+ entropy_array_size[i] = decoded_size;
1355
+ scratch_cur += decoded_size;
1356
+ total_size += decoded_size;
1357
+ src += dec;
1358
+ }
1359
+ *total_size_out = total_size;
1360
+
1361
+ if (src_end - src < 3)
1362
+ return -1;
1363
+
1364
+ int Q = *(uint16*)src;
1365
+ src += 2;
1366
+
1367
+ int out_size;
1368
+ if (Kraken_GetBlockSize(src, src_end, &out_size, total_size) < 0)
1369
+ return -1;
1370
+ int num_indexes = out_size;
1371
+
1372
+ int num_lens = num_indexes - array_count;
1373
+ if (num_lens < 1)
1374
+ return -1;
1375
+
1376
+ if (scratch_end - scratch_cur < num_indexes)
1377
+ return -1;
1378
+ uint8 *interval_lenlog2 = scratch_cur;
1379
+ scratch_cur += num_indexes;
1380
+
1381
+ if (scratch_end - scratch_cur < num_indexes)
1382
+ return -1;
1383
+ uint8 *interval_indexes = scratch_cur;
1384
+ scratch_cur += num_indexes;
1385
+
1386
+
1387
+ if (Q & 0x8000) {
1388
+ int size_out;
1389
+ int n = Kraken_DecodeBytes(&interval_indexes, src, src_end, &size_out, num_indexes, false, scratch_cur, scratch_end);
1390
+ if (n < 0 || size_out != num_indexes)
1391
+ return -1;
1392
+ src += n;
1393
+
1394
+ for (int i = 0; i < num_indexes; i++) {
1395
+ int t = interval_indexes[i];
1396
+ interval_lenlog2[i] = t >> 4;
1397
+ interval_indexes[i] = t & 0xF;
1398
+ }
1399
+
1400
+ num_lens = num_indexes;
1401
+ } else {
1402
+ int lenlog2_chunksize = num_indexes - array_count;
1403
+
1404
+ int size_out;
1405
+ int n = Kraken_DecodeBytes(&interval_indexes, src, src_end, &size_out, num_indexes, false, scratch_cur, scratch_end);
1406
+ if (n < 0 || size_out != num_indexes)
1407
+ return -1;
1408
+ src += n;
1409
+
1410
+ n = Kraken_DecodeBytes(&interval_lenlog2, src, src_end, &size_out, lenlog2_chunksize, false, scratch_cur, scratch_end);
1411
+ if (n < 0 || size_out != lenlog2_chunksize)
1412
+ return -1;
1413
+ src += n;
1414
+
1415
+ for (int i = 0; i < lenlog2_chunksize; i++)
1416
+ if (interval_lenlog2[i] > 16)
1417
+ return -1;
1418
+ }
1419
+
1420
+ if (scratch_end - scratch_cur < 4)
1421
+ return -1;
1422
+
1423
+ scratch_cur = ALIGN_POINTER(scratch_cur, 4);
1424
+ if (scratch_end - scratch_cur < num_lens * 4)
1425
+ return -1;
1426
+ uint32 *decoded_intervals = (uint32*)scratch_cur;
1427
+
1428
+ int varbits_complen = Q & 0x3FFF;
1429
+ if (src_end - src < varbits_complen)
1430
+ return -1;
1431
+
1432
+ const uint8 *f = src;
1433
+ uint32 bits_f = 0;
1434
+ int bitpos_f = 24;
1435
+
1436
+ const uint8 *src_end_actual = src + varbits_complen;
1437
+
1438
+ const uint8 *b = src_end_actual;
1439
+ uint32 bits_b = 0;
1440
+ int bitpos_b = 24;
1441
+
1442
+
1443
+ int i;
1444
+ for (i = 0; i + 2 <= num_lens; i += 2) {
1445
+ bits_f |= _byteswap_ulong(*(uint32*)f) >> (24 - bitpos_f);
1446
+ f += (bitpos_f + 7) >> 3;
1447
+
1448
+ bits_b |= ((uint32*)b)[-1] >> (24 - bitpos_b);
1449
+ b -= (bitpos_b + 7) >> 3;
1450
+
1451
+ int numbits_f = interval_lenlog2[i + 0];
1452
+ int numbits_b = interval_lenlog2[i + 1];
1453
+
1454
+ bits_f = _rotl(bits_f | 1, numbits_f);
1455
+ bitpos_f += numbits_f - 8 * ((bitpos_f + 7) >> 3);
1456
+
1457
+ bits_b = _rotl(bits_b | 1, numbits_b);
1458
+ bitpos_b += numbits_b - 8 * ((bitpos_b + 7) >> 3);
1459
+
1460
+ int value_f = bits_f & bitmasks[numbits_f];
1461
+ bits_f &= ~bitmasks[numbits_f];
1462
+
1463
+ int value_b = bits_b & bitmasks[numbits_b];
1464
+ bits_b &= ~bitmasks[numbits_b];
1465
+
1466
+ decoded_intervals[i + 0] = value_f;
1467
+ decoded_intervals[i + 1] = value_b;
1468
+ }
1469
+
1470
+ // read final one since above loop reads 2
1471
+ if (i < num_lens) {
1472
+ bits_f |= _byteswap_ulong(*(uint32*)f) >> (24 - bitpos_f);
1473
+ int numbits_f = interval_lenlog2[i];
1474
+ bits_f = _rotl(bits_f | 1, numbits_f);
1475
+ int value_f = bits_f & bitmasks[numbits_f];
1476
+ decoded_intervals[i + 0] = value_f;
1477
+ }
1478
+
1479
+ if (interval_indexes[num_indexes - 1])
1480
+ return -1;
1481
+
1482
+ int indi = 0, leni = 0, source;
1483
+ int increment_leni = (Q & 0x8000) != 0;
1484
+
1485
+ for(int arri = 0; arri < array_count; arri++) {
1486
+ array_data[arri] = dst;
1487
+ if (indi >= num_indexes)
1488
+ return -1;
1489
+
1490
+ while ((source = interval_indexes[indi++]) != 0) {
1491
+ if (source > num_arrays_in_file)
1492
+ return -1;
1493
+ if (leni >= num_lens)
1494
+ return -1;
1495
+ int cur_len = decoded_intervals[leni++];
1496
+ int bytes_left = entropy_array_size[source - 1];
1497
+ if (cur_len > bytes_left || cur_len > dst_end - dst)
1498
+ return -1;
1499
+ uint8 *blksrc = entropy_array_data[source - 1];
1500
+ entropy_array_size[source - 1] -= cur_len;
1501
+ entropy_array_data[source - 1] += cur_len;
1502
+ uint8 *dstx = dst;
1503
+ dst += cur_len;
1504
+ memcpy(dstx, blksrc, cur_len);
1505
+ }
1506
+ leni += increment_leni;
1507
+ array_lens[arri] = dst - array_data[arri];
1508
+ }
1509
+
1510
+ if (indi != num_indexes || leni != num_lens)
1511
+ return -1;
1512
+
1513
+ for (int i = 0; i < num_arrays_in_file; i++) {
1514
+ if (entropy_array_size[i])
1515
+ return -1;
1516
+ }
1517
+ return src_end_actual - src_org;
1518
+ }
1519
+
1520
+ int Krak_DecodeRecursive(const byte *src, size_t src_size, byte *output, int output_size, uint8 *scratch, uint8 *scratch_end) {
1521
+ const uint8 *src_org = src;
1522
+ byte *output_end = output + output_size;
1523
+ const byte *src_end = src + src_size;
1524
+
1525
+ if (src_size < 6)
1526
+ return -1;
1527
+
1528
+ int n = src[0] & 0x7f;
1529
+ if (n < 2)
1530
+ return -1;
1531
+
1532
+ if (!(src[0] & 0x80)) {
1533
+ src++;
1534
+ do {
1535
+ int decoded_size;
1536
+ int dec = Kraken_DecodeBytes(&output, src, src_end, &decoded_size, output_end - output, true, scratch, scratch_end);
1537
+ if (dec < 0)
1538
+ return -1;
1539
+ output += decoded_size;
1540
+ src += dec;
1541
+ } while (--n);
1542
+ if (output != output_end)
1543
+ return -1;
1544
+ return src - src_org;
1545
+ } else {
1546
+ uint8 *array_data;
1547
+ int array_len, decoded_size;
1548
+ int dec = Kraken_DecodeMultiArray(src, src_end, output, output_end, &array_data, &array_len, 1, &decoded_size, true, scratch, scratch_end);
1549
+ if (dec < 0)
1550
+ return -1;
1551
+ output += decoded_size;
1552
+ if (output != output_end)
1553
+ return -1;
1554
+ return dec;
1555
+ }
1556
+ }
1557
+
1558
+ int Krak_DecodeRLE(const byte *src, size_t src_size, byte *dst, int dst_size, uint8 *scratch, uint8 *scratch_end) {
1559
+ if (src_size <= 1) {
1560
+ if (src_size != 1)
1561
+ return -1;
1562
+ memset(dst, src[0], dst_size);
1563
+ return 1;
1564
+ }
1565
+ uint8 *dst_end = dst + dst_size;
1566
+ const uint8 *cmd_ptr = src + 1, *cmd_ptr_end = src + src_size;
1567
+ // Unpack the first X bytes of the command buffer?
1568
+ if (src[0]) {
1569
+ uint8 *dst_ptr = scratch;
1570
+ int dec_size;
1571
+ int n = Kraken_DecodeBytes(&dst_ptr, src, src + src_size, &dec_size, scratch_end - scratch, true, scratch, scratch_end);
1572
+ if (n <= 0)
1573
+ return -1;
1574
+ int cmd_len = src_size - n + dec_size;
1575
+ if (cmd_len > scratch_end - scratch)
1576
+ return -1;
1577
+ memcpy(dst_ptr + dec_size, src + n, src_size - n);
1578
+ cmd_ptr = dst_ptr;
1579
+ cmd_ptr_end = &dst_ptr[cmd_len];
1580
+ }
1581
+
1582
+ int rle_byte = 0;
1583
+
1584
+ while (cmd_ptr < cmd_ptr_end) {
1585
+ uint32 cmd = cmd_ptr_end[-1];
1586
+ if (cmd - 1 >= 0x2f) {
1587
+ cmd_ptr_end--;
1588
+ uint32 bytes_to_copy = (-1 - cmd) & 0xF;
1589
+ uint32 bytes_to_rle = cmd >> 4;
1590
+ if (dst_end - dst < bytes_to_copy + bytes_to_rle || cmd_ptr_end - cmd_ptr < bytes_to_copy)
1591
+ return -1;
1592
+ memcpy(dst, cmd_ptr, bytes_to_copy);
1593
+ cmd_ptr += bytes_to_copy;
1594
+ dst += bytes_to_copy;
1595
+ memset(dst, rle_byte, bytes_to_rle);
1596
+ dst += bytes_to_rle;
1597
+ } else if (cmd >= 0x10) {
1598
+ uint32 data = *(uint16*)(cmd_ptr_end - 2) - 4096;
1599
+ cmd_ptr_end -= 2;
1600
+ uint32 bytes_to_copy = data & 0x3F;
1601
+ uint32 bytes_to_rle = data >> 6;
1602
+ if (dst_end - dst < bytes_to_copy + bytes_to_rle || cmd_ptr_end - cmd_ptr < bytes_to_copy)
1603
+ return -1;
1604
+ memcpy(dst, cmd_ptr, bytes_to_copy);
1605
+ cmd_ptr += bytes_to_copy;
1606
+ dst += bytes_to_copy;
1607
+ memset(dst, rle_byte, bytes_to_rle);
1608
+ dst += bytes_to_rle;
1609
+ } else if (cmd == 1) {
1610
+ rle_byte = *cmd_ptr++;
1611
+ cmd_ptr_end--;
1612
+ } else if (cmd >= 9) {
1613
+ uint32 bytes_to_rle = (*(uint16*)(cmd_ptr_end - 2) - 0x8ff) * 128;
1614
+ cmd_ptr_end -= 2;
1615
+ if (dst_end - dst < bytes_to_rle)
1616
+ return -1;
1617
+ memset(dst, rle_byte, bytes_to_rle);
1618
+ dst += bytes_to_rle;
1619
+ } else {
1620
+ uint32 bytes_to_copy = (*(uint16*)(cmd_ptr_end - 2) - 511) * 64;
1621
+ cmd_ptr_end -= 2;
1622
+ if (cmd_ptr_end - cmd_ptr < bytes_to_copy || dst_end - dst < bytes_to_copy)
1623
+ return -1;
1624
+ memcpy(dst, cmd_ptr, bytes_to_copy);
1625
+ dst += bytes_to_copy;
1626
+ cmd_ptr += bytes_to_copy;
1627
+ }
1628
+ }
1629
+ if (cmd_ptr_end != cmd_ptr)
1630
+ return -1;
1631
+
1632
+ if (dst != dst_end)
1633
+ return -1;
1634
+
1635
+ return src_size;
1636
+ }
1637
+
1638
+ struct TansData {
1639
+ uint32 A_used;
1640
+ uint32 B_used;
1641
+ uint8 A[256];
1642
+ uint32 B[256];
1643
+ };
1644
+
1645
+ template<typename T> void SimpleSort(T *p, T *pend) {
1646
+ if (p != pend) {
1647
+ for (T *lp = p + 1, *rp; lp != pend; lp++) {
1648
+ T t = lp[0];
1649
+ for (rp = lp; rp > p && t < rp[-1]; rp--)
1650
+ rp[0] = rp[-1];
1651
+ rp[0] = t;
1652
+ }
1653
+ }
1654
+ }
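/* SimpleSort is a plain insertion sort over [p, pend); it is used just
   below to order the small tANS symbol tables, e.g. SimpleSort(a, a + n). */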
1655
+
1656
+ bool Tans_DecodeTable(BitReader *bits, int L_bits, TansData *tans_data) {
1657
+ BitReader_Refill(bits);
1658
+ if (BitReader_ReadBitNoRefill(bits)) {
1659
+ int Q = BitReader_ReadBitsNoRefill(bits, 3);
1660
+ int num_symbols = BitReader_ReadBitsNoRefill(bits, 8) + 1;
1661
+ if (num_symbols < 2)
1662
+ return false;
1663
+ int fluff = BitReader_ReadFluff(bits, num_symbols);
1664
+ int total_rice_values = fluff + num_symbols;
1665
+ uint8 rice[512 + 16];
1666
+ BitReader2 br2;
1667
+
1668
+ // another bit reader...
1669
+ br2.p = bits->p - ((uint)(24 - bits->bitpos + 7) >> 3);
1670
+ br2.p_end = bits->p_end;
1671
+ br2.bitpos = (bits->bitpos - 24) & 7;
1672
+
1673
+ if (!DecodeGolombRiceLengths(rice, total_rice_values, &br2))
1674
+ return false;
1675
+ memset(rice + total_rice_values, 0, 16);
1676
+
1677
+ // Switch back to other bitreader impl
1678
+ bits->bitpos = 24;
1679
+ bits->p = br2.p;
1680
+ bits->bits = 0;
1681
+ BitReader_Refill(bits);
1682
+ bits->bits <<= br2.bitpos;
1683
+ bits->bitpos += br2.bitpos;
1684
+
1685
+ HuffRange range[133];
1686
+ fluff = Huff_ConvertToRanges(range, num_symbols, fluff, &rice[num_symbols], bits);
1687
+ if (fluff < 0)
1688
+ return false;
1689
+
1690
+ BitReader_Refill(bits);
1691
+
1692
+ uint32 L = 1 << L_bits;
1693
+ uint8 *cur_rice_ptr = rice;
1694
+ int average = 6;
1695
+ int somesum = 0;
1696
+ uint8 *tanstable_A = tans_data->A;
1697
+ uint32 *tanstable_B = tans_data->B;
1698
+
1699
+ for (int ri = 0; ri < fluff; ri++) {
1700
+ int symbol = range[ri].symbol;
1701
+ int num = range[ri].num;
1702
+ do {
1703
+ BitReader_Refill(bits);
1704
+
1705
+ int nextra = Q + *cur_rice_ptr++;
1706
+ if (nextra > 15)
1707
+ return false;
1708
+ int v = BitReader_ReadBitsNoRefillZero(bits, nextra) + (1 << nextra) - (1 << Q);
1709
+
1710
+ int average_div4 = average >> 2;
1711
+ int limit = 2 * average_div4;
1712
+ if (v <= limit)
1713
+ v = average_div4 + (-(v & 1) ^ ((uint32)v >> 1));
1714
+ if (limit > v)
1715
+ limit = v;
1716
+ v += 1;
1717
+ average += limit - average_div4;
1718
+ *tanstable_A = symbol;
1719
+ *tanstable_B = (symbol << 16) + v;
1720
+ tanstable_A += (v == 1);
1721
+ tanstable_B += v >= 2;
1722
+ somesum += v;
1723
+ symbol += 1;
1724
+ } while (--num);
1725
+ }
1726
+ tans_data->A_used = tanstable_A - tans_data->A;
1727
+ tans_data->B_used = tanstable_B - tans_data->B;
1728
+ if (somesum != L)
1729
+ return false;
1730
+
1731
+ return true;
1732
+ } else {
1733
+ bool seen[256];
1734
+ memset(seen, 0, sizeof(seen));
1735
+ uint32 L = 1 << L_bits;
1736
+
1737
+ int count = BitReader_ReadBitsNoRefill(bits, 3) + 1;
1738
+
1739
+ int bits_per_sym = BSR(L_bits) + 1;
1740
+ int max_delta_bits = BitReader_ReadBitsNoRefill(bits, bits_per_sym);
1741
+
1742
+ if (max_delta_bits == 0 || max_delta_bits > L_bits)
1743
+ return false;
1744
+
1745
+ uint8 *tanstable_A = tans_data->A;
1746
+ uint32 *tanstable_B = tans_data->B;
1747
+
1748
+ int weight = 0;
1749
+ int total_weights = 0;
1750
+
1751
+ do {
1752
+ BitReader_Refill(bits);
1753
+
1754
+ int sym = BitReader_ReadBitsNoRefill(bits, 8);
1755
+ if (seen[sym])
1756
+ return false;
1757
+
1758
+ int delta = BitReader_ReadBitsNoRefill(bits, max_delta_bits);
1759
+
1760
+ weight += delta;
1761
+
1762
+ if (weight == 0)
1763
+ return false;
1764
+
1765
+ seen[sym] = true;
1766
+ if (weight == 1) {
1767
+ *tanstable_A++ = sym;
1768
+ } else {
1769
+ *tanstable_B++ = (sym << 16) + weight;
1770
+ }
1771
+
1772
+ total_weights += weight;
1773
+ } while (--count);
1774
+
1775
+ BitReader_Refill(bits);
1776
+
1777
+ int sym = BitReader_ReadBitsNoRefill(bits, 8);
1778
+ if (seen[sym])
1779
+ return false;
1780
+
1781
+ if (L - total_weights < weight || L - total_weights <= 1)
1782
+ return false;
1783
+
1784
+ *tanstable_B++ = (sym << 16) + (L - total_weights);
1785
+
1786
+ tans_data->A_used = tanstable_A - tans_data->A;
1787
+ tans_data->B_used = tanstable_B - tans_data->B;
1788
+
1789
+ SimpleSort(tans_data->A, tanstable_A);
1790
+ SimpleSort(tans_data->B, tanstable_B);
1791
+ return true;
1792
+ }
1793
+ }
1794
+
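+ // One slot of the TANS decode table. For the current state, 'symbol' is
+ // emitted, 'bits_x' fresh bits b are read, and the next state becomes
+ // (b & x) + w, with x = (1 << bits_x) - 1 (see the TANS_*_ROUND macros in
+ // Tans_Decode below).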
1795
+ struct TansLutEnt {
1796
+ uint32 x;
1797
+ uint8 bits_x;
1798
+ uint8 symbol;
1799
+ uint16 w;
1800
+ };
1801
+
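+ // Builds the L = 1 << L_bits entry decode table from the decoded weights.
+ // Slots for symbols with weight >= 2 are spread across four interleaved
+ // quarters of the table via pointers[0..3]; the A_used weight-1 symbols take
+ // the final A_used slots and always reload a full L_bits bits.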
1802
+ void Tans_InitLut(TansData *tans_data, int L_bits, TansLutEnt *lut) {
1803
+ TansLutEnt *pointers[4];
1804
+
1805
+ int L = 1 << L_bits;
1806
+ int a_used = tans_data->A_used;
1807
+
1808
+ uint slots_left_to_alloc = L - a_used;
1809
+
1810
+ uint sa = slots_left_to_alloc >> 2;
1811
+ pointers[0] = lut;
1812
+ uint sb = sa + ((slots_left_to_alloc & 3) > 0);
1813
+ pointers[1] = lut + sb;
1814
+ sb += sa + ((slots_left_to_alloc & 3) > 1);
1815
+ pointers[2] = lut + sb;
1816
+ sb += sa + ((slots_left_to_alloc & 3) > 2);
1817
+ pointers[3] = lut + sb;
1818
+
1819
+ // Set up the single entries with weight=1
1820
+ {
1821
+ TansLutEnt *lut_singles = lut + slots_left_to_alloc, le;
1822
+ le.w = 0;
1823
+ le.bits_x = L_bits;
1824
+ le.x = (1 << L_bits) - 1;
1825
+ for (int i = 0; i < a_used; i++) {
1826
+ lut_singles[i] = le;
1827
+ lut_singles[i].symbol = tans_data->A[i];
1828
+ }
1829
+ }
1830
+
1831
+ // Set up the entries with weight >= 2
1832
+ int weights_sum = 0;
1833
+ for (int i = 0; i < tans_data->B_used; i++) {
1834
+ int weight = tans_data->B[i] & 0xffff;
1835
+ int symbol = tans_data->B[i] >> 16;
1836
+ if (weight > 4) {
1837
+ uint32 sym_bits = BSR(weight);
1838
+ int Z = L_bits - sym_bits;
1839
+ TansLutEnt le;
1840
+ le.symbol = symbol;
1841
+ le.bits_x = Z;
1842
+ le.x = (1 << Z) - 1;
1843
+ le.w = (L - 1) & (weight << Z);
1844
+ int what_to_add = 1 << Z;
1845
+ int X = (1 << (sym_bits + 1)) - weight;
1846
+
1847
+ for (int j = 0; j < 4; j++) {
1848
+ TansLutEnt *dst = pointers[j];
1849
+
1850
+ int Y = (weight + ((weights_sum - j - 1) & 3)) >> 2;
1851
+ if (X >= Y) {
1852
+ for(int n = Y; n; n--) {
1853
+ *dst++ = le;
1854
+ le.w += what_to_add;
1855
+ }
1856
+ X -= Y;
1857
+ } else {
1858
+ for (int n = X; n; n--) {
1859
+ *dst++ = le;
1860
+ le.w += what_to_add;
1861
+ }
1862
+ Z--;
1863
+
1864
+ what_to_add >>= 1;
1865
+ le.bits_x = Z;
1866
+ le.w = 0;
1867
+ le.x >>= 1;
1868
+ for (int n = Y - X; n; n--) {
1869
+ *dst++ = le;
1870
+ le.w += what_to_add;
1871
+ }
1872
+ X = weight;
1873
+ }
1874
+ pointers[j] = dst;
1875
+ }
1876
+ } else {
1877
+ assert(weight > 0);
1878
+ uint32 bits = ((1 << weight) - 1) << (weights_sum & 3);
1879
+ bits |= (bits >> 4);
1880
+ int n = weight, ww = weight;
1881
+ do {
1882
+ uint32 idx = BSF(bits);
1883
+ bits &= bits - 1;
1884
+ TansLutEnt *dst = pointers[idx]++;
1885
+ dst->symbol = symbol;
1886
+ uint32 weight_bits = BSR(ww);
1887
+ dst->bits_x = L_bits - weight_bits;
1888
+ dst->x = (1 << (L_bits - weight_bits)) - 1;
1889
+ dst->w = (L - 1) & (ww++ << (L_bits - weight_bits));
1890
+ } while (--n);
1891
+ }
1892
+ weights_sum += weight;
1893
+ }
1894
+ }
1895
+
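+ // State for the interleaved TANS decode: five states are advanced in turn,
+ // pulling bits alternately from a forward byte stream (ptr_f/bits_f) and a
+ // byte-swapped backward stream (ptr_b/bits_b). Tans_Decode requires the two
+ // cursors to meet exactly, and the five final states (each a byte value)
+ // become the last five output bytes.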
1896
+ struct TansDecoderParams {
1897
+ TansLutEnt *lut;
1898
+ uint8 *dst, *dst_end;
1899
+ const uint8 *ptr_f, *ptr_b;
1900
+ uint32 bits_f, bits_b;
1901
+ int bitpos_f, bitpos_b;
1902
+ uint32 state_0, state_1, state_2, state_3, state_4;
1903
+ };
1904
+
1905
+ bool Tans_Decode(TansDecoderParams *params) {
1906
+ TansLutEnt *lut = params->lut, *e;
1907
+ uint8 *dst = params->dst, *dst_end = params->dst_end;
1908
+ const uint8 *ptr_f = params->ptr_f, *ptr_b = params->ptr_b;
1909
+ uint32 bits_f = params->bits_f, bits_b = params->bits_b;
1910
+ int bitpos_f = params->bitpos_f, bitpos_b = params->bitpos_b;
1911
+ uint32 state_0 = params->state_0, state_1 = params->state_1;
1912
+ uint32 state_2 = params->state_2, state_3 = params->state_3;
1913
+ uint32 state_4 = params->state_4;
1914
+
1915
+ if (ptr_f > ptr_b)
1916
+ return false;
1917
+
1918
+ #define TANS_FORWARD_BITS() \
1919
+ bits_f |= *(uint32 *)ptr_f << bitpos_f; \
1920
+ ptr_f += (31 - bitpos_f) >> 3; \
1921
+ bitpos_f |= 24;
1922
+
1923
+ #define TANS_FORWARD_ROUND(state) \
1924
+ e = &lut[state]; \
1925
+ *dst++ = e->symbol; \
1926
+ bitpos_f -= e->bits_x; \
1927
+ state = (bits_f & e->x) + e->w; \
1928
+ bits_f >>= e->bits_x; \
1929
+ if (dst >= dst_end) \
1930
+ break;
1931
+
1932
+ #define TANS_BACKWARD_BITS() \
1933
+ bits_b |= _byteswap_ulong(((uint32 *)ptr_b)[-1]) << bitpos_b; \
1934
+ ptr_b -= (31 - bitpos_b) >> 3; \
1935
+ bitpos_b |= 24;
1936
+
1937
+ #define TANS_BACKWARD_ROUND(state) \
1938
+ e = &lut[state]; \
1939
+ *dst++ = e->symbol; \
1940
+ bitpos_b -= e->bits_x; \
1941
+ state = (bits_b & e->x) + e->w; \
1942
+ bits_b >>= e->bits_x; \
1943
+ if (dst >= dst_end) \
1944
+ break;
1945
+
1946
+ if (dst < dst_end) {
1947
+ for (;;) {
1948
+ TANS_FORWARD_BITS();
1949
+ TANS_FORWARD_ROUND(state_0);
1950
+ TANS_FORWARD_ROUND(state_1);
1951
+ TANS_FORWARD_BITS();
1952
+ TANS_FORWARD_ROUND(state_2);
1953
+ TANS_FORWARD_ROUND(state_3);
1954
+ TANS_FORWARD_BITS();
1955
+ TANS_FORWARD_ROUND(state_4);
1956
+ TANS_BACKWARD_BITS();
1957
+ TANS_BACKWARD_ROUND(state_0);
1958
+ TANS_BACKWARD_ROUND(state_1);
1959
+ TANS_BACKWARD_BITS();
1960
+ TANS_BACKWARD_ROUND(state_2);
1961
+ TANS_BACKWARD_ROUND(state_3);
1962
+ TANS_BACKWARD_BITS();
1963
+ TANS_BACKWARD_ROUND(state_4);
1964
+ }
1965
+ }
1966
+
1967
+ if (ptr_b - ptr_f + (bitpos_f >> 3) + (bitpos_b >> 3) != 0)
1968
+ return false;
1969
+
1970
+ uint32 states_or = state_0 | state_1 | state_2 | state_3 | state_4;
1971
+ if (states_or & ~0xFF)
1972
+ return false;
1973
+
1974
+ dst_end[0] = (uint8)state_0;
1975
+ dst_end[1] = (uint8)state_1;
1976
+ dst_end[2] = (uint8)state_2;
1977
+ dst_end[3] = (uint8)state_3;
1978
+ dst_end[4] = (uint8)state_4;
1979
+ return true;
1980
+ }
1981
+
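+ // TANS-coded chunk: one reserved bit, then 2 bits giving L_bits - 8 (so the
+ // decode table has 256..2048 entries), the weight table, and finally the bit
+ // streams holding the five interleaved decoder states and the payload.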
1982
+ int Krak_DecodeTans(const byte *src, size_t src_size, byte *dst, int dst_size, uint8 *scratch, uint8 *scratch_end) {
1983
+ if (src_size < 8 || dst_size < 5)
1984
+ return -1;
1985
+
1986
+ const uint8 *src_end = src + src_size;
1987
+
1988
+ BitReader br;
1989
+ TansData tans_data;
1990
+
1991
+ br.bitpos = 24;
1992
+ br.bits = 0;
1993
+ br.p = src;
1994
+ br.p_end = src_end;
1995
+ BitReader_Refill(&br);
1996
+
1997
+ // reserved bit
1998
+ if (BitReader_ReadBitNoRefill(&br))
1999
+ return -1;
2000
+
2001
+ int L_bits = BitReader_ReadBitsNoRefill(&br, 2) + 8;
2002
+
2003
+ if (!Tans_DecodeTable(&br, L_bits, &tans_data))
2004
+ return -1;
2005
+
2006
+ src = br.p - (24 - br.bitpos) / 8;
2007
+
2008
+ if (src >= src_end)
2009
+ return -1;
2010
+
2011
+ uint32 lut_space_required = ((sizeof(TansLutEnt) << L_bits) + 15) &~ 15;
2012
+ if (lut_space_required > (scratch_end - scratch))
2013
+ return -1;
2014
+
2015
+ TansDecoderParams params;
2016
+ params.dst = dst;
2017
+ params.dst_end = dst + dst_size - 5;
2018
+
2019
+ params.lut = (TansLutEnt *)ALIGN_POINTER(scratch, 16);
2020
+ Tans_InitLut(&tans_data, L_bits, params.lut);
2021
+
2022
+ // Read out the initial state
2023
+ uint32 L_mask = (1 << L_bits) - 1;
2024
+ uint32 bits_f = *(uint32*)src;
2025
+ src += 4;
2026
+ uint32 bits_b = _byteswap_ulong(*(uint32*)(src_end - 4));
2027
+ src_end -= 4;
2028
+ uint32 bitpos_f = 32, bitpos_b = 32;
2029
+
2030
+ // Read first two.
2031
+ params.state_0 = bits_f & L_mask;
2032
+ params.state_1 = bits_b & L_mask;
2033
+ bits_f >>= L_bits, bitpos_f -= L_bits;
2034
+ bits_b >>= L_bits, bitpos_b -= L_bits;
2035
+
2036
+ // Read next two.
2037
+ params.state_2 = bits_f & L_mask;
2038
+ params.state_3 = bits_b & L_mask;
2039
+ bits_f >>= L_bits, bitpos_f -= L_bits;
2040
+ bits_b >>= L_bits, bitpos_b -= L_bits;
2041
+
2042
+ // Refill more bits
2043
+ bits_f |= *(uint32 *)src << bitpos_f;
2044
+ src += (31 - bitpos_f) >> 3;
2045
+ bitpos_f |= 24;
2046
+
2047
+ // Read final state variable
2048
+ params.state_4 = bits_f & L_mask;
2049
+ bits_f >>= L_bits, bitpos_f -= L_bits;
2050
+
2051
+ params.bits_f = bits_f;
2052
+ params.ptr_f = src - (bitpos_f >> 3);
2053
+ params.bitpos_f = bitpos_f & 7;
2054
+
2055
+ params.bits_b = bits_b;
2056
+ params.ptr_b = src_end + (bitpos_b >> 3);
2057
+ params.bitpos_b = bitpos_b & 7;
2058
+
2059
+ if (!Tans_Decode(&params))
2060
+ return -1;
2061
+
2062
+ return src_size;
2063
+ }
2064
+
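+ // Parses an entropy-chunk header without decoding the payload. The low 3
+ // bits of the first byte's top nibble give the chunk type (0 = stored); for
+ // other types, bit 7 of the first byte selects the short 3-byte header with
+ // 10-bit size fields or the long 5-byte header with 18-bit size fields.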
2065
+ int Kraken_GetBlockSize(const uint8 *src, const uint8 *src_end, int *dest_size, int dest_capacity) {
2066
+ const byte *src_org = src;
2067
+ int src_size, dst_size;
2068
+
2069
+ if (src_end - src < 2)
2070
+ return -1; // too few bytes
2071
+
2072
+ int chunk_type = (src[0] >> 4) & 0x7;
2073
+ if (chunk_type == 0) {
2074
+ if (src[0] >= 0x80) {
2075
+ // Stored (memcpy) block, short form: the length is in the bottom 12 bits.
2076
+ src_size = ((src[0] << 8) | src[1]) & 0xFFF;
2077
+ src += 2;
2078
+ } else {
2079
+ if (src_end - src < 3)
2080
+ return -1; // too few bytes
2081
+ src_size = ((src[0] << 16) | (src[1] << 8) | src[2]);
2082
+ if (src_size & ~0x3ffff)
2083
+ return -1; // reserved bits must not be set
2084
+ src += 3;
2085
+ }
2086
+ if (src_size > dest_capacity || src_end - src < src_size)
2087
+ return -1;
2088
+ *dest_size = src_size;
2089
+ return src + src_size - src_org;
2090
+ }
2091
+
2092
+ if (chunk_type >= 6)
2093
+ return -1;
2094
+
2095
+ // In all the other modes, the initial bytes encode
2096
+ // the src_size and the dst_size
2097
+ if (src[0] >= 0x80) {
2098
+ if (src_end - src < 3)
2099
+ return -1; // too few bytes
2100
+
2101
+ // short mode, 10 bit sizes
2102
+ uint32 bits = ((src[0] << 16) | (src[1] << 8) | src[2]);
2103
+ src_size = bits & 0x3ff;
2104
+ dst_size = src_size + ((bits >> 10) & 0x3ff) + 1;
2105
+ src += 3;
2106
+ } else {
2107
+ // long mode, 18 bit sizes
2108
+ if (src_end - src < 5)
2109
+ return -1; // too few bytes
2110
+ uint32 bits = ((src[1] << 24) | (src[2] << 16) | (src[3] << 8) | src[4]);
2111
+ src_size = bits & 0x3ffff;
2112
+ dst_size = (((bits >> 18) | (src[0] << 14)) & 0x3FFFF) + 1;
2113
+ if (src_size >= dst_size)
2114
+ return -1;
2115
+ src += 5;
2116
+ }
2117
+ if (src_end - src < src_size || dst_size > dest_capacity)
2118
+ return -1;
2119
+ *dest_size = dst_size;
2120
+ return src_size;
2121
+ }
2122
+
2123
+
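+ // Decodes one entropy chunk into *output (for stored chunks, *output may be
+ // pointed directly at the source bytes unless force_memmove is set). The
+ // chunk type selects the decoder in the switch below: 1 = tANS, 2/4 = the
+ // two Kraken_DecodeBytes_Type12 variants, 3 = RLE, 5 = recursive.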
2124
+ int Kraken_DecodeBytes(byte **output, const byte *src, const byte *src_end, int *decoded_size, size_t output_size, bool force_memmove, uint8 *scratch, uint8 *scratch_end) {
2125
+ const byte *src_org = src;
2126
+ int src_size, dst_size;
2127
+
2128
+ if (src_end - src < 2)
2129
+ return -1; // too few bytes
2130
+
2131
+ int chunk_type = (src[0] >> 4) & 0x7;
2132
+ if (chunk_type == 0) {
2133
+ if (src[0] >= 0x80) {
2134
+ // Stored (memcpy) block, short form: the length is in the bottom 12 bits.
2135
+ src_size = ((src[0] << 8) | src[1]) & 0xFFF;
2136
+ src += 2;
2137
+ } else {
2138
+ if (src_end - src < 3)
2139
+ return -1; // too few bytes
2140
+ src_size = ((src[0] << 16) | (src[1] << 8) | src[2]);
2141
+ if (src_size & ~0x3ffff)
2142
+ return -1; // reserved bits must not be set
2143
+ src += 3;
2144
+ }
2145
+ if (src_size > output_size || src_end - src < src_size)
2146
+ return -1;
2147
+ *decoded_size = src_size;
2148
+ if (force_memmove)
2149
+ memmove(*output, src, src_size);
2150
+ else
2151
+ *output = (byte*)src;
2152
+ return src + src_size - src_org;
2153
+ }
2154
+
2155
+ // In all the other modes, the initial bytes encode
2156
+ // the src_size and the dst_size
2157
+ if (src[0] >= 0x80) {
2158
+ if (src_end - src < 3)
2159
+ return -1; // too few bytes
2160
+
2161
+ // short mode, 10 bit sizes
2162
+ uint32 bits = ((src[0] << 16) | (src[1] << 8) | src[2]);
2163
+ src_size = bits & 0x3ff;
2164
+ dst_size = src_size + ((bits >> 10) & 0x3ff) + 1;
2165
+ src += 3;
2166
+ } else {
2167
+ // long mode, 18 bit sizes
2168
+ if (src_end - src < 5)
2169
+ return -1; // too few bytes
2170
+ uint32 bits = ((src[1] << 24) | (src[2] << 16) | (src[3] << 8) | src[4]);
2171
+ src_size = bits & 0x3ffff;
2172
+ dst_size = (((bits >> 18) | (src[0] << 14)) & 0x3FFFF) + 1;
2173
+ if (src_size >= dst_size)
2174
+ return -1;
2175
+ src += 5;
2176
+ }
2177
+ if (src_end - src < src_size || dst_size > output_size)
2178
+ return -1;
2179
+
2180
+ uint8 *dst = *output;
2181
+ if (dst == scratch) {
2182
+ if (scratch_end - scratch < dst_size)
2183
+ return -1;
2184
+ scratch += dst_size;
2185
+ }
2186
+
2187
+ // printf("%d -> %d (%d)\n", src_size, dst_size, chunk_type);
2188
+
2189
+ int src_used = -1;
2190
+ switch (chunk_type) {
2191
+ case 2:
2192
+ case 4:
2193
+ src_used = Kraken_DecodeBytes_Type12(src, src_size, dst, dst_size, chunk_type >> 1);
2194
+ break;
2195
+ case 5:
2196
+ src_used = Krak_DecodeRecursive(src, src_size, dst, dst_size, scratch, scratch_end);
2197
+ break;
2198
+ case 3:
2199
+ src_used = Krak_DecodeRLE(src, src_size, dst, dst_size, scratch, scratch_end);
2200
+ break;
2201
+ case 1:
2202
+ src_used = Krak_DecodeTans(src, src_size, dst, dst_size, scratch, scratch_end);
2203
+ break;
2204
+ }
2205
+ if (src_used != src_size)
2206
+ return -1;
2207
+ *decoded_size = dst_size;
2208
+ return src + src_size - src_org;
2209
+ }
2210
+
2211
+ void CombineScaledOffsetArrays(int *offs_stream, size_t offs_stream_size, int scale, const uint8 *low_bits) {
2212
+ for (size_t i = 0; i != offs_stream_size; i++)
2213
+ offs_stream[i] = scale * offs_stream[i] - low_bits[i];
2214
+ }
2215
+
2216
+ // Unpacks the packed 8-bit offsets and lengths into 32-bit values.
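+ // The offset and length bits are packed into one region that is read from
+ // both ends: bits_a reads forwards from src, bits_b reads backwards from
+ // src_end, and the two cursors must meet exactly. A packed litlen byte of
+ // 255 is an escape that takes an extra 32-bit length from the length stream.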
2217
+ bool Kraken_UnpackOffsets(const byte *src, const byte *src_end,
2218
+ const byte *packed_offs_stream, const byte *packed_offs_stream_extra, int packed_offs_stream_size,
2219
+ int multi_dist_scale,
2220
+ const byte *packed_litlen_stream, int packed_litlen_stream_size,
2221
+ int *offs_stream, int *len_stream,
2222
+ bool excess_flag, int excess_bytes) {
2223
+
2224
+
2225
+ BitReader bits_a, bits_b;
2226
+ int n, i;
2227
+ int u32_len_stream_size = 0;
2228
+
2229
+ bits_a.bitpos = 24;
2230
+ bits_a.bits = 0;
2231
+ bits_a.p = src;
2232
+ bits_a.p_end = src_end;
2233
+ BitReader_Refill(&bits_a);
2234
+
2235
+ bits_b.bitpos = 24;
2236
+ bits_b.bits = 0;
2237
+ bits_b.p = src_end;
2238
+ bits_b.p_end = src;
2239
+ BitReader_RefillBackwards(&bits_b);
2240
+
2241
+ if (!excess_flag) {
2242
+ if (bits_b.bits < 0x2000)
2243
+ return false;
2244
+ n = 31 - BSR(bits_b.bits);
2245
+ bits_b.bitpos += n;
2246
+ bits_b.bits <<= n;
2247
+ BitReader_RefillBackwards(&bits_b);
2248
+ n++;
2249
+ u32_len_stream_size = (bits_b.bits >> (32 - n)) - 1;
2250
+ bits_b.bitpos += n;
2251
+ bits_b.bits <<= n;
2252
+ BitReader_RefillBackwards(&bits_b);
2253
+ }
2254
+
2255
+ if (multi_dist_scale == 0) {
2256
+ // Traditional way of coding offsets
2257
+ const uint8 *packed_offs_stream_end = packed_offs_stream + packed_offs_stream_size;
2258
+ while (packed_offs_stream != packed_offs_stream_end) {
2259
+ *offs_stream++ = -(int32)BitReader_ReadDistance(&bits_a, *packed_offs_stream++);
2260
+ if (packed_offs_stream == packed_offs_stream_end)
2261
+ break;
2262
+ *offs_stream++ = -(int32)BitReader_ReadDistanceB(&bits_b, *packed_offs_stream++);
2263
+ }
2264
+ } else {
2265
+ // New way of coding offsets
2266
+ int *offs_stream_org = offs_stream;
2267
+ const uint8 *packed_offs_stream_end = packed_offs_stream + packed_offs_stream_size;
2268
+ uint32 cmd, offs;
2269
+ while (packed_offs_stream != packed_offs_stream_end) {
2270
+ cmd = *packed_offs_stream++;
2271
+ if ((cmd >> 3) > 26)
2272
+ return 0;
2273
+ offs = ((8 + (cmd & 7)) << (cmd >> 3)) | BitReader_ReadMoreThan24Bits(&bits_a, (cmd >> 3));
2274
+ *offs_stream++ = 8 - (int32)offs;
2275
+ if (packed_offs_stream == packed_offs_stream_end)
2276
+ break;
2277
+ cmd = *packed_offs_stream++;
2278
+ if ((cmd >> 3) > 26)
2279
+ return 0;
2280
+ offs = ((8 + (cmd & 7)) << (cmd >> 3)) | BitReader_ReadMoreThan24BitsB(&bits_b, (cmd >> 3));
2281
+ *offs_stream++ = 8 - (int32)offs;
2282
+ }
2283
+ if (multi_dist_scale != 1) {
2284
+ CombineScaledOffsetArrays(offs_stream_org, offs_stream - offs_stream_org, multi_dist_scale, packed_offs_stream_extra);
2285
+ }
2286
+ }
2287
+ uint32 u32_len_stream_buf[512]; // max count is 128kb / 256 = 512
2288
+ if (u32_len_stream_size > 512)
2289
+ return false;
2290
+
2291
+ uint32 *u32_len_stream = u32_len_stream_buf,
2292
+ *u32_len_stream_end = u32_len_stream_buf + u32_len_stream_size;
2293
+ for (i = 0; i + 1 < u32_len_stream_size; i += 2) {
2294
+ if (!BitReader_ReadLength(&bits_a, &u32_len_stream[i + 0]))
2295
+ return false;
2296
+ if (!BitReader_ReadLengthB(&bits_b, &u32_len_stream[i + 1]))
2297
+ return false;
2298
+ }
2299
+ if (i < u32_len_stream_size) {
2300
+ if (!BitReader_ReadLength(&bits_a, &u32_len_stream[i + 0]))
2301
+ return false;
2302
+ }
2303
+
2304
+ bits_a.p -= (24 - bits_a.bitpos) >> 3;
2305
+ bits_b.p += (24 - bits_b.bitpos) >> 3;
2306
+
2307
+ if (bits_a.p != bits_b.p)
2308
+ return false;
2309
+
2310
+ for (i = 0; i < packed_litlen_stream_size; i++) {
2311
+ uint32 v = packed_litlen_stream[i];
2312
+ if (v == 255)
2313
+ v = *u32_len_stream++ + 255;
2314
+ len_stream[i] = v + 3;
2315
+ }
2316
+ if (u32_len_stream != u32_len_stream_end)
2317
+ return false;
2318
+
2319
+ return true;
2320
+ }
2321
+ bool Kraken_ReadLzTable(int mode,
2322
+ const byte *src, const byte *src_end,
2323
+ byte *dst, int dst_size, int offset,
2324
+ byte *scratch, byte *scratch_end, KrakenLzTable *lztable) {
2325
+ byte *out;
2326
+ int decode_count, n;
2327
+ byte *packed_offs_stream, *packed_len_stream;
2328
+
2329
+ if (mode > 1)
2330
+ return false;
2331
+
2332
+ if (src_end - src < 13)
2333
+ return false;
2334
+
2335
+ if (offset == 0) {
2336
+ COPY_64(dst, src);
2337
+ dst += 8;
2338
+ src += 8;
2339
+ }
2340
+
2341
+ if (*src & 0x80) {
2342
+ uint8 flag = *src++;
2343
+ if ((flag & 0xc0) != 0x80)
2344
+ return false; // reserved flag set
2345
+
2346
+ return false; // excess bytes not supported
2347
+ }
2348
+
2349
+ // Disable the no-copy optimization if the source and destination buffers overlap
2350
+ bool force_copy = dst <= src_end && src <= dst + dst_size;
2351
+
2352
+ // Decode lit stream, bounded by dst_size
2353
+ out = scratch;
2354
+ n = Kraken_DecodeBytes(&out, src, src_end, &decode_count, Min(scratch_end - scratch, dst_size),
2355
+ force_copy, scratch, scratch_end);
2356
+ if (n < 0)
2357
+ return false;
2358
+ src += n;
2359
+ lztable->lit_stream = out;
2360
+ lztable->lit_stream_size = decode_count;
2361
+ scratch += decode_count;
2362
+
2363
+ // Decode command stream, bounded by dst_size
2364
+ out = scratch;
2365
+ n = Kraken_DecodeBytes(&out, src, src_end, &decode_count, Min(scratch_end - scratch, dst_size),
2366
+ force_copy, scratch, scratch_end);
2367
+ if (n < 0)
2368
+ return false;
2369
+ src += n;
2370
+ lztable->cmd_stream = out;
2371
+ lztable->cmd_stream_size = decode_count;
2372
+ scratch += decode_count;
2373
+
2374
+ // Check whether the offsets use the two-table (scaled) encoding
2375
+ if (src_end - src < 3)
2376
+ return false;
2377
+
2378
+ int offs_scaling = 0;
2379
+ uint8 *packed_offs_stream_extra = NULL;
2380
+
2381
+ if (src[0] & 0x80) {
2382
+ // uses the mode where distances are coded with 2 tables
2383
+ offs_scaling = src[0] - 127;
2384
+ src++;
2385
+
2386
+ packed_offs_stream = scratch;
2387
+ n = Kraken_DecodeBytes(&packed_offs_stream, src, src_end, &lztable->offs_stream_size,
2388
+ Min(scratch_end - scratch, lztable->cmd_stream_size), false, scratch, scratch_end);
2389
+ if (n < 0)
2390
+ return false;
2391
+ src += n;
2392
+ scratch += lztable->offs_stream_size;
2393
+
2394
+ if (offs_scaling != 1) {
2395
+ packed_offs_stream_extra = scratch;
2396
+ n = Kraken_DecodeBytes(&packed_offs_stream_extra, src, src_end, &decode_count,
2397
+ Min(scratch_end - scratch, lztable->offs_stream_size), false, scratch, scratch_end);
2398
+ if (n < 0 || decode_count != lztable->offs_stream_size)
2399
+ return false;
2400
+ src += n;
2401
+ scratch += decode_count;
2402
+ }
2403
+ } else {
2404
+ // Decode packed offset stream, it's bounded by the command length.
2405
+ packed_offs_stream = scratch;
2406
+ n = Kraken_DecodeBytes(&packed_offs_stream, src, src_end, &lztable->offs_stream_size,
2407
+ Min(scratch_end - scratch, lztable->cmd_stream_size), false, scratch, scratch_end);
2408
+ if (n < 0)
2409
+ return false;
2410
+ src += n;
2411
+ scratch += lztable->offs_stream_size;
2412
+ }
2413
+
2414
+ // Decode packed litlen stream. It's bounded by 1/4 of dst_size.
2415
+ packed_len_stream = scratch;
2416
+ n = Kraken_DecodeBytes(&packed_len_stream, src, src_end, &lztable->len_stream_size,
2417
+ Min(scratch_end - scratch, dst_size >> 2), false, scratch, scratch_end);
2418
+ if (n < 0)
2419
+ return false;
2420
+ src += n;
2421
+ scratch += lztable->len_stream_size;
2422
+
2423
+ // Reserve memory for final dist stream
2424
+ scratch = ALIGN_POINTER(scratch, 16);
2425
+ lztable->offs_stream = (int*)scratch;
2426
+ scratch += lztable->offs_stream_size * 4;
2427
+
2428
+ // Reserve memory for final len stream
2429
+ scratch = ALIGN_POINTER(scratch, 16);
2430
+ lztable->len_stream = (int*)scratch;
2431
+ scratch += lztable->len_stream_size * 4;
2432
+
2433
+ if (scratch + 64 > scratch_end)
2434
+ return false;
2435
+
2436
+ return Kraken_UnpackOffsets(src, src_end, packed_offs_stream, packed_offs_stream_extra,
2437
+ lztable->offs_stream_size, offs_scaling,
2438
+ packed_len_stream, lztable->len_stream_size,
2439
+ lztable->offs_stream, lztable->len_stream, 0, 0);
2440
+ }
2441
+
2442
+
2443
+ // Note: may access memory out of bounds on invalid input.
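+ // Command byte layout, as decoded in the loop below:
+ //   bits 0..1 : literal run length (3 = take a length from len_stream)
+ //   bits 2..5 : match length - 2   (15 = take 14 + a value from len_stream)
+ //   bits 6..7 : offset selector; 0..2 pick a recent offset, 3 consumes a
+ //               new offset from offs_stream.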
2444
+ bool Kraken_ProcessLzRuns_Type0(KrakenLzTable *lzt, byte *dst, byte *dst_end, byte *dst_start) {
2445
+ const byte *cmd_stream = lzt->cmd_stream,
2446
+ *cmd_stream_end = cmd_stream + lzt->cmd_stream_size;
2447
+ const int *len_stream = lzt->len_stream;
2448
+ const int *len_stream_end = lzt->len_stream + lzt->len_stream_size;
2449
+ const byte *lit_stream = lzt->lit_stream;
2450
+ const byte *lit_stream_end = lzt->lit_stream + lzt->lit_stream_size;
2451
+ const int *offs_stream = lzt->offs_stream;
2452
+ const int *offs_stream_end = lzt->offs_stream + lzt->offs_stream_size;
2453
+ const byte *copyfrom;
2454
+ uint32 final_len;
2455
+ int32 offset;
2456
+ int32 recent_offs[7];
2457
+ int32 last_offset;
2458
+
2459
+ recent_offs[3] = -8;
2460
+ recent_offs[4] = -8;
2461
+ recent_offs[5] = -8;
2462
+ last_offset = -8;
2463
+
2464
+ while (cmd_stream < cmd_stream_end) {
2465
+ uint32 f = *cmd_stream++;
2466
+ uint32 litlen = f & 3;
2467
+ uint32 offs_index = f >> 6;
2468
+ uint32 matchlen = (f >> 2) & 0xF;
2469
+
2470
+ // use cmov
2471
+ uint32 next_long_length = *len_stream;
2472
+ const int *next_len_stream = len_stream + 1;
2473
+
2474
+ len_stream = (litlen == 3) ? next_len_stream : len_stream;
2475
+ litlen = (litlen == 3) ? next_long_length : litlen;
2476
+ recent_offs[6] = *offs_stream;
2477
+
2478
+ COPY_64_ADD(dst, lit_stream, &dst[last_offset]);
2479
+ if (litlen > 8) {
2480
+ COPY_64_ADD(dst + 8, lit_stream + 8, &dst[last_offset + 8]);
2481
+ if (litlen > 16) {
2482
+ COPY_64_ADD(dst + 16, lit_stream + 16, &dst[last_offset + 16]);
2483
+ if (litlen > 24) {
2484
+ do {
2485
+ COPY_64_ADD(dst + 24, lit_stream + 24, &dst[last_offset + 24]);
2486
+ litlen -= 8;
2487
+ dst += 8;
2488
+ lit_stream += 8;
2489
+ } while (litlen > 24);
2490
+ }
2491
+ }
2492
+ }
2493
+ dst += litlen;
2494
+ lit_stream += litlen;
2495
+
2496
+ offset = recent_offs[offs_index + 3];
2497
+ recent_offs[offs_index + 3] = recent_offs[offs_index + 2];
2498
+ recent_offs[offs_index + 2] = recent_offs[offs_index + 1];
2499
+ recent_offs[offs_index + 1] = recent_offs[offs_index + 0];
2500
+ recent_offs[3] = offset;
2501
+ last_offset = offset;
2502
+
2503
+ offs_stream = (int*)((intptr_t)offs_stream + ((offs_index + 1) & 4));
2504
+
2505
+ if ((uintptr_t)offset < (uintptr_t)(dst_start - dst))
2506
+ return false; // offset out of bounds
2507
+
2508
+ copyfrom = dst + offset;
2509
+ if (matchlen != 15) {
2510
+ COPY_64(dst, copyfrom);
2511
+ COPY_64(dst + 8, copyfrom + 8);
2512
+ dst += matchlen + 2;
2513
+ } else {
2514
+ matchlen = 14 + *len_stream++; // why is the value not 16 here? The case above copies up to 16 bytes.
2515
+ if ((uintptr_t)matchlen >(uintptr_t)(dst_end - dst))
2516
+ return false; // copy length out of bounds
2517
+ COPY_64(dst, copyfrom);
2518
+ COPY_64(dst + 8, copyfrom + 8);
2519
+ COPY_64(dst + 16, copyfrom + 16);
2520
+ do {
2521
+ COPY_64(dst + 24, copyfrom + 24);
2522
+ matchlen -= 8;
2523
+ dst += 8;
2524
+ copyfrom += 8;
2525
+ } while (matchlen > 24);
2526
+ dst += matchlen;
2527
+ }
2528
+ }
2529
+
2530
+ // check for incorrect input
2531
+ if (offs_stream != offs_stream_end || len_stream != len_stream_end)
2532
+ return false;
2533
+
2534
+ final_len = dst_end - dst;
2535
+ if (final_len != lit_stream_end - lit_stream)
2536
+ return false;
2537
+
2538
+ if (final_len >= 8) {
2539
+ do {
2540
+ COPY_64_ADD(dst, lit_stream, &dst[last_offset]);
2541
+ dst += 8, lit_stream += 8, final_len -= 8;
2542
+ } while (final_len >= 8);
2543
+ }
2544
+ if (final_len > 0) {
2545
+ do {
2546
+ *dst = *lit_stream++ + dst[last_offset];
2547
+ } while (dst++, --final_len);
2548
+ }
2549
+ return true;
2550
+ }
2551
+
2552
+
2553
+ // Note: may access memory out of bounds on invalid input.
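+ // Same command format as Type0, but literals are copied raw; Type0 treats
+ // them as deltas added to the byte at the last offset (COPY_64 vs COPY_64_ADD).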
2554
+ bool Kraken_ProcessLzRuns_Type1(KrakenLzTable *lzt, byte *dst, byte *dst_end, byte *dst_start) {
2555
+ const byte *cmd_stream = lzt->cmd_stream,
2556
+ *cmd_stream_end = cmd_stream + lzt->cmd_stream_size;
2557
+ const int *len_stream = lzt->len_stream;
2558
+ const int *len_stream_end = lzt->len_stream + lzt->len_stream_size;
2559
+ const byte *lit_stream = lzt->lit_stream;
2560
+ const byte *lit_stream_end = lzt->lit_stream + lzt->lit_stream_size;
2561
+ const int *offs_stream = lzt->offs_stream;
2562
+ const int *offs_stream_end = lzt->offs_stream + lzt->offs_stream_size;
2563
+ const byte *copyfrom;
2564
+ uint32 final_len;
2565
+ int32 offset;
2566
+ int32 recent_offs[7];
2567
+
2568
+ recent_offs[3] = -8;
2569
+ recent_offs[4] = -8;
2570
+ recent_offs[5] = -8;
2571
+
2572
+ while (cmd_stream < cmd_stream_end) {
2573
+ uint32 f = *cmd_stream++;
2574
+ uint32 litlen = f & 3;
2575
+ uint32 offs_index = f >> 6;
2576
+ uint32 matchlen = (f >> 2) & 0xF;
2577
+
2578
+ // use cmov
2579
+ uint32 next_long_length = *len_stream;
2580
+ const int *next_len_stream = len_stream + 1;
2581
+
2582
+ len_stream = (litlen == 3) ? next_len_stream : len_stream;
2583
+ litlen = (litlen == 3) ? next_long_length : litlen;
2584
+ recent_offs[6] = *offs_stream;
2585
+
2586
+ COPY_64(dst, lit_stream);
2587
+ if (litlen > 8) {
2588
+ COPY_64(dst + 8, lit_stream + 8);
2589
+ if (litlen > 16) {
2590
+ COPY_64(dst + 16, lit_stream + 16);
2591
+ if (litlen > 24) {
2592
+ do {
2593
+ COPY_64(dst + 24, lit_stream + 24);
2594
+ litlen -= 8;
2595
+ dst += 8;
2596
+ lit_stream += 8;
2597
+ } while (litlen > 24);
2598
+ }
2599
+ }
2600
+ }
2601
+ dst += litlen;
2602
+ lit_stream += litlen;
2603
+
2604
+ offset = recent_offs[offs_index + 3];
2605
+ recent_offs[offs_index + 3] = recent_offs[offs_index + 2];
2606
+ recent_offs[offs_index + 2] = recent_offs[offs_index + 1];
2607
+ recent_offs[offs_index + 1] = recent_offs[offs_index + 0];
2608
+ recent_offs[3] = offset;
2609
+
2610
+ offs_stream = (int*)((intptr_t)offs_stream + ((offs_index + 1) & 4));
2611
+
2612
+ if ((uintptr_t)offset < (uintptr_t)(dst_start - dst))
2613
+ return false; // offset out of bounds
2614
+
2615
+ copyfrom = dst + offset;
2616
+ if (matchlen != 15) {
2617
+ COPY_64(dst, copyfrom);
2618
+ COPY_64(dst + 8, copyfrom + 8);
2619
+ dst += matchlen + 2;
2620
+ } else {
2621
+ matchlen = 14 + *len_stream++; // why is the value not 16 here? The case above copies up to 16 bytes.
2622
+ if ((uintptr_t)matchlen > (uintptr_t)(dst_end - dst))
2623
+ return false; // copy length out of bounds
2624
+ COPY_64(dst, copyfrom);
2625
+ COPY_64(dst + 8, copyfrom + 8);
2626
+ COPY_64(dst + 16, copyfrom + 16);
2627
+ do {
2628
+ COPY_64(dst + 24, copyfrom + 24);
2629
+ matchlen -= 8;
2630
+ dst += 8;
2631
+ copyfrom += 8;
2632
+ } while (matchlen > 24);
2633
+ dst += matchlen;
2634
+ }
2635
+ }
2636
+
2637
+ // check for incorrect input
2638
+ if (offs_stream != offs_stream_end || len_stream != len_stream_end)
2639
+ return false;
2640
+
2641
+ final_len = dst_end - dst;
2642
+ if (final_len != lit_stream_end - lit_stream)
2643
+ return false;
2644
+
2645
+ if (final_len >= 64) {
2646
+ do {
2647
+ COPY_64_BYTES(dst, lit_stream);
2648
+ dst += 64, lit_stream += 64, final_len -= 64;
2649
+ } while (final_len >= 64);
2650
+ }
2651
+ if (final_len >= 8) {
2652
+ do {
2653
+ COPY_64(dst, lit_stream);
2654
+ dst += 8, lit_stream += 8, final_len -= 8;
2655
+ } while (final_len >= 8);
2656
+ }
2657
+ if (final_len > 0) {
2658
+ do {
2659
+ *dst++ = *lit_stream++;
2660
+ } while (--final_len);
2661
+ }
2662
+ return true;
2663
+ }
2664
+
2665
+ bool Kraken_ProcessLzRuns(int mode, byte *dst, int dst_size, int offset, KrakenLzTable *lztable) {
2666
+ byte *dst_end = dst + dst_size;
2667
+
2668
+ if (mode == 1)
2669
+ return Kraken_ProcessLzRuns_Type1(lztable, dst + (offset == 0 ? 8 : 0), dst_end, dst - offset);
2670
+
2671
+ if (mode == 0)
2672
+ return Kraken_ProcessLzRuns_Type0(lztable, dst + (offset == 0 ? 8 : 0), dst_end, dst - offset);
2673
+
2674
+
2675
+ return false;
2676
+ }
2677
+
2678
+ // Decode one 256kb quantum block. It's divided into two 128k blocks
2679
+ // internally that are compressed separately but with a shared history.
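+ // Each 128k sub-block begins with a 3-byte big-endian header: bit 23 set
+ // means LZ-compressed (bits 19..22 = mode, bits 0..18 = compressed size);
+ // bit 23 clear means the block is a plain entropy chunk.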
2680
+ int Kraken_DecodeQuantum(byte *dst, byte *dst_end, byte *dst_start,
2681
+ const byte *src, const byte *src_end,
2682
+ byte *scratch, byte *scratch_end) {
2683
+ const byte *src_in = src;
2684
+ int mode, chunkhdr, dst_count, src_used, written_bytes;
2685
+
2686
+ while (dst_end - dst != 0) {
2687
+ dst_count = dst_end - dst;
2688
+ if (dst_count > 0x20000) dst_count = 0x20000;
2689
+ if (src_end - src < 4)
2690
+ return -1;
2691
+ chunkhdr = src[2] | src[1] << 8 | src[0] << 16;
2692
+ if (!(chunkhdr & 0x800000)) {
2693
+ // Stored as entropy without any match copying.
2694
+ byte *out = dst;
2695
+ src_used = Kraken_DecodeBytes(&out, src, src_end, &written_bytes, dst_count, false, scratch, scratch_end);
2696
+ if (src_used < 0 || written_bytes != dst_count)
2697
+ return -1;
2698
+ } else {
2699
+ src += 3;
2700
+ src_used = chunkhdr & 0x7FFFF;
2701
+ mode = (chunkhdr >> 19) & 0xF;
2702
+ if (src_end - src < src_used)
2703
+ return -1;
2704
+ if (src_used < dst_count) {
2705
+ size_t scratch_usage = Min(Min(3 * dst_count + 32 + 0xd000, 0x6C000), scratch_end - scratch);
2706
+ if (scratch_usage < sizeof(KrakenLzTable))
2707
+ return -1;
2708
+ if (!Kraken_ReadLzTable(mode,
2709
+ src, src + src_used,
2710
+ dst, dst_count,
2711
+ dst - dst_start,
2712
+ scratch + sizeof(KrakenLzTable), scratch + scratch_usage,
2713
+ (KrakenLzTable*)scratch))
2714
+ return -1;
2715
+ if (!Kraken_ProcessLzRuns(mode, dst, dst_count, dst - dst_start, (KrakenLzTable*)scratch))
2716
+ return -1;
2717
+ } else if (src_used > dst_count || mode != 0) {
2718
+ return -1;
2719
+ } else {
2720
+ memmove(dst, src, dst_count);
2721
+ }
2722
+ }
2723
+ src += src_used;
2724
+ dst += dst_count;
2725
+ }
2726
+ return src - src_in;
2727
+ }
2728
+
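+ // Like KrakenLzTable, but Leviathan may split the literals over up to 16
+ // streams (selected per output position or by the previous byte, depending
+ // on chunk_type) and, in multi-cmd mode, the command bytes over 8 streams
+ // selected by the low 3 bits of the destination pointer.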
2729
+ struct LeviathanLzTable {
2730
+ int *offs_stream;
2731
+ int offs_stream_size;
2732
+ int *len_stream;
2733
+ int len_stream_size;
2734
+ uint8 *lit_stream[16];
2735
+ int lit_stream_size[16];
2736
+ int lit_stream_total;
2737
+ uint8 *multi_cmd_ptr[8];
2738
+ uint8 *multi_cmd_end[8];
2739
+ uint8 *cmd_stream;
2740
+ int cmd_stream_size;
2741
+ };
2742
+
2743
+ bool Leviathan_ReadLzTable(int chunk_type,
2744
+ const byte *src, const byte *src_end,
2745
+ byte *dst, int dst_size, int offset,
2746
+ byte *scratch, byte *scratch_end, LeviathanLzTable *lztable) {
2747
+ byte *packed_offs_stream, *packed_len_stream, *out;
2748
+ int decode_count, n;
2749
+
2750
+ if (chunk_type > 5)
2751
+ return false;
2752
+
2753
+ if (src_end - src < 13)
2754
+ return false;
2755
+
2756
+ if (offset == 0) {
2757
+ COPY_64(dst, src);
2758
+ dst += 8;
2759
+ src += 8;
2760
+ }
2761
+
2762
+ int offs_scaling = 0;
2763
+ uint8 *packed_offs_stream_extra = NULL;
2764
+
2765
+
2766
+ int offs_stream_limit = dst_size / 3;
2767
+
2768
+ if (!(src[0] & 0x80)) {
2769
+ // Decode packed offset stream, it's bounded by the command length.
2770
+ packed_offs_stream = scratch;
2771
+ n = Kraken_DecodeBytes(&packed_offs_stream, src, src_end, &lztable->offs_stream_size,
2772
+ Min(scratch_end - scratch, offs_stream_limit), false, scratch, scratch_end);
2773
+ if (n < 0)
2774
+ return false;
2775
+ src += n;
2776
+ scratch += lztable->offs_stream_size;
2777
+ } else {
2778
+ // uses the mode where distances are coded with 2 tables
2779
+ // and the transformation offs * scaling + low_bits
2780
+ offs_scaling = src[0] - 127;
2781
+ src++;
2782
+
2783
+ packed_offs_stream = scratch;
2784
+ n = Kraken_DecodeBytes(&packed_offs_stream, src, src_end, &lztable->offs_stream_size,
2785
+ Min(scratch_end - scratch, offs_stream_limit), false, scratch, scratch_end);
2786
+ if (n < 0)
2787
+ return false;
2788
+ src += n;
2789
+ scratch += lztable->offs_stream_size;
2790
+
2791
+ if (offs_scaling != 1) {
2792
+ packed_offs_stream_extra = scratch;
2793
+ n = Kraken_DecodeBytes(&packed_offs_stream_extra, src, src_end, &decode_count,
2794
+ Min(scratch_end - scratch, offs_stream_limit), false, scratch, scratch_end);
2795
+ if (n < 0 || decode_count != lztable->offs_stream_size)
2796
+ return false;
2797
+ src += n;
2798
+ scratch += decode_count;
2799
+ }
2800
+ }
2801
+
2802
+ // Decode packed litlen stream. It's bounded by 1/5 of dst_size.
2803
+ packed_len_stream = scratch;
2804
+ n = Kraken_DecodeBytes(&packed_len_stream, src, src_end, &lztable->len_stream_size,
2805
+ Min(scratch_end - scratch, dst_size / 5), false, scratch, scratch_end);
2806
+ if (n < 0)
2807
+ return false;
2808
+ src += n;
2809
+ scratch += lztable->len_stream_size;
2810
+
2811
+ // Reserve memory for final dist stream
2812
+ scratch = ALIGN_POINTER(scratch, 16);
2813
+ lztable->offs_stream = (int*)scratch;
2814
+ scratch += lztable->offs_stream_size * 4;
2815
+
2816
+ // Reserve memory for final len stream
2817
+ scratch = ALIGN_POINTER(scratch, 16);
2818
+ lztable->len_stream = (int*)scratch;
2819
+ scratch += lztable->len_stream_size * 4;
2820
+
2821
+ if (scratch > scratch_end)
2822
+ return false;
2823
+
2824
+ if (chunk_type <= 1) {
2825
+ // Decode lit stream, bounded by dst_size
2826
+ out = scratch;
2827
+ n = Kraken_DecodeBytes(&out, src, src_end, &decode_count, Min(scratch_end - scratch, dst_size),
2828
+ true, scratch, scratch_end);
2829
+ if (n < 0)
2830
+ return false;
2831
+ src += n;
2832
+ lztable->lit_stream[0] = out;
2833
+ lztable->lit_stream_size[0] = decode_count;
2834
+ } else {
2835
+ int array_count = (chunk_type == 2) ? 2 :
2836
+ (chunk_type == 3) ? 4 : 16;
2837
+ n = Kraken_DecodeMultiArray(src, src_end, scratch, scratch_end, lztable->lit_stream,
2838
+ lztable->lit_stream_size, array_count, &decode_count,
2839
+ true, scratch, scratch_end);
2840
+ if (n < 0)
2841
+ return false;
2842
+ src += n;
2843
+ }
2844
+ scratch += decode_count;
2845
+ lztable->lit_stream_total = decode_count;
2846
+
2847
+ if (src >= src_end)
2848
+ return false;
2849
+
2850
+ if (!(src[0] & 0x80)) {
2851
+ // Decode command stream, bounded by dst_size
2852
+ out = scratch;
2853
+ n = Kraken_DecodeBytes(&out, src, src_end, &decode_count, Min(scratch_end - scratch, dst_size),
2854
+ true, scratch, scratch_end);
2855
+ if (n < 0)
2856
+ return false;
2857
+ src += n;
2858
+ lztable->cmd_stream = out;
2859
+ lztable->cmd_stream_size = decode_count;
2860
+ scratch += decode_count;
2861
+ } else {
2862
+ if (src[0] != 0x83)
2863
+ return false;
2864
+ src++;
2865
+ int multi_cmd_lens[8];
2866
+ n = Kraken_DecodeMultiArray(src, src_end, scratch, scratch_end, lztable->multi_cmd_ptr,
2867
+ multi_cmd_lens, 8, &decode_count, true, scratch, scratch_end);
2868
+ if (n < 0)
2869
+ return false;
2870
+ src += n;
2871
+ for (size_t i = 0; i < 8; i++)
2872
+ lztable->multi_cmd_end[i] = lztable->multi_cmd_ptr[i] + multi_cmd_lens[i];
2873
+
2874
+ lztable->cmd_stream = NULL;
2875
+ lztable->cmd_stream_size = decode_count;
2876
+ scratch += decode_count;
2877
+ }
2878
+
2879
+ if (dst_size > scratch_end - scratch)
2880
+ return false;
2881
+
2882
+
2883
+ return Kraken_UnpackOffsets(src, src_end, packed_offs_stream, packed_offs_stream_extra,
2884
+ lztable->offs_stream_size, offs_scaling,
2885
+ packed_len_stream, lztable->len_stream_size,
2886
+ lztable->offs_stream, lztable->len_stream, 0, 0);
2887
+ }
2888
+
2889
+ #define finline __forceinline
2890
+
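+ // Literal-copy policies, one per Leviathan chunk_type (see
+ // Leviathan_ProcessLzRuns): 0 = Sub (delta against the last offset),
+ // 1 = Raw, 2 = LamSub (first literal of each run comes from lit_stream[1]),
+ // 3 = SubAnd3 (4 streams picked by dst & 3), 4 = O1 (stream picked by the
+ // previous byte's high nibble), 5 = SubAndF (16 streams picked by dst & 15).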
2891
+ struct LeviathanModeRaw {
2892
+ const uint8 *lit_stream;
2893
+
2894
+ finline LeviathanModeRaw(LeviathanLzTable *lzt, uint8 *dst_start) : lit_stream(lzt->lit_stream[0]) {
2895
+ }
2896
+
2897
+ finline bool CopyLiterals(uint32 cmd, uint8 *&dst, const int *&len_stream, uint8 *match_zone_end, size_t last_offset) {
2898
+ uint32 litlen = (cmd >> 3) & 3;
2899
+ // use cmov
2900
+ uint32 len_stream_value = *len_stream & 0xffffff;
2901
+ const int *next_len_stream = len_stream + 1;
2902
+ len_stream = (litlen == 3) ? next_len_stream : len_stream;
2903
+ litlen = (litlen == 3) ? len_stream_value : litlen;
2904
+ COPY_64(dst, lit_stream);
2905
+ if (litlen > 8) {
2906
+ COPY_64(dst + 8, lit_stream + 8);
2907
+ if (litlen > 16) {
2908
+ COPY_64(dst + 16, lit_stream + 16);
2909
+ if (litlen > 24) {
2910
+ if (litlen > match_zone_end - dst)
2911
+ return false; // out of bounds
2912
+ do {
2913
+ COPY_64(dst + 24, lit_stream + 24);
2914
+ litlen -= 8, dst += 8, lit_stream += 8;
2915
+ } while (litlen > 24);
2916
+ }
2917
+ }
2918
+ }
2919
+ dst += litlen;
2920
+ lit_stream += litlen;
2921
+ return true;
2922
+ }
2923
+
2924
+ finline void CopyFinalLiterals(uint32 final_len, uint8 *&dst, size_t last_offset) {
2925
+ if (final_len >= 64) {
2926
+ do {
2927
+ COPY_64_BYTES(dst, lit_stream);
2928
+ dst += 64, lit_stream += 64, final_len -= 64;
2929
+ } while (final_len >= 64);
2930
+ }
2931
+ if (final_len >= 8) {
2932
+ do {
2933
+ COPY_64(dst, lit_stream);
2934
+ dst += 8, lit_stream += 8, final_len -= 8;
2935
+ } while (final_len >= 8);
2936
+ }
2937
+ if (final_len > 0) {
2938
+ do {
2939
+ *dst++ = *lit_stream++;
2940
+ } while (--final_len);
2941
+ }
2942
+ }
2943
+ };
2944
+
2945
+ struct LeviathanModeSub {
2946
+ const uint8 *lit_stream;
2947
+
2948
+ finline LeviathanModeSub(LeviathanLzTable *lzt, uint8 *dst_start) : lit_stream(lzt->lit_stream[0]) {
2949
+ }
2950
+
2951
+ finline bool CopyLiterals(uint32 cmd, uint8 *&dst, const int *&len_stream, uint8 *match_zone_end, size_t last_offset) {
2952
+ uint32 litlen = (cmd >> 3) & 3;
2953
+ // use cmov
2954
+ uint32 len_stream_value = *len_stream & 0xffffff;
2955
+ const int *next_len_stream = len_stream + 1;
2956
+ len_stream = (litlen == 3) ? next_len_stream : len_stream;
2957
+ litlen = (litlen == 3) ? len_stream_value : litlen;
2958
+ COPY_64_ADD(dst, lit_stream, &dst[last_offset]);
2959
+ if (litlen > 8) {
2960
+ COPY_64_ADD(dst + 8, lit_stream + 8, &dst[last_offset + 8]);
2961
+ if (litlen > 16) {
2962
+ COPY_64_ADD(dst + 16, lit_stream + 16, &dst[last_offset + 16]);
2963
+ if (litlen > 24) {
2964
+ if (litlen > match_zone_end - dst)
2965
+ return false; // out of bounds
2966
+ do {
2967
+ COPY_64_ADD(dst + 24, lit_stream + 24, &dst[last_offset + 24]);
2968
+ litlen -= 8, dst += 8, lit_stream += 8;
2969
+ } while (litlen > 24);
2970
+ }
2971
+ }
2972
+ }
2973
+ dst += litlen;
2974
+ lit_stream += litlen;
2975
+ return true;
2976
+ }
2977
+
2978
+ finline void CopyFinalLiterals(uint32 final_len, uint8 *&dst, size_t last_offset) {
2979
+ if (final_len >= 8) {
2980
+ do {
2981
+ COPY_64_ADD(dst, lit_stream, &dst[last_offset]);
2982
+ dst += 8, lit_stream += 8, final_len -= 8;
2983
+ } while (final_len >= 8);
2984
+ }
2985
+ if (final_len > 0) {
2986
+ do {
2987
+ *dst = *lit_stream++ + dst[last_offset];
2988
+ } while (dst++, --final_len);
2989
+ }
2990
+ }
2991
+ };
2992
+
2993
+ struct LeviathanModeLamSub {
2994
+ const uint8 *lit_stream, *lam_lit_stream;
2995
+
2996
+ finline LeviathanModeLamSub(LeviathanLzTable *lzt, uint8 *dst_start)
2997
+ : lit_stream(lzt->lit_stream[0]),
2998
+ lam_lit_stream(lzt->lit_stream[1]) {
2999
+ }
3000
+
3001
+ finline bool CopyLiterals(uint32 cmd, uint8 *&dst, const int *&len_stream, uint8 *match_zone_end, size_t last_offset) {
3002
+ uint32 lit_cmd = cmd & 0x18;
3003
+ if (!lit_cmd)
3004
+ return true;
3005
+
3006
+ uint32 litlen = lit_cmd >> 3;
3007
+ // use cmov
3008
+ uint32 len_stream_value = *len_stream & 0xffffff;
3009
+ const int *next_len_stream = len_stream + 1;
3010
+ len_stream = (litlen == 3) ? next_len_stream : len_stream;
3011
+ litlen = (litlen == 3) ? len_stream_value : litlen;
3012
+
3013
+ if (litlen-- == 0)
3014
+ return false; // lamsub mode requires one literal
3015
+
3016
+ dst[0] = *lam_lit_stream++ + dst[last_offset], dst++;
3017
+
3018
+ COPY_64_ADD(dst, lit_stream, &dst[last_offset]);
3019
+ if (litlen > 8) {
3020
+ COPY_64_ADD(dst + 8, lit_stream + 8, &dst[last_offset + 8]);
3021
+ if (litlen > 16) {
3022
+ COPY_64_ADD(dst + 16, lit_stream + 16, &dst[last_offset + 16]);
3023
+ if (litlen > 24) {
3024
+ if (litlen > match_zone_end - dst)
3025
+ return false; // out of bounds
3026
+ do {
3027
+ COPY_64_ADD(dst + 24, lit_stream + 24, &dst[last_offset + 24]);
3028
+ litlen -= 8, dst += 8, lit_stream += 8;
3029
+ } while (litlen > 24);
3030
+ }
3031
+ }
3032
+ }
3033
+ dst += litlen;
3034
+ lit_stream += litlen;
3035
+ return true;
3036
+ }
3037
+
3038
+ finline void CopyFinalLiterals(uint32 final_len, uint8 *&dst, size_t last_offset) {
3039
+ dst[0] = *lam_lit_stream++ + dst[last_offset], dst++;
3040
+ final_len -= 1;
3041
+
3042
+ if (final_len >= 8) {
3043
+ do {
3044
+ COPY_64_ADD(dst, lit_stream, &dst[last_offset]);
3045
+ dst += 8, lit_stream += 8, final_len -= 8;
3046
+ } while (final_len >= 8);
3047
+ }
3048
+ if (final_len > 0) {
3049
+ do {
3050
+ *dst = *lit_stream++ + dst[last_offset];
3051
+ } while (dst++, --final_len);
3052
+ }
3053
+ }
3054
+ };
3055
+
3056
+ struct LeviathanModeSubAnd3 {
3057
+ enum { NUM = 4, MASK = NUM - 1};
3058
+ const uint8 *lit_stream[NUM];
3059
+
3060
+ finline LeviathanModeSubAnd3(LeviathanLzTable *lzt, uint8 *dst_start) {
3061
+ for (size_t i = 0; i != NUM; i++)
3062
+ lit_stream[i] = lzt->lit_stream[(-(intptr_t)dst_start + i) & MASK];
3063
+ }
3064
+ finline bool CopyLiterals(uint32 cmd, uint8 *&dst, const int *&len_stream, uint8 *match_zone_end, size_t last_offset) {
3065
+ uint32 lit_cmd = cmd & 0x18;
3066
+
3067
+ if (lit_cmd == 0x18) {
3068
+ uint32 litlen = *len_stream++ & 0xffffff;
3069
+ if (litlen > match_zone_end - dst)
3070
+ return false;
3071
+ while (litlen) {
3072
+ *dst = *lit_stream[(uintptr_t)dst & MASK]++ + dst[last_offset];
3073
+ dst++, litlen--;
3074
+ }
3075
+ } else if (lit_cmd) {
3076
+ *dst = *lit_stream[(uintptr_t)dst & MASK]++ + dst[last_offset];
3077
+ dst++;
3078
+ if (lit_cmd == 0x10) {
3079
+ *dst = *lit_stream[(uintptr_t)dst & MASK]++ + dst[last_offset];
3080
+ dst++;
3081
+ }
3082
+ }
3083
+ return true;
3084
+ }
3085
+
3086
+ finline void CopyFinalLiterals(uint32 final_len, uint8 *&dst, size_t last_offset) {
3087
+ if (final_len > 0) {
3088
+ do {
3089
+ *dst = *lit_stream[(uintptr_t)dst & MASK]++ + dst[last_offset];
3090
+ } while (dst++, --final_len);
3091
+ }
3092
+ }
3093
+ };
3094
+
3095
+ struct LeviathanModeSubAndF {
3096
+ enum { NUM = 16, MASK = NUM - 1};
3097
+ const uint8 *lit_stream[NUM];
3098
+
3099
+ finline LeviathanModeSubAndF(LeviathanLzTable *lzt, uint8 *dst_start) {
3100
+ for(size_t i = 0; i != NUM; i++)
3101
+ lit_stream[i] = lzt->lit_stream[(-(intptr_t)dst_start + i) & MASK];
3102
+ }
3103
+ finline bool CopyLiterals(uint32 cmd, uint8 *&dst, const int *&len_stream, uint8 *match_zone_end, size_t last_offset) {
3104
+ uint32 lit_cmd = cmd & 0x18;
3105
+
3106
+ if (lit_cmd == 0x18) {
3107
+ uint32 litlen = *len_stream++ & 0xffffff;
3108
+ if (litlen > match_zone_end - dst)
3109
+ return false;
3110
+ while (litlen) {
3111
+ *dst = *lit_stream[(uintptr_t)dst & MASK]++ + dst[last_offset];
3112
+ dst++, litlen--;
3113
+ }
3114
+ } else if (lit_cmd) {
3115
+ *dst = *lit_stream[(uintptr_t)dst & MASK]++ + dst[last_offset];
3116
+ dst++;
3117
+ if (lit_cmd == 0x10) {
3118
+ *dst = *lit_stream[(uintptr_t)dst & MASK]++ + dst[last_offset];
3119
+ dst++;
3120
+ }
3121
+ }
3122
+ return true;
3123
+ }
3124
+
3125
+ finline void CopyFinalLiterals(uint32 final_len, uint8 *&dst, size_t last_offset) {
3126
+ if (final_len > 0) {
3127
+ do {
3128
+ *dst = *lit_stream[(uintptr_t)dst & MASK]++ + dst[last_offset];
3129
+ } while (dst++, --final_len);
3130
+ }
3131
+ }
3132
+ };
3133
+
3134
+ struct LeviathanModeO1 {
3135
+ const uint8 *lit_streams[16];
3136
+ uint8 next_lit[16];
3137
+
3138
+ finline LeviathanModeO1(LeviathanLzTable *lzt, uint8 *dst_start) {
3139
+ for (size_t i = 0; i != 16; i++) {
3140
+ uint8 *p = lzt->lit_stream[i];
3141
+ next_lit[i] = *p;
3142
+ lit_streams[i] = p + 1;
3143
+ }
3144
+ }
3145
+
3146
+ finline bool CopyLiterals(uint32 cmd, uint8 *&dst, const int *&len_stream, uint8 *match_zone_end, size_t last_offset) {
3147
+ uint32 lit_cmd = cmd & 0x18;
3148
+
3149
+ if (lit_cmd == 0x18) {
3150
+ uint32 litlen = *len_stream++;
3151
+ if ((int32)litlen <= 0)
3152
+ return false;
3153
+ uint context = dst[-1];
3154
+ do {
3155
+ size_t slot = context >> 4;
3156
+ *dst++ = (context = next_lit[slot]);
3157
+ next_lit[slot] = *lit_streams[slot]++;
3158
+ } while (--litlen);
3159
+ } else if (lit_cmd) {
3160
+ // either 1 or 2
3161
+ uint context = dst[-1];
3162
+ size_t slot = context >> 4;
3163
+ *dst++ = (context = next_lit[slot]);
3164
+ next_lit[slot] = *lit_streams[slot]++;
3165
+ if (lit_cmd == 0x10) {
3166
+ slot = context >> 4;
3167
+ *dst++ = (context = next_lit[slot]);
3168
+ next_lit[slot] = *lit_streams[slot]++;
3169
+ }
3170
+ }
3171
+ return true;
3172
+ }
3173
+
3174
+ finline void CopyFinalLiterals(uint32 final_len, uint8 *&dst, size_t last_offset) {
3175
+ uint context = dst[-1];
3176
+ while (final_len) {
3177
+ size_t slot = context >> 4;
3178
+ *dst++ = (context = next_lit[slot]);
3179
+ next_lit[slot] = *lit_streams[slot]++;
3180
+ final_len--;
3181
+ }
3182
+ }
3183
+ };
3184
+
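+ // Leviathan command byte, as decoded below: bits 0..2 = match length - 2
+ // (a raw value of 7 escapes to a length pulled from the end of len_stream),
+ // bits 3..4 = literal-run code interpreted by the Mode, bits 5..7 = offset
+ // selector (7 consumes a new offset from offs_stream).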
3185
+ template<typename Mode, bool MultiCmd>
3186
+ bool Leviathan_ProcessLz(LeviathanLzTable *lzt, uint8 *dst,
3187
+ uint8 *dst_start, uint8 *dst_end, uint8 *window_base) {
3188
+ const uint8 *cmd_stream = lzt->cmd_stream,
3189
+ *cmd_stream_end = cmd_stream + lzt->cmd_stream_size;
3190
+ const int *len_stream = lzt->len_stream;
3191
+ const int *len_stream_end = len_stream + lzt->len_stream_size;
3192
+
3193
+ const int *offs_stream = lzt->offs_stream;
3194
+ const int *offs_stream_end = offs_stream + lzt->offs_stream_size;
3195
+ const byte *copyfrom;
3196
+ uint8 *match_zone_end = (dst_end - dst_start >= 16) ? dst_end - 16 : dst_start;
3197
+
3198
+ int32 recent_offs[16];
3199
+ recent_offs[8] = recent_offs[9] = recent_offs[10] = recent_offs[11] = -8;
3200
+ recent_offs[12] = recent_offs[13] = recent_offs[14] = -8;
3201
+
3202
+ size_t offset = -8;
3203
+
3204
+ Mode mode(lzt, dst_start);
3205
+
3206
+ uint32 cmd_stream_left;
3207
+ const uint8 *multi_cmd_stream[8], **cmd_stream_ptr;
3208
+ if (MultiCmd) {
3209
+ for (size_t i = 0; i != 8; i++)
3210
+ multi_cmd_stream[i] = lzt->multi_cmd_ptr[(i - (uintptr_t)dst_start) & 7];
3211
+ cmd_stream_left = lzt->cmd_stream_size;
3212
+ cmd_stream_ptr = &multi_cmd_stream[(uintptr_t)dst & 7];
3213
+ cmd_stream = *cmd_stream_ptr;
3214
+ }
3215
+
3216
+ for(;;) {
3217
+ uint32 cmd;
3218
+
3219
+ if (!MultiCmd) {
3220
+ if (cmd_stream >= cmd_stream_end)
3221
+ break;
3222
+ cmd = *cmd_stream++;
3223
+ } else {
3224
+ if (cmd_stream_left == 0)
3225
+ break;
3226
+ cmd_stream_left--;
3227
+ cmd = *cmd_stream;
3228
+ *cmd_stream_ptr = cmd_stream + 1;
3229
+ }
3230
+
3231
+ uint32 offs_index = cmd >> 5;
3232
+ uint32 matchlen = (cmd & 7) + 2;
3233
+
3234
+ recent_offs[15] = *offs_stream;
3235
+
3236
+ if (!mode.CopyLiterals(cmd, dst, len_stream, match_zone_end, offset))
3237
+ return false;
3238
+
3239
+ offset = recent_offs[(size_t)offs_index + 8];
3240
+
3241
+ // Permute the recent offsets table
3242
+ __m128i temp = _mm_loadu_si128((const __m128i *)&recent_offs[(size_t)offs_index + 4]);
3243
+ _mm_storeu_si128((__m128i *)&recent_offs[(size_t)offs_index + 1], _mm_loadu_si128((const __m128i *)&recent_offs[offs_index]));
3244
+ _mm_storeu_si128((__m128i *)&recent_offs[(size_t)offs_index + 5], temp);
3245
+ recent_offs[8] = (int32)offset;
3246
+ offs_stream += offs_index == 7;
3247
+
3248
+ if ((uintptr_t)offset < (uintptr_t)(window_base - dst))
3249
+ return false; // offset out of bounds
3250
+ copyfrom = dst + offset;
3251
+
3252
+ if (matchlen == 9) {
3253
+ if (len_stream >= len_stream_end)
3254
+ return false; // len stream empty
3255
+ matchlen = *--len_stream_end + 6;
3256
+ COPY_64(dst, copyfrom);
3257
+ COPY_64(dst + 8, copyfrom + 8);
3258
+ uint8 *next_dst = dst + matchlen;
3259
+ if (MultiCmd)
3260
+ cmd_stream = *(cmd_stream_ptr = &multi_cmd_stream[(uintptr_t)next_dst & 7]);
3261
+ if (matchlen > 16) {
3262
+ if (matchlen > (uintptr_t)(dst_end - 8 - dst))
3263
+ return false; // no space in buf
3264
+ COPY_64(dst + 16, copyfrom + 16);
3265
+ do {
3266
+ COPY_64(dst + 24, copyfrom + 24);
3267
+ matchlen -= 8;
3268
+ dst += 8;
3269
+ copyfrom += 8;
3270
+ } while (matchlen > 24);
3271
+ }
3272
+ dst = next_dst;
3273
+ } else {
3274
+ COPY_64(dst, copyfrom);
3275
+ dst += matchlen;
3276
+ if (MultiCmd)
3277
+ cmd_stream = *(cmd_stream_ptr = &multi_cmd_stream[(uintptr_t)dst & 7]);
3278
+ }
3279
+ }
3280
+
3281
+ // check for incorrect input
3282
+ if (offs_stream != offs_stream_end || len_stream != len_stream_end)
3283
+ return false;
3284
+
3285
+ // copy final literals
3286
+ if (dst < dst_end) {
3287
+ mode.CopyFinalLiterals(dst_end - dst, dst, offset);
3288
+ } else if (dst != dst_end) {
3289
+ return false;
3290
+ }
3291
+ return true;
3292
+ }
3293
+
3294
+ bool Leviathan_ProcessLzRuns(int chunk_type, byte *dst, int dst_size, int offset, LeviathanLzTable *lzt) {
3295
+ uint8 *dst_cur = dst + (offset == 0 ? 8 : 0);
3296
+ uint8 *dst_end = dst + dst_size;
3297
+ uint8 *dst_start = dst - offset;
3298
+
3299
+ if (lzt->cmd_stream != NULL) {
3300
+ // single cmd mode
3301
+ switch (chunk_type) {
3302
+ case 0:
3303
+ return Leviathan_ProcessLz<LeviathanModeSub, false>(lzt, dst_cur, dst, dst_end, dst_start);
3304
+ case 1:
3305
+ return Leviathan_ProcessLz<LeviathanModeRaw, false>(lzt, dst_cur, dst, dst_end, dst_start);
3306
+ case 2:
3307
+ return Leviathan_ProcessLz<LeviathanModeLamSub, false>(lzt, dst_cur, dst, dst_end, dst_start);
3308
+ case 3:
3309
+ return Leviathan_ProcessLz<LeviathanModeSubAnd3, false>(lzt, dst_cur, dst, dst_end, dst_start);
3310
+ case 4:
3311
+ return Leviathan_ProcessLz<LeviathanModeO1, false>(lzt, dst_cur, dst, dst_end, dst_start);
3312
+ case 5:
3313
+ return Leviathan_ProcessLz<LeviathanModeSubAndF, false>(lzt, dst_cur, dst, dst_end, dst_start);
3314
+ }
3315
+ } else {
3316
+ // multi cmd mode
3317
+ switch (chunk_type) {
3318
+ case 0:
3319
+ return Leviathan_ProcessLz<LeviathanModeSub, true>(lzt, dst_cur, dst, dst_end, dst_start);
3320
+ case 1:
3321
+ return Leviathan_ProcessLz<LeviathanModeRaw, true>(lzt, dst_cur, dst, dst_end, dst_start);
3322
+ case 2:
3323
+ return Leviathan_ProcessLz<LeviathanModeLamSub, true>(lzt, dst_cur, dst, dst_end, dst_start);
3324
+ case 3:
3325
+ return Leviathan_ProcessLz<LeviathanModeSubAnd3, true>(lzt, dst_cur, dst, dst_end, dst_start);
3326
+ case 4:
3327
+ return Leviathan_ProcessLz<LeviathanModeO1, true>(lzt, dst_cur, dst, dst_end, dst_start);
3328
+ case 5:
3329
+ return Leviathan_ProcessLz<LeviathanModeSubAndF, true>(lzt, dst_cur, dst, dst_end, dst_start);
3330
+ }
3331
+
3332
+ }
3333
+ return false;
3334
+ }
3335
+
3336
+
3337
+
3338
+ // Decode one 256kb quantum block. It's divided into two 128k blocks
3339
+ // internally that are compressed separately but with a shared history.
3340
+ int Leviathan_DecodeQuantum(byte *dst, byte *dst_end, byte *dst_start,
3341
+ const byte *src, const byte *src_end,
3342
+ byte *scratch, byte *scratch_end) {
3343
+ const byte *src_in = src;
3344
+ int mode, chunkhdr, dst_count, src_used, written_bytes;
3345
+
3346
+ while (dst_end - dst != 0) {
3347
+ dst_count = dst_end - dst;
3348
+ if (dst_count > 0x20000) dst_count = 0x20000;
3349
+ if (src_end - src < 4)
3350
+ return -1;
3351
+ chunkhdr = src[2] | src[1] << 8 | src[0] << 16;
3352
+ if (!(chunkhdr & 0x800000)) {
3353
+ // Stored as entropy without any match copying.
3354
+ byte *out = dst;
3355
+ src_used = Kraken_DecodeBytes(&out, src, src_end, &written_bytes, dst_count, false, scratch, scratch_end);
3356
+ if (src_used < 0 || written_bytes != dst_count)
3357
+ return -1;
3358
+ } else {
3359
+ src += 3;
3360
+ src_used = chunkhdr & 0x7FFFF;
3361
+ mode = (chunkhdr >> 19) & 0xF;
3362
+ if (src_end - src < src_used)
3363
+ return -1;
3364
+ if (src_used < dst_count) {
3365
+ size_t scratch_usage = Min(Min(3 * dst_count + 32 + 0xd000, 0x6C000), scratch_end - scratch);
3366
+ if (scratch_usage < sizeof(LeviathanLzTable))
3367
+ return -1;
3368
+ if (!Leviathan_ReadLzTable(mode,
3369
+ src, src + src_used,
3370
+ dst, dst_count,
3371
+ dst - dst_start,
3372
+ scratch + sizeof(LeviathanLzTable), scratch + scratch_usage,
3373
+ (LeviathanLzTable*)scratch))
3374
+ return -1;
3375
+ if (!Leviathan_ProcessLzRuns(mode, dst, dst_count, dst - dst_start, (LeviathanLzTable*)scratch))
3376
+ return -1;
3377
+ } else if (src_used > dst_count || mode != 0) {
3378
+ return -1;
3379
+ } else {
3380
+ memmove(dst, src, dst_count);
3381
+ }
3382
+ }
3383
+ src += src_used;
3384
+ dst += dst_count;
3385
+ }
3386
+ return src - src_in;
3387
+ }
3388
+
3389
+
3390
+
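+ // Far offsets are stored as 3 little-endian bytes each; once the current
+ // output offset reaches 0xC00000 - 1, values >= 0xC00000 carry a fourth
+ // byte that is added in as (byte << 22). Every offset must refer to data
+ // that has already been decoded (off <= offset).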
3391
+ int Mermaid_DecodeFarOffsets(const byte *src, const byte *src_end, uint32 *output, size_t output_size, int64 offset) {
3392
+ const byte *src_cur = src;
3393
+ size_t i;
3394
+ uint32 off;
3395
+
3396
+ if (offset < (0xC00000 - 1)) {
3397
+ for (i = 0; i != output_size; i++) {
3398
+ if (src_end - src_cur < 3)
3399
+ return -1;
3400
+ off = src_cur[0] | src_cur[1] << 8 | src_cur[2] << 16;
3401
+ src_cur += 3;
3402
+ output[i] = off;
3403
+ if (off > offset)
3404
+ return -1;
3405
+ }
3406
+ return src_cur - src;
3407
+ }
3408
+
3409
+ for (i = 0; i != output_size; i++) {
3410
+ if (src_end - src_cur < 3)
3411
+ return -1;
3412
+ off = src_cur[0] | src_cur[1] << 8 | src_cur[2] << 16;
3413
+ src_cur += 3;
3414
+
3415
+ if (off >= 0xc00000) {
3416
+ if (src_cur == src_end)
3417
+ return -1;
3418
+ off += *src_cur++ << 22;
3419
+ }
3420
+ output[i] = off;
3421
+ if (off > offset)
3422
+ return -1;
3423
+ }
3424
+ return src_cur - src;
3425
+ }
3426
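+ // Far offsets are stored as three little-endian bytes; once the window is
+ // large enough, values >= 0xC00000 borrow a fourth byte contributing bits
+ // 22..29, e.g. the byte sequence 00 00 C0 01 decodes to offset 0x1000000.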
+
3427
+ void Mermaid_CombineOffs16(uint16 *dst, size_t size, const uint8 *lo, const uint8 *hi) {
3428
+ for (size_t i = 0; i != size; i++)
3429
+ dst[i] = lo[i] + hi[i] * 256;
3430
+ }
3431
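+ // When the off16 stream is entropy coded, its low and high bytes travel as
+ // two separate planes and are recombined into 16-bit offsets by this helper.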
+
3432
+ bool Mermaid_ReadLzTable(int mode,
3433
+ const byte *src, const byte *src_end,
3434
+ byte *dst, int dst_size, int64 offset,
3435
+ byte *scratch, byte *scratch_end, MermaidLzTable *lz) {
3436
+ byte *out;
3437
+ int decode_count, n;
3438
+ uint32 tmp, off32_size_2, off32_size_1;
3439
+
3440
+ if (mode > 1)
3441
+ return false;
3442
+
3443
+ if (src_end - src < 10)
3444
+ return false;
3445
+
3446
+ if (offset == 0) {
3447
+ COPY_64(dst, src);
3448
+ dst += 8;
3449
+ src += 8;
3450
+ }
3451
+
3452
+ // Decode lit stream
3453
+ out = scratch;
3454
+ n = Kraken_DecodeBytes(&out, src, src_end, &decode_count, Min(scratch_end - scratch, dst_size), false, scratch, scratch_end);
3455
+ if (n < 0)
3456
+ return false;
3457
+ src += n;
3458
+ lz->lit_stream = out;
3459
+ lz->lit_stream_end = out + decode_count;
3460
+ scratch += decode_count;
3461
+
3462
+ // Decode flag stream
3463
+ out = scratch;
3464
+ n = Kraken_DecodeBytes(&out, src, src_end, &decode_count, Min(scratch_end - scratch, dst_size), false, scratch, scratch_end);
3465
+ if (n < 0)
3466
+ return false;
3467
+ src += n;
3468
+ lz->cmd_stream = out;
3469
+ lz->cmd_stream_end = out + decode_count;
3470
+ scratch += decode_count;
3471
+
3472
+ lz->cmd_stream_2_offs_end = decode_count;
3473
+ if (dst_size <= 0x10000) {
3474
+ lz->cmd_stream_2_offs = decode_count;
3475
+ } else {
3476
+ if (src_end - src < 2)
3477
+ return false;
3478
+ lz->cmd_stream_2_offs = *(uint16*)src;
3479
+ src += 2;
3480
+ if (lz->cmd_stream_2_offs > lz->cmd_stream_2_offs_end)
3481
+ return false;
3482
+ }
3483
+
3484
+ if (src_end - src < 2)
3485
+ return false;
3486
+
3487
+ int off16_count = *(uint16*)src;
3488
+ if (off16_count == 0xffff) {
3489
+ // off16 is entropy coded
3490
+ uint8 *off16_lo, *off16_hi;
3491
+ int off16_lo_count, off16_hi_count;
3492
+ src += 2;
3493
+ off16_hi = scratch;
3494
+ n = Kraken_DecodeBytes(&off16_hi, src, src_end, &off16_hi_count, Min(scratch_end - scratch, dst_size >> 1), false, scratch, scratch_end);
3495
+ if (n < 0)
3496
+ return false;
3497
+ src += n;
3498
+ scratch += off16_hi_count;
3499
+
3500
+ off16_lo = scratch;
3501
+ n = Kraken_DecodeBytes(&off16_lo, src, src_end, &off16_lo_count, Min(scratch_end - scratch, dst_size >> 1), false, scratch, scratch_end);
3502
+ if (n < 0)
3503
+ return false;
3504
+ src += n;
3505
+ scratch += off16_lo_count;
3506
+
3507
+ if (off16_lo_count != off16_hi_count)
3508
+ return false;
3509
+ scratch = ALIGN_POINTER(scratch, 2);
3510
+ lz->off16_stream = (uint16*)scratch;
3511
+ if (scratch + off16_lo_count * 2 > scratch_end)
3512
+ return false;
3513
+ scratch += off16_lo_count * 2;
3514
+ lz->off16_stream_end = (uint16*)scratch;
3515
+ Mermaid_CombineOffs16((uint16*)lz->off16_stream, off16_lo_count, off16_lo, off16_hi);
3516
+ } else {
3517
+ lz->off16_stream = (uint16*)(src + 2);
3518
+ src += 2 + off16_count * 2;
3519
+ lz->off16_stream_end = (uint16*)src;
3520
+ }
3521
+
3522
+ if (src_end - src < 3)
3523
+ return false;
3524
+ tmp = src[0] | src[1] << 8 | src[2] << 16;
3525
+ src += 3;
3526
+
3527
+ if (tmp != 0) {
3528
+ off32_size_1 = tmp >> 12;
3529
+ off32_size_2 = tmp & 0xFFF;
3530
+ if (off32_size_1 == 4095) {
3531
+ if (src_end - src < 2)
3532
+ return false;
3533
+ off32_size_1 = *(uint16*)src;
3534
+ src += 2;
3535
+ }
3536
+ if (off32_size_2 == 4095) {
3537
+ if (src_end - src < 2)
3538
+ return false;
3539
+ off32_size_2 = *(uint16*)src;
3540
+ src += 2;
3541
+ }
3542
+ lz->off32_size_1 = off32_size_1;
3543
+ lz->off32_size_2 = off32_size_2;
3544
+
3545
+ if (scratch + 4 * (off32_size_2 + off32_size_1) + 64 > scratch_end)
3546
+ return false;
3547
+
3548
+ scratch = ALIGN_POINTER(scratch, 4);
3549
+
3550
+ lz->off32_stream_1 = (uint32*)scratch;
3551
+ scratch += off32_size_1 * 4;
3552
+ // Store zeroed dummy bytes after the stream so the prefetcher reads valid memory.
3553
+ ((uint64*)scratch)[0] = 0;
3554
+ ((uint64*)scratch)[1] = 0;
3555
+ ((uint64*)scratch)[2] = 0;
3556
+ ((uint64*)scratch)[3] = 0;
3557
+ scratch += 32;
3558
+
3559
+ lz->off32_stream_2 = (uint32*)scratch;
3560
+ scratch += off32_size_2 * 4;
3561
+ // Store zeroed dummy bytes after the stream so the prefetcher reads valid memory.
3562
+ ((uint64*)scratch)[0] = 0;
3563
+ ((uint64*)scratch)[1] = 0;
3564
+ ((uint64*)scratch)[2] = 0;
3565
+ ((uint64*)scratch)[3] = 0;
3566
+ scratch += 32;
3567
+
3568
+ n = Mermaid_DecodeFarOffsets(src, src_end, lz->off32_stream_1, lz->off32_size_1, offset);
3569
+ if (n < 0)
3570
+ return false;
3571
+ src += n;
3572
+
3573
+ n = Mermaid_DecodeFarOffsets(src, src_end, lz->off32_stream_2, lz->off32_size_2, offset + 0x10000);
3574
+ if (n < 0)
3575
+ return false;
3576
+ src += n;
3577
+ } else {
3578
+ if (scratch_end - scratch < 32)
3579
+ return false;
3580
+ lz->off32_size_1 = 0;
3581
+ lz->off32_size_2 = 0;
3582
+ lz->off32_stream_1 = (uint32*)scratch;
3583
+ lz->off32_stream_2 = (uint32*)scratch;
3584
+ // Store zeroed dummy bytes after the stream so the prefetcher reads valid memory.
3585
+ ((uint64*)scratch)[0] = 0;
3586
+ ((uint64*)scratch)[1] = 0;
3587
+ ((uint64*)scratch)[2] = 0;
3588
+ ((uint64*)scratch)[3] = 0;
3589
+ }
3590
+ lz->length_stream = src;
3591
+ return true;
3592
+ }
3593
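+ // Whatever remains of the chunk after the tables above is the shared length
+ // stream consumed by Mermaid_Mode0/Mermaid_Mode1 below.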
+
3594
+ const byte *Mermaid_Mode0(byte *dst, size_t dst_size, byte *dst_ptr_end, byte *dst_start,
3595
+ const byte *src_end, MermaidLzTable *lz, int32 *saved_dist, size_t startoff) {
3596
+ const byte *dst_end = dst + dst_size;
3597
+ const byte *cmd_stream = lz->cmd_stream;
3598
+ const byte *cmd_stream_end = lz->cmd_stream_end;
3599
+ const byte *length_stream = lz->length_stream;
3600
+ const byte *lit_stream = lz->lit_stream;
3601
+ const byte *lit_stream_end = lz->lit_stream_end;
3602
+ const uint16 *off16_stream = lz->off16_stream;
3603
+ const uint16 *off16_stream_end = lz->off16_stream_end;
3604
+ const uint32 *off32_stream = lz->off32_stream;
3605
+ const uint32 *off32_stream_end = lz->off32_stream_end;
3606
+ intptr_t recent_offs = *saved_dist;
3607
+ const byte *match;
3608
+ intptr_t length;
3609
+ const byte *dst_begin = dst;
3610
+
3611
+ dst += startoff;
3612
+
3613
+ while (cmd_stream < cmd_stream_end) {
3614
+ uintptr_t cmd = *cmd_stream++;
3615
+ if (cmd >= 24) {
3616
+ intptr_t new_dist = *off16_stream;
3617
+ uintptr_t use_distance = (uintptr_t)(cmd >> 7) - 1;
3618
+ uintptr_t litlen = (cmd & 7);
3619
+ COPY_64_ADD(dst, lit_stream, &dst[recent_offs]);
3620
+ dst += litlen;
3621
+ lit_stream += litlen;
3622
+ recent_offs ^= use_distance & (recent_offs ^ -new_dist);
3623
+ off16_stream = (uint16*)((uintptr_t)off16_stream + (use_distance & 2));
3624
+ match = dst + recent_offs;
3625
+ COPY_64(dst, match);
3626
+ COPY_64(dst + 8, match + 8);
3627
+ dst += (cmd >> 3) & 0xF;
3628
+ } else if (cmd > 2) {
3629
+ length = cmd + 5;
3630
+
3631
+ if (off32_stream == off32_stream_end)
3632
+ return NULL;
3633
+ match = dst_begin - *off32_stream++;
3634
+ recent_offs = (match - dst);
3635
+
3636
+ if (dst_end - dst < length)
3637
+ return NULL;
3638
+ COPY_64(dst, match);
3639
+ COPY_64(dst + 8, match + 8);
3640
+ COPY_64(dst + 16, match + 16);
3641
+ COPY_64(dst + 24, match + 24);
3642
+ dst += length;
3643
+ _mm_prefetch((char*)dst_begin - off32_stream[3], _MM_HINT_T0);
3644
+ } else if (cmd == 0) {
3645
+ if (src_end - length_stream == 0)
3646
+ return NULL;
3647
+ length = *length_stream;
3648
+ if (length > 251) {
3649
+ if (src_end - length_stream < 3)
3650
+ return NULL;
3651
+ length += (size_t)*(uint16*)(length_stream + 1) * 4;
3652
+ length_stream += 2;
3653
+ }
3654
+ length_stream += 1;
3655
+
3656
+ length += 64;
3657
+ if (dst_end - dst < length ||
3658
+ lit_stream_end - lit_stream < length)
3659
+ return NULL;
3660
+
3661
+ do {
3662
+ COPY_64_ADD(dst, lit_stream, &dst[recent_offs]);
3663
+ COPY_64_ADD(dst + 8, lit_stream + 8, &dst[recent_offs + 8]);
3664
+ dst += 16;
3665
+ lit_stream += 16;
3666
+ length -= 16;
3667
+ } while (length > 0);
3668
+ dst += length;
3669
+ lit_stream += length;
3670
+ } else if (cmd == 1) {
3671
+ if (src_end - length_stream == 0)
3672
+ return NULL;
3673
+ length = *length_stream;
3674
+ if (length > 251) {
3675
+ if (src_end - length_stream < 3)
3676
+ return NULL;
3677
+ length += (size_t)*(uint16*)(length_stream + 1) * 4;
3678
+ length_stream += 2;
3679
+ }
3680
+ length_stream += 1;
3681
+ length += 91;
3682
+
3683
+ if (off16_stream == off16_stream_end)
3684
+ return NULL;
3685
+ match = dst - *off16_stream++;
3686
+ recent_offs = (match - dst);
3687
+ do {
3688
+ COPY_64(dst, match);
3689
+ COPY_64(dst + 8, match + 8);
3690
+ dst += 16;
3691
+ match += 16;
3692
+ length -= 16;
3693
+ } while (length > 0);
3694
+ dst += length;
3695
+ } else /* cmd == 2 */ {
3696
+ if (src_end - length_stream == 0)
3697
+ return NULL;
3698
+ length = *length_stream;
3699
+ if (length > 251) {
3700
+ if (src_end - length_stream < 3)
3701
+ return NULL;
3702
+ length += (size_t)*(uint16*)(length_stream + 1) * 4;
3703
+ length_stream += 2;
3704
+ }
3705
+ length_stream += 1;
3706
+ length += 29;
3707
+ if (off32_stream == off32_stream_end)
3708
+ return NULL;
3709
+ match = dst_begin - *off32_stream++;
3710
+ recent_offs = (match - dst);
3711
+ do {
3712
+ COPY_64(dst, match);
3713
+ COPY_64(dst + 8, match + 8);
3714
+ dst += 16;
3715
+ match += 16;
3716
+ length -= 16;
3717
+ } while (length > 0);
3718
+ dst += length;
3719
+ _mm_prefetch((char*)dst_begin - off32_stream[3], _MM_HINT_T0);
3720
+ }
3721
+ }
3722
+
3723
+ length = dst_end - dst;
3724
+ if (length >= 8) {
3725
+ do {
3726
+ COPY_64_ADD(dst, lit_stream, &dst[recent_offs]);
3727
+ dst += 8;
3728
+ lit_stream += 8;
3729
+ length -= 8;
3730
+ } while (length >= 8);
3731
+ }
3732
+ if (length > 0) {
3733
+ do {
3734
+ *dst = *lit_stream++ + dst[recent_offs];
3735
+ dst++;
3736
+ } while (--length);
3737
+ }
3738
+
3739
+ *saved_dist = (int32)recent_offs;
3740
+ lz->length_stream = length_stream;
3741
+ lz->off16_stream = off16_stream;
3742
+ lz->lit_stream = lit_stream;
3743
+ return length_stream;
3744
+ }
3745
+
3746
+ const byte *Mermaid_Mode1(byte *dst, size_t dst_size, byte *dst_ptr_end, byte *dst_start,
3747
+ const byte *src_end, MermaidLzTable *lz, int32 *saved_dist, size_t startoff) {
3748
+ const byte *dst_end = dst + dst_size;
3749
+ const byte *cmd_stream = lz->cmd_stream;
3750
+ const byte *cmd_stream_end = lz->cmd_stream_end;
3751
+ const byte *length_stream = lz->length_stream;
3752
+ const byte *lit_stream = lz->lit_stream;
3753
+ const byte *lit_stream_end = lz->lit_stream_end;
3754
+ const uint16 *off16_stream = lz->off16_stream;
3755
+ const uint16 *off16_stream_end = lz->off16_stream_end;
3756
+ const uint32 *off32_stream = lz->off32_stream;
3757
+ const uint32 *off32_stream_end = lz->off32_stream_end;
3758
+ intptr_t recent_offs = *saved_dist;
3759
+ const byte *match;
3760
+ intptr_t length;
3761
+ const byte *dst_begin = dst;
3762
+
3763
+ dst += startoff;
3764
+
3765
+ while (cmd_stream < cmd_stream_end) {
3766
+ uintptr_t flag = *cmd_stream++;
3767
+ if (flag >= 24) {
3768
+ intptr_t new_dist = *off16_stream;
3769
+ uintptr_t use_distance = (uintptr_t)(flag >> 7) - 1;
3770
+ uintptr_t litlen = (flag & 7);
3771
+ COPY_64(dst, lit_stream);
3772
+ dst += litlen;
3773
+ lit_stream += litlen;
3774
+ recent_offs ^= use_distance & (recent_offs ^ -new_dist);
3775
+ off16_stream = (uint16*)((uintptr_t)off16_stream + (use_distance & 2));
3776
+ match = dst + recent_offs;
3777
+ COPY_64(dst, match);
3778
+ COPY_64(dst + 8, match + 8);
3779
+ dst += (flag >> 3) & 0xF;
3780
+ } else if (flag > 2) {
3781
+ length = flag + 5;
3782
+
3783
+ if (off32_stream == off32_stream_end)
3784
+ return NULL;
3785
+ match = dst_begin - *off32_stream++;
3786
+ recent_offs = (match - dst);
3787
+
3788
+ if (dst_end - dst < length)
3789
+ return NULL;
3790
+ COPY_64(dst, match);
3791
+ COPY_64(dst + 8, match + 8);
3792
+ COPY_64(dst + 16, match + 16);
3793
+ COPY_64(dst + 24, match + 24);
3794
+ dst += length;
3795
+ _mm_prefetch((char*)dst_begin - off32_stream[3], _MM_HINT_T0);
3796
+ } else if (flag == 0) {
3797
+ if (src_end - length_stream == 0)
3798
+ return NULL;
3799
+ length = *length_stream;
3800
+ if (length > 251) {
3801
+ if (src_end - length_stream < 3)
3802
+ return NULL;
3803
+ length += (size_t)*(uint16*)(length_stream + 1) * 4;
3804
+ length_stream += 2;
3805
+ }
3806
+ length_stream += 1;
3807
+
3808
+ length += 64;
3809
+ if (dst_end - dst < length ||
3810
+ lit_stream_end - lit_stream < length)
3811
+ return NULL;
3812
+
3813
+ do {
3814
+ COPY_64(dst, lit_stream);
3815
+ COPY_64(dst + 8, lit_stream + 8);
3816
+ dst += 16;
3817
+ lit_stream += 16;
3818
+ length -= 16;
3819
+ } while (length > 0);
3820
+ dst += length;
3821
+ lit_stream += length;
3822
+ } else if (flag == 1) {
3823
+ if (src_end - length_stream == 0)
3824
+ return NULL;
3825
+ length = *length_stream;
3826
+ if (length > 251) {
3827
+ if (src_end - length_stream < 3)
3828
+ return NULL;
3829
+ length += (size_t)*(uint16*)(length_stream + 1) * 4;
3830
+ length_stream += 2;
3831
+ }
3832
+ length_stream += 1;
3833
+ length += 91;
3834
+
3835
+ if (off16_stream == off16_stream_end)
3836
+ return NULL;
3837
+ match = dst - *off16_stream++;
3838
+ recent_offs = (match - dst);
3839
+ do {
3840
+ COPY_64(dst, match);
3841
+ COPY_64(dst + 8, match + 8);
3842
+ dst += 16;
3843
+ match += 16;
3844
+ length -= 16;
3845
+ } while (length > 0);
3846
+ dst += length;
3847
+ } else /* flag == 2 */ {
3848
+ if (src_end - length_stream == 0)
3849
+ return NULL;
3850
+ length = *length_stream;
3851
+ if (length > 251) {
3852
+ if (src_end - length_stream < 3)
3853
+ return NULL;
3854
+ length += (size_t)*(uint16*)(length_stream + 1) * 4;
3855
+ length_stream += 2;
3856
+ }
3857
+ length_stream += 1;
3858
+ length += 29;
3859
+
3860
+ if (off32_stream == off32_stream_end)
3861
+ return NULL;
3862
+ match = dst_begin - *off32_stream++;
3863
+ recent_offs = (match - dst);
3864
+
3865
+ do {
3866
+ COPY_64(dst, match);
3867
+ COPY_64(dst + 8, match + 8);
3868
+ dst += 16;
3869
+ match += 16;
3870
+ length -= 16;
3871
+ } while (length > 0);
3872
+ dst += length;
3873
+
3874
+ _mm_prefetch((char*)dst_begin - off32_stream[3], _MM_HINT_T0);
3875
+ }
3876
+ }
3877
+
3878
+ length = dst_end - dst;
3879
+ if (length >= 8) {
3880
+ do {
3881
+ COPY_64(dst, lit_stream);
3882
+ dst += 8;
3883
+ lit_stream += 8;
3884
+ length -= 8;
3885
+ } while (length >= 8);
3886
+ }
3887
+ if (length > 0) {
3888
+ do {
3889
+ *dst++ = *lit_stream++;
3890
+ } while (--length);
3891
+ }
3892
+
3893
+ *saved_dist = (int32)recent_offs;
3894
+ lz->length_stream = length_stream;
3895
+ lz->off16_stream = off16_stream;
3896
+ lz->lit_stream = lit_stream;
3897
+ return length_stream;
3898
+ }
3899
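+ // Mermaid_Mode1 is identical to Mermaid_Mode0 except that literals are copied
+ // verbatim (COPY_64) rather than added to the byte at the recent offset
+ // (COPY_64_ADD), i.e. Mode 0 handles delta-coded literals.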
+
3900
+ bool Mermaid_ProcessLzRuns(int mode,
3901
+ const byte *src, const byte *src_end,
3902
+ byte *dst, size_t dst_size, uint64 offset, byte *dst_end,
3903
+ MermaidLzTable *lz) {
3904
+
3905
+ int iteration = 0;
3906
+ byte *dst_start = dst - offset;
3907
+ int32 saved_dist = -8;
3908
+ const byte *src_cur;
3909
+
3910
+ for (iteration = 0; iteration != 2; iteration++) {
3911
+ size_t dst_size_cur = dst_size;
3912
+ if (dst_size_cur > 0x10000) dst_size_cur = 0x10000;
3913
+
3914
+ if (iteration == 0) {
3915
+ lz->off32_stream = lz->off32_stream_1;
3916
+ lz->off32_stream_end = lz->off32_stream_1 + lz->off32_size_1 * 4;
3917
+ lz->cmd_stream_end = lz->cmd_stream + lz->cmd_stream_2_offs;
3918
+ } else {
3919
+ lz->off32_stream = lz->off32_stream_2;
3920
+ lz->off32_stream_end = lz->off32_stream_2 + lz->off32_size_2 * 4;
3921
+ lz->cmd_stream_end = lz->cmd_stream + lz->cmd_stream_2_offs_end;
3922
+ lz->cmd_stream += lz->cmd_stream_2_offs;
3923
+ }
3924
+
3925
+ if (mode == 0) {
3926
+ src_cur = Mermaid_Mode0(dst, dst_size_cur, dst_end, dst_start, src_end, lz, &saved_dist,
3927
+ (offset == 0) && (iteration == 0) ? 8 : 0);
3928
+ } else {
3929
+ src_cur = Mermaid_Mode1(dst, dst_size_cur, dst_end, dst_start, src_end, lz, &saved_dist,
3930
+ (offset == 0) && (iteration == 0) ? 8 : 0);
3931
+ }
3932
+ if (src_cur == NULL)
3933
+ return false;
3934
+
3935
+ dst += dst_size_cur;
3936
+ dst_size -= dst_size_cur;
3937
+ if (dst_size == 0)
3938
+ break;
3939
+ }
3940
+
3941
+ if (src_cur != src_end)
3942
+ return false;
3943
+
3944
+ return true;
3945
+ }
3946
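+ // Each chunk (up to 128 KB) is processed as two 64 KB halves: iteration 0
+ // uses cmd bytes [0, cmd_stream_2_offs) with off32_stream_1, iteration 1 the
+ // remaining cmd bytes with off32_stream_2.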
+
3947
+
3948
+ int Mermaid_DecodeQuantum(byte *dst, byte *dst_end, byte *dst_start,
3949
+ const byte *src, const byte *src_end,
3950
+ byte *temp, byte *temp_end) {
3951
+ const byte *src_in = src;
3952
+ int mode, chunkhdr, dst_count, src_used, written_bytes;
3953
+
3954
+ while (dst_end - dst != 0) {
3955
+ dst_count = dst_end - dst;
3956
+ if (dst_count > 0x20000) dst_count = 0x20000;
3957
+ if (src_end - src < 4)
3958
+ return -1;
3959
+ chunkhdr = src[2] | src[1] << 8 | src[0] << 16;
3960
+ if (!(chunkhdr & 0x800000)) {
3961
+ // Stored without any match copying.
3962
+ byte *out = dst;
3963
+ src_used = Kraken_DecodeBytes(&out, src, src_end, &written_bytes, dst_count, false, temp, temp_end);
3964
+ if (src_used < 0 || written_bytes != dst_count)
3965
+ return -1;
3966
+ } else {
3967
+ src += 3;
3968
+ src_used = chunkhdr & 0x7FFFF;
3969
+ mode = (chunkhdr >> 19) & 0xF;
3970
+ if (src_end - src < src_used)
3971
+ return -1;
3972
+ if (src_used < dst_count) {
3973
+ int temp_usage = 2 * dst_count + 32;
3974
+ if (temp_usage > 0x40000) temp_usage = 0x40000;
3975
+ if (!Mermaid_ReadLzTable(mode,
3976
+ src, src + src_used,
3977
+ dst, dst_count,
3978
+ dst - dst_start,
3979
+ temp + sizeof(MermaidLzTable), temp + temp_usage,
3980
+ (MermaidLzTable*)temp))
3981
+ return -1;
3982
+ if (!Mermaid_ProcessLzRuns(mode,
3983
+ src, src + src_used,
3984
+ dst, dst_count,
3985
+ dst - dst_start, dst_end,
3986
+ (MermaidLzTable*)temp))
3987
+ return -1;
3988
+ } else if (src_used > dst_count || mode != 0) {
3989
+ return -1;
3990
+ } else {
3991
+ memmove(dst, src, dst_count);
3992
+ }
3993
+ }
3994
+ src += src_used;
3995
+ dst += dst_count;
3996
+ }
3997
+ return src - src_in;
3998
+ }
3999
+
4000
+ int LZNA_DecodeQuantum(byte *dst, byte *dst_end, byte *dst_start,
4001
+ const byte *src, const byte *src_end,
4002
+ struct LznaState *lut);
4003
+ void LZNA_InitLookup(LznaState *lut);
4004
+
4005
+ struct BitknitState;
4006
+
4007
+ void BitknitState_Init(BitknitState *bk);
4008
+ size_t Bitknit_Decode(const byte *src, const byte *src_end, byte *dst, byte *dst_end, byte *dst_start, BitknitState *bk);
4009
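+ // The LZNA and Bitknit decoders are only declared here; their implementations
+ // live in separate translation units of the library.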
+
4010
+
4011
+ void Kraken_CopyWholeMatch(byte *dst, uint32 offset, size_t length) {
4012
+ size_t i = 0;
4013
+ byte *src = dst - offset;
4014
+ if (offset >= 8) {
4015
+ for (; i + 8 <= length; i += 8)
4016
+ *(uint64*)(dst + i) = *(uint64*)(src + i);
4017
+ }
4018
+ for (; i < length; i++)
4019
+ dst[i] = src[i];
4020
+ }
4021
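+ // When offset < 8 the 64-bit copies would overlap their own output, so the
+ // match is copied byte by byte, which also replicates short repeating patterns
+ // correctly; otherwise 8 bytes are copied at a time with a per-byte tail.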
+
4022
+ bool Kraken_DecodeStep(struct KrakenDecoder *dec,
4023
+ byte *dst_start, int offset, size_t dst_bytes_left_in,
4024
+ const byte *src, size_t src_bytes_left) {
4025
+ const byte *src_in = src;
4026
+ const byte *src_end = src + src_bytes_left;
4027
+ KrakenQuantumHeader qhdr;
4028
+ int n;
4029
+
4030
+ if ((offset & 0x3FFFF) == 0) {
4031
+ src = Kraken_ParseHeader(&dec->hdr, src);
4032
+ if (!src)
4033
+ return false;
4034
+ }
4035
+
4036
+ bool is_kraken_decoder = (dec->hdr.decoder_type == 6 || dec->hdr.decoder_type == 10 || dec->hdr.decoder_type == 12);
4037
+
4038
+ int dst_bytes_left = (int)Min(is_kraken_decoder ? 0x40000 : 0x4000, dst_bytes_left_in);
4039
+
4040
+ if (dec->hdr.uncompressed) {
4041
+ if (src_end - src < dst_bytes_left) {
4042
+ dec->src_used = dec->dst_used = 0;
4043
+ return true;
4044
+ }
4045
+ memmove(dst_start + offset, src, dst_bytes_left);
4046
+ dec->src_used = (src - src_in) + dst_bytes_left;
4047
+ dec->dst_used = dst_bytes_left;
4048
+ return true;
4049
+ }
4050
+
4051
+ if (is_kraken_decoder) {
4052
+ src = Kraken_ParseQuantumHeader(&qhdr, src, dec->hdr.use_checksums);
4053
+ } else {
4054
+ src = LZNA_ParseQuantumHeader(&qhdr, src, dec->hdr.use_checksums, dst_bytes_left);
4055
+ }
4056
+
4057
+ if (!src || src > src_end)
4058
+ return false;
4059
+
4060
+ // Too few bytes in buffer to make any progress?
4061
+ if ((uintptr_t)(src_end - src) < qhdr.compressed_size) {
4062
+ dec->src_used = dec->dst_used = 0;
4063
+ return true;
4064
+ }
4065
+
4066
+ if (qhdr.compressed_size > (uint32)dst_bytes_left)
4067
+ return false;
4068
+
4069
+ if (qhdr.compressed_size == 0) {
4070
+ if (qhdr.whole_match_distance != 0) {
4071
+ if (qhdr.whole_match_distance > (uint32)offset)
4072
+ return false;
4073
+ Kraken_CopyWholeMatch(dst_start + offset, qhdr.whole_match_distance, dst_bytes_left);
4074
+ } else {
4075
+ memset(dst_start + offset, qhdr.checksum, dst_bytes_left);
4076
+ }
4077
+ dec->src_used = (src - src_in);
4078
+ dec->dst_used = dst_bytes_left;
4079
+ return true;
4080
+ }
4081
+
4082
+ if (dec->hdr.use_checksums &&
4083
+ (Kraken_GetCrc(src, qhdr.compressed_size) & 0xFFFFFF) != qhdr.checksum)
4084
+ return false;
4085
+
4086
+ if (qhdr.compressed_size == dst_bytes_left) {
4087
+ memmove(dst_start + offset, src, dst_bytes_left);
4088
+ dec->src_used = (src - src_in) + dst_bytes_left;
4089
+ dec->dst_used = dst_bytes_left;
4090
+ return true;
4091
+ }
4092
+
4093
+ if (dec->hdr.decoder_type == 6) {
4094
+ n = Kraken_DecodeQuantum(dst_start + offset, dst_start + offset + dst_bytes_left, dst_start,
4095
+ src, src + qhdr.compressed_size,
4096
+ dec->scratch, dec->scratch + dec->scratch_size);
4097
+ } else if (dec->hdr.decoder_type == 5) {
4098
+ if (dec->hdr.restart_decoder) {
4099
+ dec->hdr.restart_decoder = false;
4100
+ LZNA_InitLookup((struct LznaState*)dec->scratch);
4101
+ }
4102
+ n = LZNA_DecodeQuantum(dst_start + offset, dst_start + offset + dst_bytes_left, dst_start,
4103
+ src, src + qhdr.compressed_size,
4104
+ (struct LznaState*)dec->scratch);
4105
+ } else if (dec->hdr.decoder_type == 11) {
4106
+ if (dec->hdr.restart_decoder) {
4107
+ dec->hdr.restart_decoder = false;
4108
+ BitknitState_Init((struct BitknitState*)dec->scratch);
4109
+ }
4110
+ n = (int)Bitknit_Decode(src, src + qhdr.compressed_size, dst_start + offset, dst_start + offset + dst_bytes_left, dst_start, (struct BitknitState*)dec->scratch);
4111
+
4112
+ } else if (dec->hdr.decoder_type == 10) {
4113
+ n = Mermaid_DecodeQuantum(dst_start + offset, dst_start + offset + dst_bytes_left, dst_start,
4114
+ src, src + qhdr.compressed_size,
4115
+ dec->scratch, dec->scratch + dec->scratch_size);
4116
+ } else if (dec->hdr.decoder_type == 12) {
4117
+ n = Leviathan_DecodeQuantum(dst_start + offset, dst_start + offset + dst_bytes_left, dst_start,
4118
+ src, src + qhdr.compressed_size,
4119
+ dec->scratch, dec->scratch + dec->scratch_size);
4120
+ } else {
4121
+ return false;
4122
+ }
4123
+
4124
+ if (n != qhdr.compressed_size)
4125
+ return false;
4126
+
4127
+ dec->src_used = (src - src_in) + n;
4128
+ dec->dst_used = dst_bytes_left;
4129
+ return true;
4130
+ }
4131
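+ // decoder_type dispatch used above: 6 = Kraken, 5 = LZNA, 10 = Mermaid/Selkie,
+ // 11 = Bitknit, 12 = Leviathan.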
+
4132
+ ssize_t Kraken_Decompress(const byte *src, size_t src_len, byte *dst, size_t dst_len) {
4133
+ KrakenDecoder *dec = Kraken_Create();
4134
+ ssize_t offset = 0;
4135
+ while (dst_len != 0) {
4136
+ if (!Kraken_DecodeStep(dec, dst, offset, dst_len, src, src_len))
4137
+ goto FAIL;
4138
+ if (dec->src_used == 0)
4139
+ goto FAIL;
4140
+ src += dec->src_used;
4141
+ src_len -= dec->src_used;
4142
+ dst_len -= dec->dst_used;
4143
+ offset += dec->dst_used;
4144
+ }
4145
+ if (src_len != 0)
4146
+ goto FAIL;
4147
+ Kraken_Destroy(dec);
4148
+ return offset;
4149
+ FAIL:
4150
+ Kraken_Destroy(dec);
4151
+ return -1;
4152
+ }
4153
+
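+ // Minimal usage sketch (not part of the original source): the caller must
+ // already know the decompressed size from its container format and supply a
+ // destination buffer at least that large. `payload`, `payload_len` and
+ // `raw_len` are hypothetical placeholders.
+ //
+ // byte *out = (byte *)malloc(raw_len);
+ // ssize_t written = Kraken_Decompress(payload, payload_len, out, raw_len);
+ // if (written < 0) { /* corrupt or truncated input */ }
+ // free(out);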