zstd-ruby 1.5.0.0 → 1.5.2.0

Files changed (65)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +2 -2
  3. data/README.md +1 -1
  4. data/ext/zstdruby/extconf.rb +2 -1
  5. data/ext/zstdruby/libzstd/Makefile +50 -175
  6. data/ext/zstdruby/libzstd/README.md +7 -1
  7. data/ext/zstdruby/libzstd/common/bitstream.h +24 -9
  8. data/ext/zstdruby/libzstd/common/compiler.h +89 -43
  9. data/ext/zstdruby/libzstd/common/entropy_common.c +11 -5
  10. data/ext/zstdruby/libzstd/common/error_private.h +79 -0
  11. data/ext/zstdruby/libzstd/common/fse.h +2 -1
  12. data/ext/zstdruby/libzstd/common/fse_decompress.c +1 -1
  13. data/ext/zstdruby/libzstd/common/huf.h +24 -22
  14. data/ext/zstdruby/libzstd/common/mem.h +18 -0
  15. data/ext/zstdruby/libzstd/common/pool.c +11 -6
  16. data/ext/zstdruby/libzstd/common/pool.h +2 -2
  17. data/ext/zstdruby/libzstd/common/portability_macros.h +137 -0
  18. data/ext/zstdruby/libzstd/common/xxhash.c +5 -805
  19. data/ext/zstdruby/libzstd/common/xxhash.h +5568 -167
  20. data/ext/zstdruby/libzstd/common/zstd_internal.h +95 -92
  21. data/ext/zstdruby/libzstd/common/zstd_trace.h +12 -3
  22. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  23. data/ext/zstdruby/libzstd/compress/fse_compress.c +63 -27
  24. data/ext/zstdruby/libzstd/compress/huf_compress.c +537 -104
  25. data/ext/zstdruby/libzstd/compress/zstd_compress.c +307 -373
  26. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +174 -83
  27. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +4 -3
  28. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +3 -1
  29. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +15 -14
  30. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +4 -3
  31. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +41 -27
  32. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +295 -120
  33. data/ext/zstdruby/libzstd/compress/zstd_fast.c +309 -130
  34. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +482 -562
  35. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +9 -7
  36. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  37. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +4 -1
  38. data/ext/zstdruby/libzstd/compress/zstd_opt.c +249 -148
  39. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +76 -38
  40. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +4 -1
  41. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +727 -189
  42. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +585 -0
  43. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +85 -22
  44. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +744 -220
  45. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +8 -2
  46. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +34 -3
  47. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +23 -3
  48. data/ext/zstdruby/libzstd/dictBuilder/cover.c +9 -2
  49. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +11 -4
  50. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +101 -30
  51. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +2 -6
  52. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +3 -7
  53. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +3 -7
  54. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +3 -7
  55. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +3 -7
  56. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +3 -7
  57. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +3 -7
  58. data/ext/zstdruby/libzstd/libzstd.mk +203 -0
  59. data/ext/zstdruby/libzstd/libzstd.pc.in +1 -0
  60. data/ext/zstdruby/libzstd/module.modulemap +25 -0
  61. data/ext/zstdruby/libzstd/zdict.h +4 -4
  62. data/ext/zstdruby/libzstd/zstd.h +179 -136
  63. data/ext/zstdruby/zstdruby.c +2 -2
  64. data/lib/zstd-ruby/version.rb +1 -1
  65. metadata +11 -6
data/ext/zstdruby/libzstd/compress/huf_compress.c
@@ -53,6 +53,28 @@ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxS
  /* *******************************************************
  * HUF : Huffman block compression
  *********************************************************/
+ #define HUF_WORKSPACE_MAX_ALIGNMENT 8
+
+ static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePtr, size_t align)
+ {
+ size_t const mask = align - 1;
+ size_t const rem = (size_t)workspace & mask;
+ size_t const add = (align - rem) & mask;
+ BYTE* const aligned = (BYTE*)workspace + add;
+ assert((align & (align - 1)) == 0); /* pow 2 */
+ assert(align <= HUF_WORKSPACE_MAX_ALIGNMENT);
+ if (*workspaceSizePtr >= add) {
+ assert(add < align);
+ assert(((size_t)aligned & mask) == 0);
+ *workspaceSizePtr -= add;
+ return aligned;
+ } else {
+ *workspaceSizePtr = 0;
+ return NULL;
+ }
+ }
+
+
  /* HUF_compressWeights() :
  * Same as FSE_compress(), but dedicated to huff0's weights compression.
  * The use case needs much less stack memory.
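A minimal standalone sketch of the round-up arithmetic that HUF_alignUpWorkspace introduces (the names and types below are local stand-ins, not the zstd internals): the pointer is advanced to the next multiple of align, and the skipped bytes are deducted from the remaining workspace size.

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Round ptr up to the next multiple of align (a power of two) and shrink
     * *sizePtr by the bytes skipped; returns NULL if the workspace is too small. */
    static void* align_up(void* ptr, size_t* sizePtr, size_t align)
    {
        size_t const mask = align - 1;
        size_t const add  = (size_t)((align - ((uintptr_t)ptr & mask)) & mask); /* 0 .. align-1 */
        assert((align & (align - 1)) == 0);
        if (*sizePtr < add) { *sizePtr = 0; return NULL; }
        *sizePtr -= add;
        return (char*)ptr + add;
    }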
@@ -75,7 +97,7 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT

  unsigned maxSymbolValue = HUF_TABLELOG_MAX;
  U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
- HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)workspace;
+ HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));

  if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENERIC);

@@ -106,6 +128,40 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT
  return (size_t)(op-ostart);
  }

+ static size_t HUF_getNbBits(HUF_CElt elt)
+ {
+ return elt & 0xFF;
+ }
+
+ static size_t HUF_getNbBitsFast(HUF_CElt elt)
+ {
+ return elt;
+ }
+
+ static size_t HUF_getValue(HUF_CElt elt)
+ {
+ return elt & ~0xFF;
+ }
+
+ static size_t HUF_getValueFast(HUF_CElt elt)
+ {
+ return elt;
+ }
+
+ static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits)
+ {
+ assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX);
+ *elt = nbBits;
+ }
+
+ static void HUF_setValue(HUF_CElt* elt, size_t value)
+ {
+ size_t const nbBits = HUF_getNbBits(*elt);
+ if (nbBits > 0) {
+ assert((value >> nbBits) == 0);
+ *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits);
+ }
+ }

  typedef struct {
  HUF_CompressWeightsWksp wksp;
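The accessors added above treat each HUF_CElt as a single machine word packing the pair (nbBits, value): the low 8 bits hold nbBits and the code value is pre-shifted into the top nbBits, so the encoder can OR it straight into a bit container. A hedged sketch of that layout with a local stand-in type:

    #include <assert.h>
    #include <stddef.h>

    typedef size_t CElt;  /* stand-in for HUF_CElt */

    static CElt pack(size_t nbBits, size_t value)
    {
        CElt elt = nbBits;                            /* low 8 bits: nbBits */
        assert(nbBits > 0 && (value >> nbBits) == 0);
        elt |= value << (sizeof(CElt) * 8 - nbBits);  /* top nbBits: the code value */
        return elt;
    }

    static size_t nb_bits(CElt elt)  { return elt & 0xFF; }           /* like HUF_getNbBits */
    static size_t top_bits(CElt elt) { return elt & ~(size_t)0xFF; }  /* like HUF_getValue */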
@@ -117,9 +173,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
  const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog,
  void* workspace, size_t workspaceSize)
  {
+ HUF_CElt const* const ct = CTable + 1;
  BYTE* op = (BYTE*)dst;
  U32 n;
- HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)workspace;
+ HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));

  /* check conditions */
  if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
@@ -130,9 +187,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
  for (n=1; n<huffLog+1; n++)
  wksp->bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
  for (n=0; n<maxSymbolValue; n++)
- wksp->huffWeight[n] = wksp->bitsToWeight[CTable[n].nbBits];
+ wksp->huffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])];

  /* attempt weights compression by FSE */
+ if (maxDstSize < 1) return ERROR(dstSize_tooSmall);
  { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) );
  if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed */
  op[0] = (BYTE)hSize;
@@ -166,6 +224,7 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
  U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */
  U32 tableLog = 0;
  U32 nbSymbols = 0;
+ HUF_CElt* const ct = CTable + 1;

  /* get symbol weights */
  CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
@@ -175,6 +234,8 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
  if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
  if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);

+ CTable[0] = tableLog;
+
  /* Prepare base value per rank */
  { U32 n, nextRankStart = 0;
  for (n=1; n<=tableLog; n++) {
@@ -186,13 +247,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
  /* fill nbBits */
  { U32 n; for (n=0; n<nbSymbols; n++) {
  const U32 w = huffWeight[n];
- CTable[n].nbBits = (BYTE)(tableLog + 1 - w) & -(w != 0);
+ HUF_setNbBits(ct + n, (BYTE)(tableLog + 1 - w) & -(w != 0));
  } }

  /* fill val */
  { U16 nbPerRank[HUF_TABLELOG_MAX+2] = {0}; /* support w=0=>n=tableLog+1 */
  U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
- { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; }
+ { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[HUF_getNbBits(ct[n])]++; }
  /* determine stating value per rank */
  valPerRank[tableLog+1] = 0; /* for w==0 */
  { U16 min = 0;
@@ -202,18 +263,18 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
  min >>= 1;
  } }
  /* assign value within rank, symbol order */
- { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; }
+ { U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); }
  }

  *maxSymbolValuePtr = nbSymbols - 1;
  return readSize;
  }

- U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue)
+ U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
  {
- const HUF_CElt* table = (const HUF_CElt*)symbolTable;
+ const HUF_CElt* ct = CTable + 1;
  assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
- return table[symbolValue].nbBits;
+ return (U32)HUF_getNbBits(ct[symbolValue]);
  }


@@ -367,22 +428,118 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
  }

  typedef struct {
- U32 base;
- U32 curr;
+ U16 base;
+ U16 curr;
  } rankPos;

  typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];

- #define RANK_POSITION_TABLE_SIZE 32
+ /* Number of buckets available for HUF_sort() */
+ #define RANK_POSITION_TABLE_SIZE 192

  typedef struct {
  huffNodeTable huffNodeTbl;
  rankPos rankPosition[RANK_POSITION_TABLE_SIZE];
  } HUF_buildCTable_wksp_tables;

+ /* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing.
+ * Strategy is to use as many buckets as possible for representing distinct
+ * counts while using the remainder to represent all "large" counts.
+ *
+ * To satisfy this requirement for 192 buckets, we can do the following:
+ * Let buckets 0-166 represent distinct counts of [0, 166]
+ * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
+ */
+ #define RANK_POSITION_MAX_COUNT_LOG 32
+ #define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */
+ #define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */
+
+ /* Return the appropriate bucket index for a given count. See definition of
+ * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
+ */
+ static U32 HUF_getIndex(U32 const count) {
+ return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
+ ? count
+ : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
+ }
+
+ /* Helper swap function for HUF_quickSortPartition() */
+ static void HUF_swapNodes(nodeElt* a, nodeElt* b) {
+ nodeElt tmp = *a;
+ *a = *b;
+ *b = tmp;
+ }
+
+ /* Returns 0 if the huffNode array is not sorted by descending count */
+ MEM_STATIC int HUF_isSorted(nodeElt huffNode[], U32 const maxSymbolValue1) {
+ U32 i;
+ for (i = 1; i < maxSymbolValue1; ++i) {
+ if (huffNode[i].count > huffNode[i-1].count) {
+ return 0;
+ }
+ }
+ return 1;
+ }
+
+ /* Insertion sort by descending order */
+ HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) {
+ int i;
+ int const size = high-low+1;
+ huffNode += low;
+ for (i = 1; i < size; ++i) {
+ nodeElt const key = huffNode[i];
+ int j = i - 1;
+ while (j >= 0 && huffNode[j].count < key.count) {
+ huffNode[j + 1] = huffNode[j];
+ j--;
+ }
+ huffNode[j + 1] = key;
+ }
+ }
+
+ /* Pivot helper function for quicksort. */
+ static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) {
+ /* Simply select rightmost element as pivot. "Better" selectors like
+ * median-of-three don't experimentally appear to have any benefit.
+ */
+ U32 const pivot = arr[high].count;
+ int i = low - 1;
+ int j = low;
+ for ( ; j < high; j++) {
+ if (arr[j].count > pivot) {
+ i++;
+ HUF_swapNodes(&arr[i], &arr[j]);
+ }
+ }
+ HUF_swapNodes(&arr[i + 1], &arr[high]);
+ return i + 1;
+ }
+
+ /* Classic quicksort by descending with partially iterative calls
+ * to reduce worst case callstack size.
+ */
+ static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) {
+ int const kInsertionSortThreshold = 8;
+ if (high - low < kInsertionSortThreshold) {
+ HUF_insertionSort(arr, low, high);
+ return;
+ }
+ while (low < high) {
+ int const idx = HUF_quickSortPartition(arr, low, high);
+ if (idx - low < high - idx) {
+ HUF_simpleQuickSort(arr, low, idx - 1);
+ low = idx + 1;
+ } else {
+ HUF_simpleQuickSort(arr, idx + 1, high);
+ high = idx - 1;
+ }
+ }
+ }
+
  /**
  * HUF_sort():
  * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order.
+ * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket.
  *
  * @param[out] huffNode Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled.
  * Must have (maxSymbolValue + 1) entries.
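The bucketing rule introduced above can be reproduced as a standalone sketch (highbit() is a local replacement for BIT_highbit32(), and the constant mirrors RANK_POSITION_LOG_BUCKETS_BEGIN): small counts each get their own bucket, while larger counts collapse into log2 buckets placed above the distinct range.

    #include <stdint.h>

    static unsigned highbit(uint32_t v) { unsigned n = 0; while (v >>= 1) n++; return n; }

    #define LOG_BUCKETS_BEGIN 158u  /* mirrors RANK_POSITION_LOG_BUCKETS_BEGIN */

    /* Same shape as HUF_getIndex(): identity below the cutoff, log2 bucketing above it. */
    static uint32_t bucket_index(uint32_t count)
    {
        uint32_t const cutoff = LOG_BUCKETS_BEGIN + highbit(LOG_BUCKETS_BEGIN);
        return (count < cutoff) ? count : highbit(count) + LOG_BUCKETS_BEGIN;
    }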
@@ -390,44 +547,52 @@ typedef struct {
  * @param[in] maxSymbolValue Maximum symbol value.
  * @param rankPosition This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries.
  */
- static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition)
- {
- int n;
- int const maxSymbolValue1 = (int)maxSymbolValue + 1;
+ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) {
+ U32 n;
+ U32 const maxSymbolValue1 = maxSymbolValue+1;

  /* Compute base and set curr to base.
- * For symbol s let lowerRank = BIT_highbit32(count[n]+1) and rank = lowerRank + 1.
- * Then 2^lowerRank <= count[n]+1 <= 2^rank.
+ * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1.
+ * See HUF_getIndex to see bucketing strategy.
  * We attribute each symbol to lowerRank's base value, because we want to know where
  * each rank begins in the output, so for rank R we want to count ranks R+1 and above.
  */
  ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE);
  for (n = 0; n < maxSymbolValue1; ++n) {
- U32 lowerRank = BIT_highbit32(count[n] + 1);
+ U32 lowerRank = HUF_getIndex(count[n]);
+ assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1);
  rankPosition[lowerRank].base++;
  }
+
  assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0);
+ /* Set up the rankPosition table */
  for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) {
  rankPosition[n-1].base += rankPosition[n].base;
  rankPosition[n-1].curr = rankPosition[n-1].base;
  }
- /* Sort */
+
+ /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */
  for (n = 0; n < maxSymbolValue1; ++n) {
  U32 const c = count[n];
- U32 const r = BIT_highbit32(c+1) + 1;
- U32 pos = rankPosition[r].curr++;
- /* Insert into the correct position in the rank.
- * We have at most 256 symbols, so this insertion should be fine.
- */
- while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) {
- huffNode[pos] = huffNode[pos-1];
- pos--;
- }
+ U32 const r = HUF_getIndex(c) + 1;
+ U32 const pos = rankPosition[r].curr++;
+ assert(pos < maxSymbolValue1);
  huffNode[pos].count = c;
  huffNode[pos].byte = (BYTE)n;
  }
- }

+ /* Sort each bucket. */
+ for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {
+ U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base;
+ U32 const bucketStartIdx = rankPosition[n].base;
+ if (bucketSize > 1) {
+ assert(bucketStartIdx < maxSymbolValue1);
+ HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1);
+ }
+ }
+
+ assert(HUF_isSorted(huffNode, maxSymbolValue1));
+ }

  /** HUF_buildCTable_wksp() :
  * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
@@ -490,6 +655,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
  */
  static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits)
  {
+ HUF_CElt* const ct = CTable + 1;
  /* fill result into ctable (val, nbBits) */
  int n;
  U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
@@ -505,20 +671,20 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
  min >>= 1;
  } }
  for (n=0; n<alphabetSize; n++)
- CTable[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */
+ HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */
  for (n=0; n<alphabetSize; n++)
- CTable[n].val = valPerRank[CTable[n].nbBits]++; /* assign value within rank, symbol order */
+ HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); /* assign value within rank, symbol order */
+ CTable[0] = maxNbBits;
  }

- size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+ size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
  {
- HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)workSpace;
+ HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
  nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
  nodeElt* const huffNode = huffNode0+1;
  int nonNullRank;

  /* safety checks */
- if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
  if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
  return ERROR(workSpace_tooSmall);
  if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
@@ -536,96 +702,334 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbo
  maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
  if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */

- HUF_buildCTableFromTree(tree, huffNode, nonNullRank, maxSymbolValue, maxNbBits);
+ HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits);

  return maxNbBits;
  }

  size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
  {
+ HUF_CElt const* ct = CTable + 1;
  size_t nbBits = 0;
  int s;
  for (s = 0; s <= (int)maxSymbolValue; ++s) {
- nbBits += CTable[s].nbBits * count[s];
+ nbBits += HUF_getNbBits(ct[s]) * count[s];
  }
  return nbBits >> 3;
  }

  int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+ HUF_CElt const* ct = CTable + 1;
  int bad = 0;
  int s;
  for (s = 0; s <= (int)maxSymbolValue; ++s) {
- bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
+ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0);
  }
  return !bad;
  }

  size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }

+ /** HUF_CStream_t:
+ * Huffman uses its own BIT_CStream_t implementation.
+ * There are three major differences from BIT_CStream_t:
+ * 1. HUF_addBits() takes a HUF_CElt (size_t) which is
+ * the pair (nbBits, value) in the format:
+ * format:
+ * - Bits [0, 4) = nbBits
+ * - Bits [4, 64 - nbBits) = 0
+ * - Bits [64 - nbBits, 64) = value
+ * 2. The bitContainer is built from the upper bits and
+ * right shifted. E.g. to add a new value of N bits
+ * you right shift the bitContainer by N, then or in
+ * the new value into the N upper bits.
+ * 3. The bitstream has two bit containers. You can add
+ * bits to the second container and merge them into
+ * the first container.
+ */
+
+ #define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8)
+
+ typedef struct {
+ size_t bitContainer[2];
+ size_t bitPos[2];
+
+ BYTE* startPtr;
+ BYTE* ptr;
+ BYTE* endPtr;
+ } HUF_CStream_t;
+
+ /**! HUF_initCStream():
+ * Initializes the bitstream.
+ * @returns 0 or an error code.
+ */
+ static size_t HUF_initCStream(HUF_CStream_t* bitC,
+ void* startPtr, size_t dstCapacity)
+ {
+ ZSTD_memset(bitC, 0, sizeof(*bitC));
+ bitC->startPtr = (BYTE*)startPtr;
+ bitC->ptr = bitC->startPtr;
+ bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]);
+ if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall);
+ return 0;
+ }
+
+ /*! HUF_addBits():
+ * Adds the symbol stored in HUF_CElt elt to the bitstream.
+ *
+ * @param elt The element we're adding. This is a (nbBits, value) pair.
+ * See the HUF_CStream_t docs for the format.
+ * @param idx Insert into the bitstream at this idx.
+ * @param kFast This is a template parameter. If the bitstream is guaranteed
+ * to have at least 4 unused bits after this call it may be 1,
+ * otherwise it must be 0. HUF_addBits() is faster when fast is set.
+ */
+ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast)
+ {
+ assert(idx <= 1);
+ assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX);
+ /* This is efficient on x86-64 with BMI2 because shrx
+ * only reads the low 6 bits of the register. The compiler
+ * knows this and elides the mask. When fast is set,
+ * every operation can use the same value loaded from elt.
+ */
+ bitC->bitContainer[idx] >>= HUF_getNbBits(elt);
+ bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt);
+ /* We only read the low 8 bits of bitC->bitPos[idx] so it
+ * doesn't matter that the high bits have noise from the value.
+ */
+ bitC->bitPos[idx] += HUF_getNbBitsFast(elt);
+ assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+ /* The last 4-bits of elt are dirty if fast is set,
+ * so we must not be overwriting bits that have already been
+ * inserted into the bit container.
+ */
+ #if DEBUGLEVEL >= 1
+ {
+ size_t const nbBits = HUF_getNbBits(elt);
+ size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1;
+ (void)dirtyBits;
+ /* Middle bits are 0. */
+ assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
+ /* We didn't overwrite any bits in the bit container. */
+ assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+ (void)dirtyBits;
+ }
+ #endif
+ }
+
+ FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC)
+ {
+ bitC->bitContainer[1] = 0;
+ bitC->bitPos[1] = 0;
+ }
+
+ /*! HUF_mergeIndex1() :
+ * Merges the bit container @ index 1 into the bit container @ index 0
+ * and zeros the bit container @ index 1.
+ */
+ FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC)
+ {
+ assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER);
+ bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF);
+ bitC->bitContainer[0] |= bitC->bitContainer[1];
+ bitC->bitPos[0] += bitC->bitPos[1];
+ assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+ }
+
+ /*! HUF_flushBits() :
+ * Flushes the bits in the bit container @ index 0.
+ *
+ * @post bitPos will be < 8.
+ * @param kFast If kFast is set then we must know a-priori that
+ * the bit container will not overflow.
+ */
+ FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast)
+ {
+ /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */
+ size_t const nbBits = bitC->bitPos[0] & 0xFF;
+ size_t const nbBytes = nbBits >> 3;
+ /* The top nbBits bits of bitContainer are the ones we need. */
+ size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits);
+ /* Mask bitPos to account for the bytes we consumed. */
+ bitC->bitPos[0] &= 7;
+ assert(nbBits > 0);
+ assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8);
+ assert(bitC->ptr <= bitC->endPtr);
+ MEM_writeLEST(bitC->ptr, bitContainer);
+ bitC->ptr += nbBytes;
+ assert(!kFast || bitC->ptr <= bitC->endPtr);
+ if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
+ /* bitContainer doesn't need to be modified because the leftover
+ * bits are already the top bitPos bits. And we don't care about
+ * noise in the lower values.
+ */
+ }
+
+ /*! HUF_endMark()
+ * @returns The Huffman stream end mark: A 1-bit value = 1.
+ */
+ static HUF_CElt HUF_endMark(void)
+ {
+ HUF_CElt endMark;
+ HUF_setNbBits(&endMark, 1);
+ HUF_setValue(&endMark, 1);
+ return endMark;
+ }
+
+ /*! HUF_closeCStream() :
+ * @return Size of CStream, in bytes,
+ * or 0 if it could not fit into dstBuffer */
+ static size_t HUF_closeCStream(HUF_CStream_t* bitC)
+ {
+ HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0);
+ HUF_flushBits(bitC, /* kFast */ 0);
+ {
+ size_t const nbBits = bitC->bitPos[0] & 0xFF;
+ if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+ return (bitC->ptr - bitC->startPtr) + (nbBits > 0);
+ }
+ }
+
  FORCE_INLINE_TEMPLATE void
- HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
+ HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast)
  {
- BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
+ HUF_addBits(bitCPtr, CTable[symbol], idx, fast);
  }

- #define HUF_FLUSHBITS(s) BIT_flushBits(s)
+ FORCE_INLINE_TEMPLATE void
+ HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC,
+ const BYTE* ip, size_t srcSize,
+ const HUF_CElt* ct,
+ int kUnroll, int kFastFlush, int kLastFast)
+ {
+ /* Join to kUnroll */
+ int n = (int)srcSize;
+ int rem = n % kUnroll;
+ if (rem > 0) {
+ for (; rem > 0; --rem) {
+ HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0);
+ }
+ HUF_flushBits(bitC, kFastFlush);
+ }
+ assert(n % kUnroll == 0);
+
+ /* Join to 2 * kUnroll */
+ if (n % (2 * kUnroll)) {
+ int u;
+ for (u = 1; u < kUnroll; ++u) {
+ HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1);
+ }
+ HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast);
+ HUF_flushBits(bitC, kFastFlush);
+ n -= kUnroll;
+ }
+ assert(n % (2 * kUnroll) == 0);
+
+ for (; n>0; n-= 2 * kUnroll) {
+ /* Encode kUnroll symbols into the bitstream @ index 0. */
+ int u;
+ for (u = 1; u < kUnroll; ++u) {
+ HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1);
+ }
+ HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast);
+ HUF_flushBits(bitC, kFastFlush);
+ /* Encode kUnroll symbols into the bitstream @ index 1.
+ * This allows us to start filling the bit container
+ * without any data dependencies.
+ */
+ HUF_zeroIndex1(bitC);
+ for (u = 1; u < kUnroll; ++u) {
+ HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1);
+ }
+ HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast);
+ /* Merge bitstream @ index 1 into the bitstream @ index 0 */
+ HUF_mergeIndex1(bitC);
+ HUF_flushBits(bitC, kFastFlush);
+ }
+ assert(n == 0);
+
+ }

- #define HUF_FLUSHBITS_1(stream) \
- if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream)
+ /**
+ * Returns a tight upper bound on the output space needed by Huffman
+ * with 8 bytes buffer to handle over-writes. If the output is at least
+ * this large we don't need to do bounds checks during Huffman encoding.
+ */
+ static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog)
+ {
+ return ((srcSize * tableLog) >> 3) + 8;
+ }

- #define HUF_FLUSHBITS_2(stream) \
- if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream)

  FORCE_INLINE_TEMPLATE size_t
  HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  const HUF_CElt* CTable)
  {
+ U32 const tableLog = (U32)CTable[0];
+ HUF_CElt const* ct = CTable + 1;
  const BYTE* ip = (const BYTE*) src;
  BYTE* const ostart = (BYTE*)dst;
  BYTE* const oend = ostart + dstSize;
  BYTE* op = ostart;
- size_t n;
- BIT_CStream_t bitC;
+ HUF_CStream_t bitC;

  /* init */
  if (dstSize < 8) return 0; /* not enough space to compress */
- { size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op));
+ { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op));
  if (HUF_isError(initErr)) return 0; }

- n = srcSize & ~3; /* join to mod 4 */
- switch (srcSize & 3)
- {
- case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable);
- HUF_FLUSHBITS_2(&bitC);
- /* fall-through */
- case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable);
- HUF_FLUSHBITS_1(&bitC);
- /* fall-through */
- case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable);
- HUF_FLUSHBITS(&bitC);
- /* fall-through */
- case 0 : /* fall-through */
- default: break;
- }
-
- for (; n>0; n-=4) { /* note : n&3==0 at this stage */
- HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
- HUF_FLUSHBITS_1(&bitC);
- HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
- HUF_FLUSHBITS_2(&bitC);
- HUF_encodeSymbol(&bitC, ip[n- 3], CTable);
- HUF_FLUSHBITS_1(&bitC);
- HUF_encodeSymbol(&bitC, ip[n- 4], CTable);
- HUF_FLUSHBITS(&bitC);
+ if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11)
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 2 : 4, /* kFast */ 0, /* kLastFast */ 0);
+ else {
+ if (MEM_32bits()) {
+ switch (tableLog) {
+ case 11:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 10: ZSTD_FALLTHROUGH;
+ case 9: ZSTD_FALLTHROUGH;
+ case 8:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1);
+ break;
+ case 7: ZSTD_FALLTHROUGH;
+ default:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1);
+ break;
+ }
+ } else {
+ switch (tableLog) {
+ case 11:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 10:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1);
+ break;
+ case 9:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 8:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 7:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 6: ZSTD_FALLTHROUGH;
+ default:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1);
+ break;
+ }
+ }
  }
+ assert(bitC.ptr <= bitC.endPtr);

- return BIT_closeCStream(&bitC);
+ return HUF_closeCStream(&bitC);
  }

  #if DYNAMIC_BMI2

- static TARGET_ATTRIBUTE("bmi2") size_t
+ static BMI2_TARGET_ATTRIBUTE size_t
  HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  const HUF_CElt* CTable)
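A single-container reduction of the HUF_CStream_t idea above, as a hedged sketch (the real code keeps two containers and flushes via MEM_writeLEST; this version assumes a little-endian target and at least 8 writable bytes of slack, which the real encoder's dstSize < 8 guard provides):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    typedef struct {
        uint64_t container;  /* valid bits live in the top `pos` bits */
        unsigned pos;        /* number of valid bits */
        uint8_t* ptr;        /* next output byte */
    } MiniCStream;

    /* valueInTopBits is pre-shifted into the top nbBits, like a HUF_CElt value. */
    static void add_bits(MiniCStream* s, uint64_t valueInTopBits, unsigned nbBits)
    {
        s->container >>= nbBits;         /* make room at the top */
        s->container |= valueInTopBits;  /* OR the new code into the top bits */
        s->pos += nbBits;
        assert(s->pos <= 64);
    }

    static void flush_whole_bytes(MiniCStream* s)
    {
        unsigned const nbBytes = s->pos >> 3;
        uint64_t aligned;
        assert(s->pos > 0);
        aligned = s->container >> (64 - s->pos);   /* move valid bits down to the LSB */
        memcpy(s->ptr, &aligned, sizeof(aligned)); /* little-endian store, 8-byte slack assumed */
        s->ptr += nbBytes;
        s->pos &= 7;                               /* leftover bits remain at the top of container */
    }

The tableLog-based dispatch that follows in the diff leans on HUF_tightCompressBound(srcSize, tableLog) = ((srcSize * tableLog) >> 3) + 8 bytes: when the destination is at least that large, the unrolled loops can use the fast flush path without per-symbol bounds checks.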
@@ -667,9 +1071,13 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,

  size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
  {
- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+ return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
  }

+ size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
+ {
+ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+ }

  static size_t
  HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
@@ -689,8 +1097,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,

  assert(op <= oend);
  { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
- if (cSize==0) return 0;
- assert(cSize <= 65535);
+ if (cSize == 0 || cSize > 65535) return 0;
  MEM_writeLE16(ostart, (U16)cSize);
  op += cSize;
  }
@@ -698,8 +1105,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
  ip += segmentSize;
  assert(op <= oend);
  { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
- if (cSize==0) return 0;
- assert(cSize <= 65535);
+ if (cSize == 0 || cSize > 65535) return 0;
  MEM_writeLE16(ostart+2, (U16)cSize);
  op += cSize;
  }
@@ -707,8 +1113,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
  ip += segmentSize;
  assert(op <= oend);
  { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
- if (cSize==0) return 0;
- assert(cSize <= 65535);
+ if (cSize == 0 || cSize > 65535) return 0;
  MEM_writeLE16(ostart+4, (U16)cSize);
  op += cSize;
  }
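For context on the repeated `cSize > 65535` checks: the 4-stream layout starts with a 6-byte jump table holding the compressed sizes of the first three streams as little-endian 16-bit values, so a stream larger than 65535 bytes cannot be represented, and the function now returns 0 ("not compressible", caller falls back to storing the block raw) instead of relying on an assert. A small sketch of that constraint (local helper names, not the zstd ones):

    #include <stdint.h>
    #include <stddef.h>

    /* Little-endian 16-bit store, the effect MEM_writeLE16 has regardless of host endianness. */
    static void write_le16(uint8_t* p, uint16_t v) { p[0] = (uint8_t)v; p[1] = (uint8_t)(v >> 8); }

    /* A stream size fits the jump table only if 0 < cSize <= 65535. */
    static int stream_size_fits(size_t cSize) { return cSize != 0 && cSize <= 65535; }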
@@ -717,7 +1122,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
  assert(op <= oend);
  assert(ip <= iend);
  { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) );
- if (cSize==0) return 0;
+ if (cSize == 0 || cSize > 65535) return 0;
  op += cSize;
  }

@@ -726,7 +1131,12 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,

  size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
  {
- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+ return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+ }
+
+ size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
+ {
+ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
  }

  typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
@@ -750,35 +1160,38 @@ static size_t HUF_compressCTable_internal(

  typedef struct {
  unsigned count[HUF_SYMBOLVALUE_MAX + 1];
- HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1];
+ HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)];
  union {
  HUF_buildCTable_wksp_tables buildCTable_wksp;
  HUF_WriteCTableWksp writeCTable_wksp;
+ U32 hist_wksp[HIST_WKSP_SIZE_U32];
  } wksps;
  } HUF_compress_tables_t;

+ #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
+ #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */
+
  /* HUF_compress_internal() :
  * `workSpace_align4` must be aligned on 4-bytes boundaries,
- * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U32 unsigned */
+ * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */
  static size_t
  HUF_compress_internal (void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  unsigned maxSymbolValue, unsigned huffLog,
  HUF_nbStreams_e nbStreams,
- void* workSpace_align4, size_t wkspSize,
+ void* workSpace, size_t wkspSize,
  HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
- const int bmi2)
+ const int bmi2, unsigned suspectUncompressible)
  {
- HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace_align4;
+ HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t));
  BYTE* const ostart = (BYTE*)dst;
  BYTE* const oend = ostart + dstSize;
  BYTE* op = ostart;

- HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE);
- assert(((size_t)workSpace_align4 & 3) == 0); /* must be aligned on 4-bytes boundaries */
+ HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);

  /* checks & inits */
- if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall);
+ if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall);
  if (!srcSize) return 0; /* Uncompressed */
  if (!dstSize) return 0; /* cannot fit anything within dst budget */
  if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */
@@ -794,8 +1207,23 @@ HUF_compress_internal (void* dst, size_t dstSize,
  nbStreams, oldHufTable, bmi2);
  }

+ /* If uncompressible data is suspected, do a smaller sampling first */
+ DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2);
+ if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
+ size_t largestTotal = 0;
+ { unsigned maxSymbolValueBegin = maxSymbolValue;
+ CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
+ largestTotal += largestBegin;
+ }
+ { unsigned maxSymbolValueEnd = maxSymbolValue;
+ CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
+ largestTotal += largestEnd;
+ }
+ if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0; /* heuristic : probably not compressible enough */
+ }
+
  /* Scan input and build symbol stats */
- { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace_align4, wkspSize) );
+ { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) );
  if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */
  if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */
  }
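The early-exit threshold in the sampling branch above works out to ((2 * 4096) >> 7) + 4 = 68: if the most frequent byte of the first 4 KB plus the most frequent byte of the last 4 KB totals 68 occurrences or fewer, the distribution is close to uniform and Huffman compression is skipped. A hedged sketch of that gate (local names; the real counts come from HIST_count_simple()):

    #include <stddef.h>

    #define SAMPLE_SIZE  4096u
    #define SAMPLE_RATIO 10u   /* only sample blocks of at least SAMPLE_SIZE * SAMPLE_RATIO bytes */

    static int probably_incompressible(size_t srcSize, size_t largestBegin, size_t largestEnd)
    {
        if (srcSize < (size_t)SAMPLE_SIZE * SAMPLE_RATIO) return 0;  /* too small: run the full histogram */
        return (largestBegin + largestEnd) <= ((2 * SAMPLE_SIZE) >> 7) + 4;  /* threshold == 68 */
    }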
@@ -820,9 +1248,12 @@ HUF_compress_internal (void* dst, size_t dstSize,
  &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
  CHECK_F(maxBits);
  huffLog = (U32)maxBits;
- /* Zero unused symbols in CTable, so we can check it for validity */
- ZSTD_memset(table->CTable + (maxSymbolValue + 1), 0,
- sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt)));
+ }
+ /* Zero unused symbols in CTable, so we can check it for validity */
+ {
+ size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue);
+ size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt);
+ ZSTD_memset(table->CTable + ctableSize, 0, unusedSize);
  }

  /* Write table description header */
@@ -859,19 +1290,20 @@ size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
  return HUF_compress_internal(dst, dstSize, src, srcSize,
  maxSymbolValue, huffLog, HUF_singleStream,
  workSpace, wkspSize,
- NULL, NULL, 0, 0 /*bmi2*/);
+ NULL, NULL, 0, 0 /*bmi2*/, 0);
  }

  size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  unsigned maxSymbolValue, unsigned huffLog,
  void* workSpace, size_t wkspSize,
- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat,
+ int bmi2, unsigned suspectUncompressible)
  {
  return HUF_compress_internal(dst, dstSize, src, srcSize,
  maxSymbolValue, huffLog, HUF_singleStream,
  workSpace, wkspSize, hufTable,
- repeat, preferRepeat, bmi2);
+ repeat, preferRepeat, bmi2, suspectUncompressible);
  }

  /* HUF_compress4X_repeat():
@@ -885,22 +1317,23 @@ size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
  return HUF_compress_internal(dst, dstSize, src, srcSize,
  maxSymbolValue, huffLog, HUF_fourStreams,
  workSpace, wkspSize,
- NULL, NULL, 0, 0 /*bmi2*/);
+ NULL, NULL, 0, 0 /*bmi2*/, 0);
  }

  /* HUF_compress4X_repeat():
  * compress input using 4 streams.
+ * consider skipping quickly
  * re-use an existing huffman compression table */
  size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  unsigned maxSymbolValue, unsigned huffLog,
  void* workSpace, size_t wkspSize,
- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible)
  {
  return HUF_compress_internal(dst, dstSize, src, srcSize,
  maxSymbolValue, huffLog, HUF_fourStreams,
  workSpace, wkspSize,
- hufTable, repeat, preferRepeat, bmi2);
+ hufTable, repeat, preferRepeat, bmi2, suspectUncompressible);
  }

  #ifndef ZSTD_NO_UNUSED_FUNCTIONS
@@ -918,7 +1351,7 @@ size_t HUF_compress1X (void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  unsigned maxSymbolValue, unsigned huffLog)
  {
- unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+ U64 workSpace[HUF_WORKSPACE_SIZE_U64];
  return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
  }

@@ -926,7 +1359,7 @@ size_t HUF_compress2 (void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  unsigned maxSymbolValue, unsigned huffLog)
  {
- unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+ U64 workSpace[HUF_WORKSPACE_SIZE_U64];
  return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
  }
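A note on the last two hunks: sizing the on-stack scratch buffer in 64-bit words keeps a comparable byte budget while giving the buffer the natural alignment of the size_t-wide CTable entries, so HUF_alignUpWorkspace normally has nothing to skip. A minimal sketch of the calling pattern, under stated assumptions (WORKSPACE_U64 is a hypothetical stand-in for HUF_WORKSPACE_SIZE_U64, and compress_fn only approximates the shape of HUF_compress1X_wksp / HUF_compress4X_wksp):

    #include <stdint.h>
    #include <stddef.h>

    #define WORKSPACE_U64 832u   /* hypothetical word count, not the real macro value */

    typedef size_t (*compress_fn)(void* dst, size_t dstSize, const void* src, size_t srcSize,
                                  unsigned maxSymbolValue, unsigned huffLog,
                                  void* workSpace, size_t wkspSize);

    static size_t compress_with_stack_wksp(compress_fn fn, void* dst, size_t dstSize,
                                           const void* src, size_t srcSize)
    {
        uint64_t workSpace[WORKSPACE_U64];  /* 8-byte granular scratch, well aligned on typical targets */
        return fn(dst, dstSize, src, srcSize, 255 /* typical maxSymbolValue */, 11 /* typical huffLog */,
                  workSpace, sizeof(workSpace));
    }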