zstd-ruby 1.5.0.0 → 1.5.1.0

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (62)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +2 -2
  3. data/README.md +1 -1
  4. data/ext/zstdruby/extconf.rb +1 -0
  5. data/ext/zstdruby/libzstd/Makefile +50 -175
  6. data/ext/zstdruby/libzstd/README.md +7 -1
  7. data/ext/zstdruby/libzstd/common/bitstream.h +24 -9
  8. data/ext/zstdruby/libzstd/common/compiler.h +89 -43
  9. data/ext/zstdruby/libzstd/common/entropy_common.c +11 -5
  10. data/ext/zstdruby/libzstd/common/error_private.h +79 -0
  11. data/ext/zstdruby/libzstd/common/fse.h +2 -1
  12. data/ext/zstdruby/libzstd/common/fse_decompress.c +1 -1
  13. data/ext/zstdruby/libzstd/common/huf.h +24 -22
  14. data/ext/zstdruby/libzstd/common/mem.h +18 -0
  15. data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
  16. data/ext/zstdruby/libzstd/common/xxhash.c +5 -805
  17. data/ext/zstdruby/libzstd/common/xxhash.h +5568 -167
  18. data/ext/zstdruby/libzstd/common/zstd_internal.h +92 -88
  19. data/ext/zstdruby/libzstd/common/zstd_trace.h +12 -3
  20. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  21. data/ext/zstdruby/libzstd/compress/fse_compress.c +63 -27
  22. data/ext/zstdruby/libzstd/compress/huf_compress.c +537 -104
  23. data/ext/zstdruby/libzstd/compress/zstd_compress.c +194 -278
  24. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +102 -44
  25. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +4 -3
  26. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +3 -1
  27. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +5 -4
  28. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +3 -2
  29. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +3 -3
  30. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +289 -114
  31. data/ext/zstdruby/libzstd/compress/zstd_fast.c +302 -123
  32. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +418 -502
  33. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +4 -4
  34. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  35. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +4 -1
  36. data/ext/zstdruby/libzstd/compress/zstd_opt.c +186 -108
  37. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +59 -29
  38. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +727 -189
  39. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
  40. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +85 -22
  41. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +744 -220
  42. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +8 -2
  43. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +34 -3
  44. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +23 -3
  45. data/ext/zstdruby/libzstd/dictBuilder/cover.c +9 -2
  46. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +11 -4
  47. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +99 -28
  48. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +2 -6
  49. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +3 -7
  50. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +3 -7
  51. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +3 -7
  52. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +3 -7
  53. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +3 -7
  54. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +3 -7
  55. data/ext/zstdruby/libzstd/libzstd.mk +185 -0
  56. data/ext/zstdruby/libzstd/libzstd.pc.in +1 -0
  57. data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
  58. data/ext/zstdruby/libzstd/zdict.h +4 -4
  59. data/ext/zstdruby/libzstd/zstd.h +179 -136
  60. data/ext/zstdruby/zstdruby.c +2 -2
  61. data/lib/zstd-ruby/version.rb +1 -1
  62. metadata +8 -3
data/ext/zstdruby/libzstd/compress/huf_compress.c
@@ -53,6 +53,28 @@ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxS
  /* *******************************************************
  * HUF : Huffman block compression
  *********************************************************/
+ #define HUF_WORKSPACE_MAX_ALIGNMENT 8
+
+ static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePtr, size_t align)
+ {
+ size_t const mask = align - 1;
+ size_t const rem = (size_t)workspace & mask;
+ size_t const add = (align - rem) & mask;
+ BYTE* const aligned = (BYTE*)workspace + add;
+ assert((align & (align - 1)) == 0); /* pow 2 */
+ assert(align <= HUF_WORKSPACE_MAX_ALIGNMENT);
+ if (*workspaceSizePtr >= add) {
+ assert(add < align);
+ assert(((size_t)aligned & mask) == 0);
+ *workspaceSizePtr -= add;
+ return aligned;
+ } else {
+ *workspaceSizePtr = 0;
+ return NULL;
+ }
+ }
+
+
  /* HUF_compressWeights() :
  * Same as FSE_compress(), but dedicated to huff0's weights compression.
  * The use case needs much less stack memory.
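HUF_alignUpWorkspace() above rounds the caller's workspace pointer up to the requested power-of-two alignment and deducts the padding from the remaining size. A standalone sketch of the same mask/rem/add arithmetic on a plain address (editor's illustration; the alignUp helper is not part of the library):

#include <assert.h>
#include <stddef.h>

/* Round `addr` up to `align`, where `align` is a power of two; mirrors the
 * mask/rem/add steps of HUF_alignUpWorkspace() on a plain integer address. */
static size_t alignUp(size_t addr, size_t align)
{
    size_t const mask = align - 1;
    size_t const rem  = addr & mask;
    size_t const add  = (align - rem) & mask;  /* 0 when already aligned */
    return addr + add;
}

int main(void)
{
    assert(alignUp(0x1000, 8) == 0x1000);  /* already aligned: no padding consumed */
    assert(alignUp(0x1003, 8) == 0x1008);  /* 5 bytes of padding deducted from the workspace */
    return 0;
}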
@@ -75,7 +97,7 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT

  unsigned maxSymbolValue = HUF_TABLELOG_MAX;
  U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
- HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)workspace;
+ HUF_CompressWeightsWksp* wksp = (HUF_CompressWeightsWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));

  if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENERIC);

@@ -106,6 +128,40 @@ static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightT
  return (size_t)(op-ostart);
  }

+ static size_t HUF_getNbBits(HUF_CElt elt)
+ {
+ return elt & 0xFF;
+ }
+
+ static size_t HUF_getNbBitsFast(HUF_CElt elt)
+ {
+ return elt;
+ }
+
+ static size_t HUF_getValue(HUF_CElt elt)
+ {
+ return elt & ~0xFF;
+ }
+
+ static size_t HUF_getValueFast(HUF_CElt elt)
+ {
+ return elt;
+ }
+
+ static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits)
+ {
+ assert(nbBits <= HUF_TABLELOG_ABSOLUTEMAX);
+ *elt = nbBits;
+ }
+
+ static void HUF_setValue(HUF_CElt* elt, size_t value)
+ {
+ size_t const nbBits = HUF_getNbBits(*elt);
+ if (nbBits > 0) {
+ assert((value >> nbBits) == 0);
+ *elt |= value << (sizeof(HUF_CElt) * 8 - nbBits);
+ }
+ }

  typedef struct {
  HUF_CompressWeightsWksp wksp;
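The accessors added in this hunk treat HUF_CElt as a single size_t that packs the (nbBits, value) pair: nbBits in the low 8 bits, the code value left-justified in the top nbBits bits, zeros in between. A minimal standalone model of that layout (editor's illustration; packElt is a hypothetical helper, not upstream API):

#include <assert.h>
#include <stddef.h>

/* Standalone model of the packed element: nbBits in the low 8 bits, the code
 * value in the top nbBits bits, zeros in between. */
typedef size_t CElt;

static CElt packElt(size_t nbBits, size_t value)
{
    CElt elt = nbBits;                                 /* like HUF_setNbBits() */
    if (nbBits > 0) {
        assert((value >> nbBits) == 0);
        elt |= value << (sizeof(CElt) * 8 - nbBits);   /* like HUF_setValue() */
    }
    return elt;
}

int main(void)
{
    CElt const elt = packElt(3, 0x5);                  /* 3-bit code 101 */
    assert((elt & 0xFF) == 3);                                              /* nbBits read back */
    assert((elt & ~(size_t)0xFF) == (size_t)0x5 << (sizeof(CElt) * 8 - 3)); /* value read back */
    return 0;
}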
@@ -117,9 +173,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
  const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog,
  void* workspace, size_t workspaceSize)
  {
+ HUF_CElt const* const ct = CTable + 1;
  BYTE* op = (BYTE*)dst;
  U32 n;
- HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)workspace;
+ HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));

  /* check conditions */
  if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
@@ -130,9 +187,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
  for (n=1; n<huffLog+1; n++)
  wksp->bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
  for (n=0; n<maxSymbolValue; n++)
- wksp->huffWeight[n] = wksp->bitsToWeight[CTable[n].nbBits];
+ wksp->huffWeight[n] = wksp->bitsToWeight[HUF_getNbBits(ct[n])];

  /* attempt weights compression by FSE */
+ if (maxDstSize < 1) return ERROR(dstSize_tooSmall);
  { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huffWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) );
  if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed */
  op[0] = (BYTE)hSize;
@@ -166,6 +224,7 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
  U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */
  U32 tableLog = 0;
  U32 nbSymbols = 0;
+ HUF_CElt* const ct = CTable + 1;

  /* get symbol weights */
  CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
@@ -175,6 +234,8 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
  if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
  if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);

+ CTable[0] = tableLog;
+
  /* Prepare base value per rank */
  { U32 n, nextRankStart = 0;
  for (n=1; n<=tableLog; n++) {
@@ -186,13 +247,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
  /* fill nbBits */
  { U32 n; for (n=0; n<nbSymbols; n++) {
  const U32 w = huffWeight[n];
- CTable[n].nbBits = (BYTE)(tableLog + 1 - w) & -(w != 0);
+ HUF_setNbBits(ct + n, (BYTE)(tableLog + 1 - w) & -(w != 0));
  } }

  /* fill val */
  { U16 nbPerRank[HUF_TABLELOG_MAX+2] = {0}; /* support w=0=>n=tableLog+1 */
  U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
- { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; }
+ { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[HUF_getNbBits(ct[n])]++; }
  /* determine stating value per rank */
  valPerRank[tableLog+1] = 0; /* for w==0 */
  { U16 min = 0;
@@ -202,18 +263,18 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
  min >>= 1;
  } }
  /* assign value within rank, symbol order */
- { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; }
+ { U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); }
  }

  *maxSymbolValuePtr = nbSymbols - 1;
  return readSize;
  }

- U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue)
+ U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
  {
- const HUF_CElt* table = (const HUF_CElt*)symbolTable;
+ const HUF_CElt* ct = CTable + 1;
  assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
- return table[symbolValue].nbBits;
+ return (U32)HUF_getNbBits(ct[symbolValue]);
  }

@@ -367,22 +428,118 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
  }

  typedef struct {
- U32 base;
- U32 curr;
+ U16 base;
+ U16 curr;
  } rankPos;

  typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];

- #define RANK_POSITION_TABLE_SIZE 32
+ /* Number of buckets available for HUF_sort() */
+ #define RANK_POSITION_TABLE_SIZE 192

  typedef struct {
  huffNodeTable huffNodeTbl;
  rankPos rankPosition[RANK_POSITION_TABLE_SIZE];
  } HUF_buildCTable_wksp_tables;

+ /* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing.
+ * Strategy is to use as many buckets as possible for representing distinct
+ * counts while using the remainder to represent all "large" counts.
+ *
+ * To satisfy this requirement for 192 buckets, we can do the following:
+ * Let buckets 0-166 represent distinct counts of [0, 166]
+ * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
+ */
+ #define RANK_POSITION_MAX_COUNT_LOG 32
+ #define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */
+ #define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */
+
+ /* Return the appropriate bucket index for a given count. See definition of
+ * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
+ */
+ static U32 HUF_getIndex(U32 const count) {
+ return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
+ ? count
+ : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
+ }
+
+ /* Helper swap function for HUF_quickSortPartition() */
+ static void HUF_swapNodes(nodeElt* a, nodeElt* b) {
+ nodeElt tmp = *a;
+ *a = *b;
+ *b = tmp;
+ }
+
+ /* Returns 0 if the huffNode array is not sorted by descending count */
+ MEM_STATIC int HUF_isSorted(nodeElt huffNode[], U32 const maxSymbolValue1) {
+ U32 i;
+ for (i = 1; i < maxSymbolValue1; ++i) {
+ if (huffNode[i].count > huffNode[i-1].count) {
+ return 0;
+ }
+ }
+ return 1;
+ }
+
+ /* Insertion sort by descending order */
+ HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int const high) {
+ int i;
+ int const size = high-low+1;
+ huffNode += low;
+ for (i = 1; i < size; ++i) {
+ nodeElt const key = huffNode[i];
+ int j = i - 1;
+ while (j >= 0 && huffNode[j].count < key.count) {
+ huffNode[j + 1] = huffNode[j];
+ j--;
+ }
+ huffNode[j + 1] = key;
+ }
+ }
+
+ /* Pivot helper function for quicksort. */
+ static int HUF_quickSortPartition(nodeElt arr[], int const low, int const high) {
+ /* Simply select rightmost element as pivot. "Better" selectors like
+ * median-of-three don't experimentally appear to have any benefit.
+ */
+ U32 const pivot = arr[high].count;
+ int i = low - 1;
+ int j = low;
+ for ( ; j < high; j++) {
+ if (arr[j].count > pivot) {
+ i++;
+ HUF_swapNodes(&arr[i], &arr[j]);
+ }
+ }
+ HUF_swapNodes(&arr[i + 1], &arr[high]);
+ return i + 1;
+ }
+
+ /* Classic quicksort by descending with partially iterative calls
+ * to reduce worst case callstack size.
+ */
+ static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) {
+ int const kInsertionSortThreshold = 8;
+ if (high - low < kInsertionSortThreshold) {
+ HUF_insertionSort(arr, low, high);
+ return;
+ }
+ while (low < high) {
+ int const idx = HUF_quickSortPartition(arr, low, high);
+ if (idx - low < high - idx) {
+ HUF_simpleQuickSort(arr, low, idx - 1);
+ low = idx + 1;
+ } else {
+ HUF_simpleQuickSort(arr, idx + 1, high);
+ high = idx - 1;
+ }
+ }
+ }
+
  /**
  * HUF_sort():
  * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing order.
+ * This is a typical bucket sorting strategy that uses either quicksort or insertion sort to sort each bucket.
  *
  * @param[out] huffNode Sorted symbols by decreasing count. Only members `.count` and `.byte` are filled.
@@ -390,44 +547,52 @@ typedef struct {
  * @param[in] maxSymbolValue Maximum symbol value.
  * @param rankPosition This is a scratch workspace. Must have RANK_POSITION_TABLE_SIZE entries.
  */
- static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition)
- {
- int n;
- int const maxSymbolValue1 = (int)maxSymbolValue + 1;
+ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSymbolValue, rankPos rankPosition[]) {
+ U32 n;
+ U32 const maxSymbolValue1 = maxSymbolValue+1;

  /* Compute base and set curr to base.
- * For symbol s let lowerRank = BIT_highbit32(count[n]+1) and rank = lowerRank + 1.
- * Then 2^lowerRank <= count[n]+1 <= 2^rank.
+ * For symbol s let lowerRank = HUF_getIndex(count[n]) and rank = lowerRank + 1.
+ * See HUF_getIndex to see bucketing strategy.
  * We attribute each symbol to lowerRank's base value, because we want to know where
  * each rank begins in the output, so for rank R we want to count ranks R+1 and above.
  */
  ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE);
  for (n = 0; n < maxSymbolValue1; ++n) {
- U32 lowerRank = BIT_highbit32(count[n] + 1);
+ U32 lowerRank = HUF_getIndex(count[n]);
+ assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1);
  rankPosition[lowerRank].base++;
  }
+
  assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base == 0);
+ /* Set up the rankPosition table */
  for (n = RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) {
  rankPosition[n-1].base += rankPosition[n].base;
  rankPosition[n-1].curr = rankPosition[n-1].base;
  }
- /* Sort */
+
+ /* Insert each symbol into their appropriate bucket, setting up rankPosition table. */
  for (n = 0; n < maxSymbolValue1; ++n) {
  U32 const c = count[n];
- U32 const r = BIT_highbit32(c+1) + 1;
- U32 pos = rankPosition[r].curr++;
- /* Insert into the correct position in the rank.
- * We have at most 256 symbols, so this insertion should be fine.
- */
- while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) {
- huffNode[pos] = huffNode[pos-1];
- pos--;
- }
+ U32 const r = HUF_getIndex(c) + 1;
+ U32 const pos = rankPosition[r].curr++;
+ assert(pos < maxSymbolValue1);
  huffNode[pos].count = c;
  huffNode[pos].byte = (BYTE)n;
  }
- }

+ /* Sort each bucket. */
+ for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {
+ U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base;
+ U32 const bucketStartIdx = rankPosition[n].base;
+ if (bucketSize > 1) {
+ assert(bucketStartIdx < maxSymbolValue1);
+ HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1);
+ }
+ }
+
+ assert(HUF_isSorted(huffNode, maxSymbolValue1));
+ }

  /** HUF_buildCTable_wksp() :
  * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
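The rewritten HUF_sort() in the hunk above is a bucket sort: each symbol's count picks a bucket through HUF_getIndex(), small counts each get a dedicated bucket, large counts share log2 buckets, and only the shared buckets need the explicit HUF_simpleQuickSort() pass. A standalone sketch of the bucketing formula using the constants from this diff (editor's illustration; BIT_highbit32 is modeled by a local helper):

#include <assert.h>

/* floor(log2(v)) for v > 0, standing in for BIT_highbit32() */
static unsigned highbit32(unsigned v)
{
    unsigned r = 0;
    while (v >>= 1) r++;
    return r;
}

#define TABLE_SIZE        192
#define MAX_COUNT_LOG     32
#define LOG_BUCKETS_BEGIN ((TABLE_SIZE - 1) - MAX_COUNT_LOG - 1)             /* 158 */
#define DISTINCT_CUTOFF   (LOG_BUCKETS_BEGIN + highbit32(LOG_BUCKETS_BEGIN))

static unsigned getIndex(unsigned count)
{
    return (count < DISTINCT_CUTOFF) ? count
                                     : highbit32(count) + LOG_BUCKETS_BEGIN;
}

int main(void)
{
    assert(getIndex(0)    == 0);                        /* small counts map to themselves */
    assert(getIndex(100)  == 100);
    assert(getIndex(1000) == 9 + LOG_BUCKETS_BEGIN);    /* 2^9 <= 1000 < 2^10 */
    assert(getIndex(5000) == 12 + LOG_BUCKETS_BEGIN);   /* large counts share log2 buckets */
    return 0;
}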
@@ -490,6 +655,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
  */
  static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits)
  {
+ HUF_CElt* const ct = CTable + 1;
  /* fill result into ctable (val, nbBits) */
  int n;
  U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
@@ -505,20 +671,20 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
  min >>= 1;
  } }
  for (n=0; n<alphabetSize; n++)
- CTable[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */
+ HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */
  for (n=0; n<alphabetSize; n++)
- CTable[n].val = valPerRank[CTable[n].nbBits]++; /* assign value within rank, symbol order */
+ HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); /* assign value within rank, symbol order */
+ CTable[0] = maxNbBits;
  }

- size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+ size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
  {
- HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)workSpace;
+ HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
  nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
  nodeElt* const huffNode = huffNode0+1;
  int nonNullRank;

  /* safety checks */
- if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
  if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
  return ERROR(workSpace_tooSmall);
  if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
@@ -536,96 +702,334 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbo
  maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
  if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */

- HUF_buildCTableFromTree(tree, huffNode, nonNullRank, maxSymbolValue, maxNbBits);
+ HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue, maxNbBits);

  return maxNbBits;
  }

  size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
  {
+ HUF_CElt const* ct = CTable + 1;
  size_t nbBits = 0;
  int s;
  for (s = 0; s <= (int)maxSymbolValue; ++s) {
- nbBits += CTable[s].nbBits * count[s];
+ nbBits += HUF_getNbBits(ct[s]) * count[s];
  }
  return nbBits >> 3;
  }

  int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+ HUF_CElt const* ct = CTable + 1;
  int bad = 0;
  int s;
  for (s = 0; s <= (int)maxSymbolValue; ++s) {
- bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
+ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0);
  }
  return !bad;
  }

  size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }

+ /** HUF_CStream_t:
+ * Huffman uses its own BIT_CStream_t implementation.
+ * There are three major differences from BIT_CStream_t:
+ * 1. HUF_addBits() takes a HUF_CElt (size_t) which is
+ * the pair (nbBits, value) in the format:
+ * format:
+ * - Bits [0, 4) = nbBits
+ * - Bits [4, 64 - nbBits) = 0
+ * - Bits [64 - nbBits, 64) = value
+ * 2. The bitContainer is built from the upper bits and
+ * right shifted. E.g. to add a new value of N bits
+ * you right shift the bitContainer by N, then or in
+ * the new value into the N upper bits.
+ * 3. The bitstream has two bit containers. You can add
+ * bits to the second container and merge them into
+ * the first container.
+ */
+
+ #define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8)
+
+ typedef struct {
+ size_t bitContainer[2];
+ size_t bitPos[2];
+
+ BYTE* startPtr;
+ BYTE* ptr;
+ BYTE* endPtr;
+ } HUF_CStream_t;
+
+ /**! HUF_initCStream():
+ * Initializes the bitstream.
+ * @returns 0 or an error code.
+ */
+ static size_t HUF_initCStream(HUF_CStream_t* bitC,
+ void* startPtr, size_t dstCapacity)
+ {
+ ZSTD_memset(bitC, 0, sizeof(*bitC));
+ bitC->startPtr = (BYTE*)startPtr;
+ bitC->ptr = bitC->startPtr;
+ bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer[0]);
+ if (dstCapacity <= sizeof(bitC->bitContainer[0])) return ERROR(dstSize_tooSmall);
+ return 0;
+ }
+
+ /*! HUF_addBits():
+ * Adds the symbol stored in HUF_CElt elt to the bitstream.
+ *
+ * @param elt The element we're adding. This is a (nbBits, value) pair.
+ * See the HUF_CStream_t docs for the format.
+ * @param idx Insert into the bitstream at this idx.
+ * @param kFast This is a template parameter. If the bitstream is guaranteed
+ * to have at least 4 unused bits after this call it may be 1,
+ * otherwise it must be 0. HUF_addBits() is faster when fast is set.
+ */
+ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int idx, int kFast)
+ {
+ assert(idx <= 1);
+ assert(HUF_getNbBits(elt) <= HUF_TABLELOG_ABSOLUTEMAX);
+ /* This is efficient on x86-64 with BMI2 because shrx
+ * only reads the low 6 bits of the register. The compiler
+ * knows this and elides the mask. When fast is set,
+ * every operation can use the same value loaded from elt.
+ */
+ bitC->bitContainer[idx] >>= HUF_getNbBits(elt);
+ bitC->bitContainer[idx] |= kFast ? HUF_getValueFast(elt) : HUF_getValue(elt);
+ /* We only read the low 8 bits of bitC->bitPos[idx] so it
+ * doesn't matter that the high bits have noise from the value.
+ */
+ bitC->bitPos[idx] += HUF_getNbBitsFast(elt);
+ assert((bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+ /* The last 4-bits of elt are dirty if fast is set,
+ * so we must not be overwriting bits that have already been
+ * inserted into the bit container.
+ */
+ #if DEBUGLEVEL >= 1
+ {
+ size_t const nbBits = HUF_getNbBits(elt);
+ size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1;
+ (void)dirtyBits;
+ /* Middle bits are 0. */
+ assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
+ /* We didn't overwrite any bits in the bit container. */
+ assert(!kFast || (bitC->bitPos[idx] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+ (void)dirtyBits;
+ }
+ #endif
+ }
+
+ FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC)
+ {
+ bitC->bitContainer[1] = 0;
+ bitC->bitPos[1] = 0;
+ }
+
+ /*! HUF_mergeIndex1() :
+ * Merges the bit container @ index 1 into the bit container @ index 0
+ * and zeros the bit container @ index 1.
+ */
+ FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC)
+ {
+ assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER);
+ bitC->bitContainer[0] >>= (bitC->bitPos[1] & 0xFF);
+ bitC->bitContainer[0] |= bitC->bitContainer[1];
+ bitC->bitPos[0] += bitC->bitPos[1];
+ assert((bitC->bitPos[0] & 0xFF) <= HUF_BITS_IN_CONTAINER);
+ }
+
+ /*! HUF_flushBits() :
+ * Flushes the bits in the bit container @ index 0.
+ *
+ * @post bitPos will be < 8.
+ * @param kFast If kFast is set then we must know a-priori that
+ * the bit container will not overflow.
+ */
+ FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast)
+ {
+ /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */
+ size_t const nbBits = bitC->bitPos[0] & 0xFF;
+ size_t const nbBytes = nbBits >> 3;
+ /* The top nbBits bits of bitContainer are the ones we need. */
+ size_t const bitContainer = bitC->bitContainer[0] >> (HUF_BITS_IN_CONTAINER - nbBits);
+ /* Mask bitPos to account for the bytes we consumed. */
+ bitC->bitPos[0] &= 7;
+ assert(nbBits > 0);
+ assert(nbBits <= sizeof(bitC->bitContainer[0]) * 8);
+ assert(bitC->ptr <= bitC->endPtr);
+ MEM_writeLEST(bitC->ptr, bitContainer);
+ bitC->ptr += nbBytes;
+ assert(!kFast || bitC->ptr <= bitC->endPtr);
+ if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
+ /* bitContainer doesn't need to be modified because the leftover
+ * bits are already the top bitPos bits. And we don't care about
+ * noise in the lower values.
+ */
+ }
+
+ /*! HUF_endMark()
+ * @returns The Huffman stream end mark: A 1-bit value = 1.
+ */
+ static HUF_CElt HUF_endMark(void)
+ {
+ HUF_CElt endMark;
+ HUF_setNbBits(&endMark, 1);
+ HUF_setValue(&endMark, 1);
+ return endMark;
+ }
+
+ /*! HUF_closeCStream() :
+ * @return Size of CStream, in bytes,
+ * or 0 if it could not fit into dstBuffer */
+ static size_t HUF_closeCStream(HUF_CStream_t* bitC)
+ {
+ HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0);
+ HUF_flushBits(bitC, /* kFast */ 0);
+ {
+ size_t const nbBits = bitC->bitPos[0] & 0xFF;
+ if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+ return (bitC->ptr - bitC->startPtr) + (nbBits > 0);
+ }
+ }
+
  FORCE_INLINE_TEMPLATE void
- HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
+ HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable, int idx, int fast)
  {
- BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
+ HUF_addBits(bitCPtr, CTable[symbol], idx, fast);
  }

- #define HUF_FLUSHBITS(s) BIT_flushBits(s)
+ FORCE_INLINE_TEMPLATE void
+ HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC,
+ const BYTE* ip, size_t srcSize,
+ const HUF_CElt* ct,
+ int kUnroll, int kFastFlush, int kLastFast)
+ {
+ /* Join to kUnroll */
+ int n = (int)srcSize;
+ int rem = n % kUnroll;
+ if (rem > 0) {
+ for (; rem > 0; --rem) {
+ HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0);
+ }
+ HUF_flushBits(bitC, kFastFlush);
+ }
+ assert(n % kUnroll == 0);
+
+ /* Join to 2 * kUnroll */
+ if (n % (2 * kUnroll)) {
+ int u;
+ for (u = 1; u < kUnroll; ++u) {
+ HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1);
+ }
+ HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast);
+ HUF_flushBits(bitC, kFastFlush);
+ n -= kUnroll;
+ }
+ assert(n % (2 * kUnroll) == 0);
+
+ for (; n>0; n-= 2 * kUnroll) {
+ /* Encode kUnroll symbols into the bitstream @ index 0. */
+ int u;
+ for (u = 1; u < kUnroll; ++u) {
+ HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ 1);
+ }
+ HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast */ kLastFast);
+ HUF_flushBits(bitC, kFastFlush);
+ /* Encode kUnroll symbols into the bitstream @ index 1.
+ * This allows us to start filling the bit container
+ * without any data dependencies.
+ */
+ HUF_zeroIndex1(bitC);
+ for (u = 1; u < kUnroll; ++u) {
+ HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /* fast */ 1);
+ }
+ HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1, /* fast */ kLastFast);
+ /* Merge bitstream @ index 1 into the bitstream @ index 0 */
+ HUF_mergeIndex1(bitC);
+ HUF_flushBits(bitC, kFastFlush);
+ }
+ assert(n == 0);
+
+ }

- #define HUF_FLUSHBITS_1(stream) \
- if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream)
+ /**
+ * Returns a tight upper bound on the output space needed by Huffman
+ * with 8 bytes buffer to handle over-writes. If the output is at least
+ * this large we don't need to do bounds checks during Huffman encoding.
+ */
+ static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog)
+ {
+ return ((srcSize * tableLog) >> 3) + 8;
+ }

- #define HUF_FLUSHBITS_2(stream) \
- if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream)

  FORCE_INLINE_TEMPLATE size_t
  HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  const HUF_CElt* CTable)
  {
+ U32 const tableLog = (U32)CTable[0];
+ HUF_CElt const* ct = CTable + 1;
  const BYTE* ip = (const BYTE*) src;
  BYTE* const ostart = (BYTE*)dst;
  BYTE* const oend = ostart + dstSize;
  BYTE* op = ostart;
- size_t n;
- BIT_CStream_t bitC;
+ HUF_CStream_t bitC;

  /* init */
  if (dstSize < 8) return 0; /* not enough space to compress */
- { size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op));
+ { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op));
  if (HUF_isError(initErr)) return 0; }

- n = srcSize & ~3; /* join to mod 4 */
- switch (srcSize & 3)
- {
- case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable);
- HUF_FLUSHBITS_2(&bitC);
- /* fall-through */
- case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable);
- HUF_FLUSHBITS_1(&bitC);
- /* fall-through */
- case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable);
- HUF_FLUSHBITS(&bitC);
- /* fall-through */
- case 0 : /* fall-through */
- default: break;
- }
-
- for (; n>0; n-=4) { /* note : n&3==0 at this stage */
- HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
- HUF_FLUSHBITS_1(&bitC);
- HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
- HUF_FLUSHBITS_2(&bitC);
- HUF_encodeSymbol(&bitC, ip[n- 3], CTable);
- HUF_FLUSHBITS_1(&bitC);
- HUF_encodeSymbol(&bitC, ip[n- 4], CTable);
- HUF_FLUSHBITS(&bitC);
+ if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11)
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ MEM_32bits() ? 2 : 4, /* kFast */ 0, /* kLastFast */ 0);
+ else {
+ if (MEM_32bits()) {
+ switch (tableLog) {
+ case 11:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 10: ZSTD_FALLTHROUGH;
+ case 9: ZSTD_FALLTHROUGH;
+ case 8:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1);
+ break;
+ case 7: ZSTD_FALLTHROUGH;
+ default:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1);
+ break;
+ }
+ } else {
+ switch (tableLog) {
+ case 11:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 10:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1);
+ break;
+ case 9:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 8:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 7:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0);
+ break;
+ case 6: ZSTD_FALLTHROUGH;
+ default:
+ HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1);
+ break;
+ }
+ }
  }
+ assert(bitC.ptr <= bitC.endPtr);

- return BIT_closeCStream(&bitC);
+ return HUF_closeCStream(&bitC);
  }

  #if DYNAMIC_BMI2

- static TARGET_ATTRIBUTE("bmi2") size_t
+ static BMI2_TARGET_ATTRIBUTE size_t
  HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  const HUF_CElt* CTable)
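The HUF_CStream_t comment in the hunk above describes how a code is appended: the container is shifted right by nbBits and the new value is OR-ed into the top bits, so the flush step can always take the upper bits of the container. A minimal single-container model of that insertion (editor's sketch, not the upstream bitstream API):

#include <assert.h>
#include <stdint.h>

/* Toy model of one bit container: codes are accumulated in the upper bits,
 * as described by the HUF_CStream_t comment above. */
typedef struct {
    uint64_t bits;    /* container, filled from the top */
    unsigned nbBits;  /* number of valid bits currently held */
} Container;

static void addBits(Container* c, uint64_t value, unsigned nbBits)
{
    assert(nbBits > 0 && nbBits < 64 && (value >> nbBits) == 0);
    c->bits >>= nbBits;                   /* make room at the top */
    c->bits  |= value << (64 - nbBits);   /* OR the new value into the top bits */
    c->nbBits += nbBits;
}

int main(void)
{
    Container c = { 0, 0 };
    addBits(&c, 0x5, 3);   /* code 101 */
    addBits(&c, 0x1, 2);   /* code 01 */
    /* The newest code occupies the top bits, above the earlier one: 01 then 101. */
    assert((c.bits >> (64 - 5)) == 0x0D);  /* 0b01101 */
    assert(c.nbBits == 5);
    return 0;
}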
@@ -667,9 +1071,13 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,

  size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
  {
- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+ return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
  }

+ size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
+ {
+ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+ }

  static size_t
  HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
@@ -689,8 +1097,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,

  assert(op <= oend);
  { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
- if (cSize==0) return 0;
- assert(cSize <= 65535);
+ if (cSize == 0 || cSize > 65535) return 0;
  MEM_writeLE16(ostart, (U16)cSize);
  op += cSize;
  }
@@ -698,8 +1105,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
  ip += segmentSize;
  assert(op <= oend);
  { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
- if (cSize==0) return 0;
- assert(cSize <= 65535);
+ if (cSize == 0 || cSize > 65535) return 0;
  MEM_writeLE16(ostart+2, (U16)cSize);
  op += cSize;
  }
@@ -707,8 +1113,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
  ip += segmentSize;
  assert(op <= oend);
  { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
- if (cSize==0) return 0;
- assert(cSize <= 65535);
+ if (cSize == 0 || cSize > 65535) return 0;
  MEM_writeLE16(ostart+4, (U16)cSize);
  op += cSize;
  }
@@ -717,7 +1122,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
  assert(op <= oend);
  assert(ip <= iend);
  { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) );
- if (cSize==0) return 0;
+ if (cSize == 0 || cSize > 65535) return 0;
  op += cSize;
  }

@@ -726,7 +1131,12 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,

  size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
  {
- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+ return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+ }
+
+ size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
+ {
+ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
  }

  typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
@@ -750,35 +1160,38 @@ static size_t HUF_compressCTable_internal(

  typedef struct {
  unsigned count[HUF_SYMBOLVALUE_MAX + 1];
- HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1];
+ HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)];
  union {
  HUF_buildCTable_wksp_tables buildCTable_wksp;
  HUF_WriteCTableWksp writeCTable_wksp;
+ U32 hist_wksp[HIST_WKSP_SIZE_U32];
  } wksps;
  } HUF_compress_tables_t;

+ #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
+ #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */
+
  /* HUF_compress_internal() :
  * `workSpace_align4` must be aligned on 4-bytes boundaries,
- * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U32 unsigned */
+ * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */
  static size_t
  HUF_compress_internal (void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  unsigned maxSymbolValue, unsigned huffLog,
  HUF_nbStreams_e nbStreams,
- void* workSpace_align4, size_t wkspSize,
+ void* workSpace, size_t wkspSize,
  HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
- const int bmi2)
+ const int bmi2, unsigned suspectUncompressible)
  {
- HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace_align4;
+ HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t));
  BYTE* const ostart = (BYTE*)dst;
  BYTE* const oend = ostart + dstSize;
  BYTE* op = ostart;

- HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE);
- assert(((size_t)workSpace_align4 & 3) == 0); /* must be aligned on 4-bytes boundaries */
+ HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);

  /* checks & inits */
- if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall);
+ if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall);
  if (!srcSize) return 0; /* Uncompressed */
  if (!dstSize) return 0; /* cannot fit anything within dst budget */
  if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */
@@ -794,8 +1207,23 @@ HUF_compress_internal (void* dst, size_t dstSize,
  nbStreams, oldHufTable, bmi2);
  }

+ /* If uncompressible data is suspected, do a smaller sampling first */
+ DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2);
+ if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
+ size_t largestTotal = 0;
+ { unsigned maxSymbolValueBegin = maxSymbolValue;
+ CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
+ largestTotal += largestBegin;
+ }
+ { unsigned maxSymbolValueEnd = maxSymbolValue;
+ CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSymbolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
+ largestTotal += largestEnd;
+ }
+ if (largestTotal <= ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >> 7)+4) return 0; /* heuristic : probably not compressible enough */
+ }
+
  /* Scan input and build symbol stats */
- { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace_align4, wkspSize) );
+ { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.hist_wksp)) );
  if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */
  if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */
  }
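To put numbers on the sampling heuristic added in the hunk above: it only runs when srcSize is at least SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO = 4096 * 10 = 40960 bytes, it histograms the first and last 4096 bytes, and it returns 0 ("not compressible") when the two per-sample maxima sum to at most ((2 * 4096) >> 7) + 4 = 68, i.e. when even the most frequent byte of each window covers under about one percent of the 8192 sampled bytes.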
@@ -820,9 +1248,12 @@ HUF_compress_internal (void* dst, size_t dstSize,
  &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
  CHECK_F(maxBits);
  huffLog = (U32)maxBits;
- /* Zero unused symbols in CTable, so we can check it for validity */
- ZSTD_memset(table->CTable + (maxSymbolValue + 1), 0,
- sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt)));
+ }
+ /* Zero unused symbols in CTable, so we can check it for validity */
+ {
+ size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue);
+ size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt);
+ ZSTD_memset(table->CTable + ctableSize, 0, unusedSize);
  }

  /* Write table description header */
@@ -859,19 +1290,20 @@ size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
  return HUF_compress_internal(dst, dstSize, src, srcSize,
  maxSymbolValue, huffLog, HUF_singleStream,
  workSpace, wkspSize,
- NULL, NULL, 0, 0 /*bmi2*/);
+ NULL, NULL, 0, 0 /*bmi2*/, 0);
  }

  size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  unsigned maxSymbolValue, unsigned huffLog,
  void* workSpace, size_t wkspSize,
- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat,
+ int bmi2, unsigned suspectUncompressible)
  {
  return HUF_compress_internal(dst, dstSize, src, srcSize,
  maxSymbolValue, huffLog, HUF_singleStream,
  workSpace, wkspSize, hufTable,
- repeat, preferRepeat, bmi2);
+ repeat, preferRepeat, bmi2, suspectUncompressible);
  }

  /* HUF_compress4X_repeat():
@@ -885,22 +1317,23 @@ size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
  return HUF_compress_internal(dst, dstSize, src, srcSize,
  maxSymbolValue, huffLog, HUF_fourStreams,
  workSpace, wkspSize,
- NULL, NULL, 0, 0 /*bmi2*/);
+ NULL, NULL, 0, 0 /*bmi2*/, 0);
  }

  /* HUF_compress4X_repeat():
  * compress input using 4 streams.
+ * consider skipping quickly
  * re-use an existing huffman compression table */
  size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  unsigned maxSymbolValue, unsigned huffLog,
  void* workSpace, size_t wkspSize,
- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible)
  {
  return HUF_compress_internal(dst, dstSize, src, srcSize,
  maxSymbolValue, huffLog, HUF_fourStreams,
  workSpace, wkspSize,
- hufTable, repeat, preferRepeat, bmi2);
+ hufTable, repeat, preferRepeat, bmi2, suspectUncompressible);
  }

  #ifndef ZSTD_NO_UNUSED_FUNCTIONS
@@ -918,7 +1351,7 @@ size_t HUF_compress1X (void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  unsigned maxSymbolValue, unsigned huffLog)
  {
- unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+ U64 workSpace[HUF_WORKSPACE_SIZE_U64];
  return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
  }

@@ -926,7 +1359,7 @@ size_t HUF_compress2 (void* dst, size_t dstSize,
  const void* src, size_t srcSize,
  unsigned maxSymbolValue, unsigned huffLog)
  {
- unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+ U64 workSpace[HUF_WORKSPACE_SIZE_U64];
  return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
  }